use std::collections::BTreeMap;
use camino::{Utf8Path, Utf8PathBuf};
use chrono::Utc;
use serde_json::Value;
use crate::dry_run::{build_fetch_plan, try_build_fetch_plan, FetchPlan};
use crate::http::HttpError;
use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
use crate::source::{FetchContext, FetchError, FetchResult, Source};
use crate::sources::arxiv::ArxivSource;
use crate::sources::crossref::CrossrefSource;
use crate::sources::unpaywall::UnpaywallSource;
use crate::store::{DoigetExtension, Metadata, Store};
use crate::{ArxivId, CapabilityProfile, Doi, Ref, Safekey, MAX_BATCH_REFS, SCHEMA_VERSION};
/// Result of resolving a reference to metadata only (no PDF download).
///
/// Returned by [`metadata_only`] and the wrappers built on top of it.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
#[non_exhaustive]
pub struct MetadataOnlyOutcome {
/// Name of the source that produced the metadata (the value of that
/// source's `name()`).
pub source: String,
/// Resolver profile label; the resolvers in this module set it to the same
/// value as `source`.
pub resolver_profile: String,
/// License reported by the resolver, if any. `None` when the resolver did
/// not supply one.
pub license: Option<String>,
/// Open-access URL extracted from the resolver payload, if one was found.
pub oa_url: Option<String>,
/// Raw JSON payload returned by the resolver (`Value::Null` when the
/// resolver returned no metadata document).
pub metadata: Value,
}
/// Resolves `ref_` to metadata without downloading a PDF.
///
/// When `ctx.cache_root` is set and no resolver base-URL override is present
/// in the environment, a cached outcome is returned if one exists and a fresh
/// outcome is written back to the cache. With an override active the cache is
/// neither read nor written.
///
/// DOIs go through [`metadata_only_doi`]; arXiv ids are resolved by the arXiv
/// source directly and always carry the `"arxiv-default"` license.
///
/// # Errors
///
/// Propagates the [`FetchError`] reported by the underlying resolver.
pub async fn metadata_only(
    ref_: &Ref,
    profile: &CapabilityProfile,
    ctx: &FetchContext,
) -> Result<MetadataOnlyOutcome, FetchError> {
    // The cache is only usable when every resolver points at its default base.
    let cache_root = ctx
        .cache_root
        .as_deref()
        .filter(|_| !resolver_base_overridden());

    if let Some(hit) = cache_root.and_then(|root| crate::resolver_cache::read(root, ref_)) {
        return Ok(hit);
    }

    let outcome = match ref_ {
        Ref::Arxiv(id) => {
            let arxiv = arxiv_source_from_env();
            let metadata = arxiv.fetch_metadata_only(id, ctx).await?;
            let resolver = arxiv.name().to_string();
            MetadataOnlyOutcome {
                source: resolver.clone(),
                resolver_profile: resolver,
                license: Some(String::from("arxiv-default")),
                oa_url: None,
                metadata,
            }
        }
        Ref::Doi(doi) => metadata_only_doi(doi, ref_, profile, ctx).await?,
    };

    if let Some(root) = cache_root {
        crate::resolver_cache::write(root, ref_, &outcome);
    }
    Ok(outcome)
}
/// Reports whether any resolver base-URL override variable is present in the
/// process environment.
///
/// Only presence is checked; the value is not inspected or validated.
fn resolver_base_overridden() -> bool {
    const OVERRIDE_VARS: [&str; 3] = [
        "DOIGET_CROSSREF_BASE",
        "DOIGET_UNPAYWALL_BASE",
        "DOIGET_ARXIV_BASE",
    ];
    for name in OVERRIDE_VARS {
        if std::env::var_os(name).is_some() {
            return true;
        }
    }
    false
}
/// Resolves `ref_` without fetching a PDF.
///
/// This is a direct delegation to [`metadata_only`]; arguments, result and
/// errors are passed through unchanged.
pub async fn resolve_only(
ref_: &Ref,
profile: &CapabilityProfile,
ctx: &FetchContext,
) -> Result<MetadataOnlyOutcome, FetchError> {
metadata_only(ref_, profile, ctx).await
}
/// Resolves `ref_` to metadata and persists a metadata-only record in `store`.
///
/// No PDF is written: the store receives the metadata built from the resolver
/// outcome with no PDF source path.
///
/// # Errors
///
/// Returns the resolver's [`FetchError`] if resolution fails, or the error
/// from the store write / provenance log if persisting fails.
pub async fn metadata_only_to_store(
    ref_: &Ref,
    profile: &CapabilityProfile,
    ctx: &FetchContext,
    store: &dyn Store,
) -> Result<MetadataOnlyOutcome, FetchError> {
    let resolved = metadata_only(ref_, profile, ctx).await?;
    let key = ref_.safekey();
    let record = build_metadata_only_metadata(ref_, &resolved);
    write_metadata_and_pdf(store, &key, &record, None, ctx).map(|()| resolved)
}
/// Builds the minimal [`Metadata`] record persisted for a metadata-only
/// resolution.
///
/// Title and authors come from the resolver payload; when no usable title is
/// present a warning is logged and the reference's input string is stored as
/// the title. Bibliographic fields other than title, authors, identifier and
/// URL are left empty. The `doiget` extension records the source, the license
/// (`"unknown"` when absent) and a size of zero.
fn build_metadata_only_metadata(ref_: &Ref, outcome: &MetadataOnlyOutcome) -> Metadata {
    let (doi, arxiv_id) = match ref_ {
        Ref::Arxiv(id) => (None, Some(id.clone())),
        Ref::Doi(id) => (Some(id.clone()), None),
    };

    let ref_id = ref_.as_input_str().to_string();
    let title = if let Some(found) = extract_metadata_title(&outcome.metadata) {
        found
    } else {
        tracing::warn!(
            ref_id = %ref_id,
            source = %outcome.source,
            "metadata-only: no usable title in resolver payload; \
             persisting the ref id as the title placeholder"
        );
        ref_id
    };

    let provenance = DoigetExtension {
        fetched_at: Utc::now(),
        source: outcome.source.clone(),
        license: outcome.license.as_deref().unwrap_or("unknown").to_string(),
        size_bytes: 0,
        mcp_call_id: None,
    };

    Metadata {
        schema_version: SCHEMA_VERSION.to_string(),
        title,
        authors: extract_metadata_authors(&outcome.metadata),
        year: None,
        doi,
        arxiv_id,
        abstract_: None,
        venue: None,
        volume: None,
        issue: None,
        pages: None,
        publisher: None,
        issn: None,
        isbn: None,
        type_: None,
        keywords: Vec::new(),
        url: outcome.oa_url.clone(),
        pdf_path: None,
        doiget: Some(provenance),
        other: BTreeMap::new(),
    }
}
#[must_use]
pub fn cite_metadata(ref_: &Ref, outcome: &MetadataOnlyOutcome) -> Metadata {
let mut m = build_metadata_only_metadata(ref_, outcome);
if outcome.source == "crossref" {
let f = extract_crossref_fields(&outcome.metadata);
if let Some(title) = f.title {
m.title = title;
}
if !f.authors.is_empty() {
m.authors = f.authors;
}
m.year = f.year;
m.venue = f.venue;
m.volume = f.volume;
m.issue = f.issue;
m.pages = f.pages;
m.type_ = f.type_;
m.publisher = outcome
.metadata
.get("publisher")
.and_then(Value::as_str)
.map(str::to_string);
m.issn = outcome
.metadata
.get("ISSN")
.and_then(Value::as_array)
.and_then(|a| a.first())
.and_then(Value::as_str)
.map(str::to_string);
}
m
}
/// Rewrites single-hyphen page ranges to the double-hyphen form
/// (`"799-802"` becomes `"799--802"`).
///
/// Input that already contains `--`, or contains no hyphen at all, is
/// returned unchanged. Otherwise every `-` is replaced by `--`.
fn normalize_page_range(page: &str) -> String {
    let needs_rewrite = page.contains('-') && !page.contains("--");
    if needs_rewrite {
        page.replace('-', "--")
    } else {
        page.to_string()
    }
}
/// Pulls a usable title out of a resolver payload.
///
/// The `title` field may be a plain string or an array of strings. A string is
/// trimmed; for an array the first entry that is non-empty after trimming is
/// used. Returns `None` when the field is missing, has another JSON type, or
/// yields only empty text.
fn extract_metadata_title(meta: &Value) -> Option<String> {
    let raw = meta.get("title")?;

    if let Some(text) = raw.as_str() {
        let text = text.trim();
        return if text.is_empty() {
            None
        } else {
            Some(text.to_string())
        };
    }

    raw.as_array()?
        .iter()
        .filter_map(Value::as_str)
        .map(str::trim)
        .find(|candidate| !candidate.is_empty())
        .map(str::to_string)
}
/// Collects author display names from a resolver payload.
///
/// Three shapes are tried in order, and the first that yields at least one
/// name wins:
///
/// 1. `authors`: an array of plain strings (non-string entries are skipped);
/// 2. `author`: an array of objects with `given` / `family` (or `name`);
/// 3. `z_authors`: same object shape as `author`.
///
/// Returns an empty vector when none of them produces a name.
fn extract_metadata_authors(meta: &Value) -> Vec<String> {
    /// Formats one structured author entry as `"given family"`, falling back
    /// to the `name` field when both parts are missing or blank.
    fn display_name(entry: &Value) -> Option<String> {
        let given = entry
            .get("given")
            .and_then(Value::as_str)
            .unwrap_or_default();
        let family = entry
            .get("family")
            .and_then(Value::as_str)
            .unwrap_or_default();
        let joined = format!("{given} {family}");
        match joined.trim() {
            "" => entry.get("name").and_then(Value::as_str).map(str::to_string),
            full => Some(full.to_string()),
        }
    }

    let flat: Vec<String> = meta
        .get("authors")
        .and_then(Value::as_array)
        .map(|entries| {
            entries
                .iter()
                .filter_map(Value::as_str)
                .map(str::to_string)
                .collect()
        })
        .unwrap_or_default();
    if !flat.is_empty() {
        return flat;
    }

    ["author", "z_authors"]
        .iter()
        .filter_map(|key| meta.get(*key).and_then(Value::as_array))
        .map(|entries| {
            entries
                .iter()
                .filter_map(display_name)
                .collect::<Vec<String>>()
        })
        .find(|names| !names.is_empty())
        .unwrap_or_default()
}
/// Contact address used when `DOIGET_CONTACT_EMAIL` is not available.
const FALLBACK_CONTACT_EMAIL: &str = "doiget@localhost";

/// Returns the contact email from `DOIGET_CONTACT_EMAIL`, or
/// [`FALLBACK_CONTACT_EMAIL`] when the variable is unset or not valid Unicode.
fn contact_email_from_env() -> String {
    match std::env::var("DOIGET_CONTACT_EMAIL") {
        Ok(email) => email,
        Err(_) => FALLBACK_CONTACT_EMAIL.to_string(),
    }
}
/// Builds the arXiv source, honouring a `DOIGET_ARXIV_BASE` override.
///
/// The override is used only when the variable is set and parses as a URL;
/// otherwise the default source is returned.
fn arxiv_source_from_env() -> ArxivSource {
    let override_base = std::env::var("DOIGET_ARXIV_BASE")
        .ok()
        .and_then(|raw| url::Url::parse(&raw).ok());
    match override_base {
        Some(base) => ArxivSource::with_base(base),
        None => ArxivSource::new(),
    }
}
/// Builds the Crossref source for `contact`, honouring a
/// `DOIGET_CROSSREF_BASE` override.
///
/// The override is used only when the variable is set and parses as a URL;
/// otherwise the default source is returned.
fn crossref_source_from_env(contact: &str) -> CrossrefSource {
    let override_base = std::env::var("DOIGET_CROSSREF_BASE")
        .ok()
        .and_then(|raw| url::Url::parse(&raw).ok());
    match override_base {
        Some(base) => CrossrefSource::with_base(base, contact.to_string()),
        None => CrossrefSource::new(contact.to_string()),
    }
}
/// Builds the Unpaywall source for `contact`, honouring a
/// `DOIGET_UNPAYWALL_BASE` override.
///
/// The override is used only when the variable is set and parses as a URL;
/// otherwise the default source is returned.
fn unpaywall_source_from_env(contact: &str) -> UnpaywallSource {
    let override_base = std::env::var("DOIGET_UNPAYWALL_BASE")
        .ok()
        .and_then(|raw| url::Url::parse(&raw).ok());
    match override_base {
        Some(base) => UnpaywallSource::with_base(base, contact.to_string()),
        None => UnpaywallSource::new(contact.to_string()),
    }
}
async fn metadata_only_doi(
_doi: &Doi,
ref_: &Ref,
profile: &CapabilityProfile,
ctx: &FetchContext,
) -> Result<MetadataOnlyOutcome, FetchError> {
let contact = contact_email_from_env();
let crossref = crossref_source_from_env(&contact);
match crossref.fetch(ref_, profile, ctx).await {
Ok(res) => {
let metadata = res.metadata_json.unwrap_or(Value::Null);
let oa_url = extract_crossref_oa_url(&metadata);
Ok(MetadataOnlyOutcome {
source: crossref.name().to_string(),
resolver_profile: crossref.name().to_string(),
license: None,
oa_url,
metadata,
})
}
Err(crossref_err) => {
let unpaywall = unpaywall_source_from_env(&contact);
match unpaywall.fetch(ref_, profile, ctx).await {
Ok(res) => {
let metadata = res.metadata_json.unwrap_or(Value::Null);
let oa_url = extract_unpaywall_oa_url(&metadata);
let license = if res.license == "unknown" {
None
} else {
Some(res.license)
};
Ok(MetadataOnlyOutcome {
source: unpaywall.name().to_string(),
resolver_profile: unpaywall.name().to_string(),
license,
oa_url,
metadata,
})
}
Err(_unpaywall_err) => {
Err(crossref_err)
}
}
}
}
}
/// Returns the first non-empty `URL` string found in the payload's `link`
/// array, or `None` when `link` is missing, is not an array, or has no such
/// entry.
fn extract_crossref_oa_url(msg: &Value) -> Option<String> {
    for entry in msg.get("link")?.as_array()? {
        match entry.get("URL").and_then(Value::as_str) {
            Some(link) if !link.is_empty() => return Some(link.to_owned()),
            _ => {}
        }
    }
    None
}
/// Returns the URL of the payload's `best_oa_location`, preferring
/// `url_for_pdf` and falling back to `url`.
///
/// `None` when `best_oa_location` is absent or neither field is a string.
fn extract_unpaywall_oa_url(meta: &Value) -> Option<String> {
    let best = meta.get("best_oa_location")?;
    ["url_for_pdf", "url"]
        .iter()
        .find_map(|field| best.get(*field).and_then(Value::as_str))
        .map(str::to_owned)
}
/// Outcome of the PDF-download leg of a fetch.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum PdfLegStatus {
/// A PDF was downloaded.
Fetched,
/// No open-access URL candidate was available, so no download was tried.
NoOaUrl,
/// Download was attempted but no candidate yielded a PDF.
Blocked {
/// Error code describing the failure.
code: crate::ErrorCode,
/// Human-readable description of the failure.
message: String,
/// Denial details derived from the failure, when there are any.
denial: Option<crate::DenialContext>,
/// arXiv id recovered from one of the candidate URLs, if any of them
/// pointed at arXiv.
suggested_arxiv_id: Option<String>,
},
}
/// Result of a full paper fetch (metadata plus, when possible, the PDF).
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct FetchPaperOutcome {
/// Label of the source the stored record is attributed to.
pub source: String,
/// Resolver profile label; the fetch paths in this module set it to the
/// same value as `source`.
pub resolver_profile: String,
/// License string for the record (`"unknown"` when none was determined).
pub license: String,
/// Location of the stored artifact under the store root: the PDF when one
/// was fetched, otherwise the metadata file.
pub path: Utf8PathBuf,
/// Size of the fetched PDF in bytes; `0` when no PDF was fetched.
pub size_bytes: u64,
/// Schema version the metadata was written with.
pub schema_version: String,
/// What happened on the PDF-download leg.
pub pdf_leg: PdfLegStatus,
/// Safekey of the reference, as a string.
pub safekey: String,
/// Hex digest of the canonical reference for this fetch.
pub canonical_digest: String,
}
impl FetchPaperOutcome {
    /// Builds a synthetic outcome for tests.
    ///
    /// `source` is used for both `source` and `resolver_profile`, the license
    /// is `"unknown"`, the size is zero, the path is `/tmp/<safekey>.pdf`, and
    /// the digest is 64 `'0'` characters.
    #[doc(hidden)]
    pub fn for_test_synthetic(
        safekey: impl Into<String>,
        source: impl Into<String>,
        pdf_leg: PdfLegStatus,
    ) -> Self {
        let key: String = safekey.into();
        let origin: String = source.into();
        let path = Utf8PathBuf::from(format!("/tmp/{key}.pdf"));
        Self {
            resolver_profile: origin.clone(),
            source: origin,
            license: String::from("unknown"),
            path,
            size_bytes: 0,
            schema_version: SCHEMA_VERSION.to_string(),
            pdf_leg,
            safekey: key,
            canonical_digest: "00".repeat(32),
        }
    }
}
/// Fetches a single paper and persists it in `store`.
///
/// Dispatches on the reference kind: DOIs go through the Crossref/Unpaywall
/// path, arXiv ids through the arXiv path. Both receive the reference's
/// safekey, computed once here.
///
/// # Errors
///
/// Propagates the [`FetchError`] of the selected path.
pub async fn fetch_paper(
    ref_: &Ref,
    profile: &CapabilityProfile,
    ctx: &FetchContext,
    store: &dyn Store,
    store_root: &Utf8Path,
) -> Result<FetchPaperOutcome, FetchError> {
    let key = ref_.safekey();
    match ref_ {
        Ref::Doi(doi) => fetch_paper_doi(doi, ref_, profile, ctx, store, store_root, &key).await,
        Ref::Arxiv(id) => {
            fetch_paper_arxiv(id, ref_, profile, ctx, store, store_root, &key).await
        }
    }
}
/// Returns the dry-run [`FetchPlan`] for `ref_` under `store_root`.
///
/// Direct delegation to [`build_fetch_plan`].
pub fn fetch_paper_plan(ref_: &Ref, store_root: &Utf8Path) -> FetchPlan {
build_fetch_plan(ref_, store_root)
}
/// Fallible variant of [`fetch_paper_plan`].
///
/// Direct delegation to [`try_build_fetch_plan`]; its error is returned
/// unchanged.
pub fn try_fetch_paper_plan(ref_: &Ref, store_root: &Utf8Path) -> Result<FetchPlan, FetchError> {
try_build_fetch_plan(ref_, store_root)
}
/// Fetches an arXiv paper's PDF and persists it with a minimal metadata record.
///
/// The stored metadata uses `arxiv:<id>` as the title and carries no authors
/// or other bibliographic fields. The PDF is staged in a temporary file, which
/// is handed to the store and removed afterwards.
///
/// # Errors
///
/// * [`FetchError::NotEligible`] when the arXiv source cannot serve `ref_`
///   under `profile`.
/// * [`FetchError::SourceSchema`] when the source returns no PDF bytes or the
///   staging path is not UTF-8.
/// * Any error from the source fetch, the staging step, or the store write.
async fn fetch_paper_arxiv(
    id: &ArxivId,
    ref_: &Ref,
    profile: &CapabilityProfile,
    ctx: &FetchContext,
    store: &dyn Store,
    store_root: &Utf8Path,
    safekey: &Safekey,
) -> Result<FetchPaperOutcome, FetchError> {
    let arxiv = arxiv_source_from_env();
    if !arxiv.can_serve(profile, ref_) {
        return Err(FetchError::NotEligible {
            source_key: arxiv.name().to_string(),
        });
    }

    let FetchResult {
        license,
        pdf_bytes: maybe_pdf,
        final_url: landing_url,
        ..
    } = arxiv.fetch(ref_, profile, ctx).await?;
    let Some(pdf) = maybe_pdf else {
        return Err(FetchError::SourceSchema {
            hint: "arxiv source returned no PDF bytes".to_string(),
        });
    };
    let size_bytes = pdf.len() as u64;
    let pdf_file_name = format!("{}.pdf", safekey.as_str());

    let provenance = DoigetExtension {
        fetched_at: Utc::now(),
        source: "arxiv".to_string(),
        license: license.clone(),
        size_bytes,
        mcp_call_id: None,
    };
    let metadata = Metadata {
        schema_version: SCHEMA_VERSION.to_string(),
        title: format!("arxiv:{}", id.as_str()),
        authors: Vec::new(),
        year: None,
        doi: None,
        arxiv_id: Some(id.clone()),
        abstract_: None,
        venue: None,
        volume: None,
        issue: None,
        pages: None,
        publisher: None,
        issn: None,
        isbn: None,
        type_: None,
        keywords: Vec::new(),
        url: landing_url.as_ref().map(ToString::to_string),
        pdf_path: Some(pdf_file_name.clone()),
        doiget: Some(provenance),
        other: BTreeMap::new(),
    };

    // The tempfile must outlive the store write; it is removed when dropped.
    let staged = stage_pdf_to_tempfile(&pdf)?;
    let Some(staged_path) = Utf8Path::from_path(staged.path()) else {
        return Err(FetchError::SourceSchema {
            hint: "staging tempfile path is not UTF-8".to_string(),
        });
    };
    write_metadata_and_pdf(store, safekey, &metadata, Some(staged_path), ctx)?;
    drop(staged);

    let canonical_digest =
        crate::CanonicalRef::new(crate::SourceType::Arxiv, id.as_str(), "arxiv", None).digest_hex();
    Ok(FetchPaperOutcome {
        source: "arxiv".to_string(),
        resolver_profile: "arxiv".to_string(),
        license,
        path: store_root.join(pdf_file_name),
        size_bytes,
        schema_version: SCHEMA_VERSION.to_string(),
        pdf_leg: PdfLegStatus::Fetched,
        safekey: safekey.as_str().to_string(),
        canonical_digest,
    })
}
/// Fetches a DOI-identified paper: Crossref metadata, Unpaywall OA discovery,
/// then a walk over the OA URL candidates until one yields a PDF.
///
/// Flow:
///
/// 1. Crossref is queried for metadata. A failure is logged and remembered.
/// 2. Unpaywall is queried for the license and the OA URL chain. A failure is
///    logged and treated as an empty chain with an `"unknown"` license.
/// 3. Each candidate URL is tried in order; the first success wins. If all
///    fail, the last error becomes [`PdfLegStatus::Blocked`]. An empty chain
///    yields [`PdfLegStatus::NoOaUrl`].
/// 4. If Crossref failed **and** no PDF was obtained, the Crossref error is
///    returned and nothing is stored.
/// 5. Otherwise the metadata (and PDF, if any) is written to the store. The
///    record is attributed to `"oa-publisher"` when a PDF was fetched, else to
///    `"unpaywall"` / `"crossref"` depending on whether Unpaywall supplied a
///    license.
///
/// # Errors
///
/// * The Crossref [`FetchError`] as described in step 4.
/// * [`FetchError::SourceSchema`] when the PDF cannot be staged or the staging
///   path is not UTF-8.
/// * Any error from the store write / provenance log.
async fn fetch_paper_doi(
    doi: &Doi,
    ref_: &Ref,
    profile: &CapabilityProfile,
    ctx: &FetchContext,
    store: &dyn Store,
    store_root: &Utf8Path,
    safekey: &Safekey,
) -> Result<FetchPaperOutcome, FetchError> {
    let contact = contact_email_from_env();
    let unpaywall_contact = unpaywall_email_from_env(&contact);

    // --- Crossref: bibliographic metadata (failure is tolerated for now) ---
    let crossref = crossref_source_from_env(&contact);
    let (cross, crossref_err) = match crossref.fetch(ref_, profile, ctx).await {
        Ok(r) => (Some(r), None),
        Err(e) => {
            tracing::warn!(
                error = %e,
                "crossref fetch failed; continuing with unpaywall-only metadata + OA leg"
            );
            (None, Some(e))
        }
    };
    let crossref_meta = cross
        .as_ref()
        .and_then(|c| c.metadata_json.clone())
        .unwrap_or(Value::Null);
    let extracted = extract_crossref_fields(&crossref_meta);

    // --- Unpaywall: license + ordered OA URL candidates ---
    let unpaywall = unpaywall_source_from_env(&unpaywall_contact);
    let upw_result = unpaywall.fetch(ref_, profile, ctx).await;
    let (license, source_label, oa_chain) = match upw_result {
        Ok(r) => {
            let chain = extract_oa_url_chain(r.metadata_json.as_ref());
            let label = if r.license != "unknown" {
                "unpaywall".to_string()
            } else {
                "crossref".to_string()
            };
            (r.license, label, chain)
        }
        Err(e) => {
            tracing::warn!(
                error = %e,
                doi = %doi.as_str(),
                "unpaywall fetch failed; OA chain will be empty (downstream PdfLegStatus::NoOaUrl \
                 is conservative — Unpaywall was unreachable, not authoritatively oa-free)"
            );
            ("unknown".to_string(), "crossref".to_string(), Vec::new())
        }
    };

    // --- PDF leg: walk the candidate chain until one succeeds ---
    let (pdf_leg, pdf_bytes) = if oa_chain.is_empty() {
        (PdfLegStatus::NoOaUrl, None)
    } else {
        let mut succeeded: Option<Vec<u8>> = None;
        let mut last_err: Option<HttpError> = None;
        let total = oa_chain.len();
        for (idx, candidate) in oa_chain.iter().enumerate() {
            let attempt = idx + 1;
            tracing::debug!(
                attempt,
                total,
                url = %candidate,
                "trying OA PDF candidate (ADR-0029 chain)"
            );
            match try_fetch_oa_pdf(doi, candidate, ctx).await {
                Ok((bytes, _final_url)) => {
                    if attempt > 1 {
                        tracing::info!(
                            attempt,
                            total,
                            url = %candidate,
                            "OA PDF chain succeeded on fallback candidate (ADR-0029)"
                        );
                    }
                    succeeded = Some(bytes);
                    break;
                }
                Err(e) => {
                    tracing::warn!(
                        attempt,
                        total,
                        url = %candidate,
                        error = %e,
                        "OA PDF candidate failed; advancing to next (ADR-0029 chain)"
                    );
                    last_err = Some(e);
                }
            }
        }
        match (succeeded, last_err) {
            (Some(bytes), _) => (PdfLegStatus::Fetched, Some(bytes)),
            (None, Some(e)) => {
                // Report the last failure; offer an arXiv id if any candidate
                // URL pointed at arXiv.
                let fe = FetchError::Http(e);
                let denial: Option<crate::DenialContext> = (&fe).into();
                let message = fe.to_string();
                let code: crate::ErrorCode = fe.into();
                let suggested_arxiv_id = oa_chain.iter().find_map(extract_arxiv_id_from_url);
                (
                    PdfLegStatus::Blocked {
                        code,
                        message,
                        denial,
                        suggested_arxiv_id,
                    },
                    None,
                )
            }
            (None, None) => {
                // A non-empty chain always records a success or an error, so
                // this arm is defensive only.
                tracing::error!(
                    total = oa_chain.len(),
                    "OA PDF chain walker exhausted without recording success or error \
                     (defensive fallback — should be unreachable)"
                );
                (
                    PdfLegStatus::Blocked {
                        code: crate::ErrorCode::InternalError,
                        message:
                            "OA PDF chain walker exhausted without recording success or error \
                             (orchestrator bug — please report)"
                                .to_string(),
                        denial: None,
                        suggested_arxiv_id: None,
                    },
                    None,
                )
            }
        }
    };

    // With neither Crossref metadata nor a PDF there is nothing worth storing.
    if let Some(e) = crossref_err {
        if pdf_bytes.is_none() {
            return Err(e);
        }
    }

    let (final_source_label, size_bytes, pdf_path_relative, pdf_staged) = match &pdf_bytes {
        Some(bytes) => {
            let staged = stage_pdf_to_tempfile(bytes)?;
            (
                "oa-publisher".to_string(),
                bytes.len() as u64,
                Some(format!("{}.pdf", safekey.as_str())),
                Some(staged),
            )
        }
        None => (source_label, 0u64, None, None),
    };

    let metadata = Metadata {
        schema_version: SCHEMA_VERSION.to_string(),
        title: extracted.title.unwrap_or_else(|| doi.as_str().to_string()),
        authors: extracted.authors,
        year: extracted.year,
        doi: Some(doi.clone()),
        arxiv_id: None,
        abstract_: None,
        venue: extracted.venue,
        volume: extracted.volume,
        issue: extracted.issue,
        pages: extracted.pages,
        publisher: None,
        issn: None,
        isbn: None,
        type_: extracted.type_,
        keywords: Vec::new(),
        url: cross
            .as_ref()
            .and_then(|c| c.final_url.as_ref())
            .map(|u| u.to_string()),
        pdf_path: pdf_path_relative,
        doiget: Some(DoigetExtension {
            fetched_at: Utc::now(),
            source: final_source_label.clone(),
            license: license.clone(),
            size_bytes,
            mcp_call_id: None,
        }),
        other: BTreeMap::new(),
    };

    // A staged PDF must reach the store. If its tempfile path is not UTF-8,
    // fail (as the arXiv path does) rather than dropping the PDF while still
    // recording `pdf_path` and reporting `PdfLegStatus::Fetched`.
    let pdf_src_path = match pdf_staged.as_ref() {
        Some(tmp) => Some(
            Utf8Path::from_path(tmp.path())
                .ok_or_else(|| FetchError::SourceSchema {
                    hint: "staging tempfile path is not UTF-8".to_string(),
                })?
                .to_path_buf(),
        ),
        None => None,
    };
    write_metadata_and_pdf(store, safekey, &metadata, pdf_src_path.as_deref(), ctx)?;
    // The tempfile is only needed until the store has taken the PDF.
    drop(pdf_staged);

    let path = if pdf_bytes.is_some() {
        store_root.join(format!("{}.pdf", safekey.as_str()))
    } else {
        store_root
            .join(".metadata")
            .join(format!("{}.toml", safekey.as_str()))
    };
    let canonical_digest = crate::CanonicalRef::new(
        crate::SourceType::Doi,
        doi.as_str(),
        &final_source_label,
        None,
    )
    .digest_hex();
    Ok(FetchPaperOutcome {
        source: final_source_label.clone(),
        resolver_profile: final_source_label,
        license,
        path,
        size_bytes,
        schema_version: SCHEMA_VERSION.to_string(),
        pdf_leg,
        safekey: safekey.as_str().to_string(),
        canonical_digest,
    })
}
/// Writes `bytes` to a fresh named temporary file and returns its handle.
///
/// The returned handle must be kept alive for as long as the file's path is
/// in use.
///
/// # Errors
///
/// [`FetchError::SourceSchema`] when the tempfile cannot be created or the
/// bytes cannot be written.
fn stage_pdf_to_tempfile(bytes: &[u8]) -> Result<tempfile::NamedTempFile, FetchError> {
    let staged = match tempfile::NamedTempFile::new() {
        Ok(file) => file,
        Err(e) => {
            return Err(FetchError::SourceSchema {
                hint: format!("creating PDF staging tempfile: {e}"),
            })
        }
    };
    if let Err(e) = std::fs::write(staged.path(), bytes) {
        return Err(FetchError::SourceSchema {
            hint: format!("staging PDF bytes: {e}"),
        });
    }
    Ok(staged)
}
/// Writes `metadata` (and the PDF at `pdf_src`, when given) to the store and
/// appends a `StoreWrite` row to the provenance log.
///
/// The logged store path is `<safekey>.pdf` when a PDF is supplied and
/// `.metadata/<safekey>.toml` otherwise. The logged reference is the DOI when
/// present, else the arXiv id. The canonical digest is only computed when
/// both an identifier and a source name are available.
///
/// # Errors
///
/// * On a failed store write: [`FetchError::SourceSchema`] wrapping the store
///   error. An `Err` row is still appended on a best-effort basis; if that
///   append also fails, both failures are logged via `tracing::error!`.
/// * On a successful store write whose log append fails: the log error,
///   converted into [`FetchError`].
fn write_metadata_and_pdf(
    store: &dyn Store,
    safekey: &Safekey,
    metadata: &Metadata,
    pdf_src: Option<&Utf8Path>,
    ctx: &FetchContext,
) -> Result<(), FetchError> {
    let store_path_relative = match pdf_src {
        Some(_) => format!("{}.pdf", safekey.as_str()),
        None => format!(".metadata/{}.toml", safekey.as_str()),
    };

    let provenance = metadata.doiget.as_ref();
    let size_bytes = provenance.map_or(0, |d| d.size_bytes);
    let license = provenance.map(|d| d.license.as_str());
    let source_name = provenance.map(|d| d.source.as_str());

    // DOI takes precedence over the arXiv id when both are set.
    let identity = match (&metadata.doi, &metadata.arxiv_id) {
        (Some(doi), _) => Some((crate::SourceType::Doi, doi.as_str())),
        (None, Some(arxiv)) => Some((crate::SourceType::Arxiv, arxiv.as_str())),
        (None, None) => None,
    };
    let ref_label = identity.as_ref().map(|(_, id)| *id);
    let canonical_digest: Option<String> = identity.zip(source_name).map(|((kind, id), src)| {
        crate::CanonicalRef::new(kind, id, src, None).digest_hex()
    });

    if let Err(store_err) = store.write(safekey, metadata, pdf_src) {
        let logged = ctx.log.append(RowInput {
            event: LogEvent::StoreWrite,
            result: LogResult::Err,
            capability: Capability::Oa,
            ref_: ref_label,
            source: source_name,
            error_code: Some("STORE_ERROR"),
            size_bytes: None,
            license: None,
            store_path: Some(&store_path_relative),
            canonical_digest: canonical_digest.as_deref(),
        });
        if let Err(log_err) = logged {
            tracing::error!(
                store_err = %store_err,
                log_err = %log_err,
                "BOTH store.write AND provenance log append failed; \
                 audit trail is broken for this attempt"
            );
        }
        return Err(FetchError::SourceSchema {
            hint: format!("store write failed: {store_err}"),
        });
    }

    ctx.log.append(RowInput {
        event: LogEvent::StoreWrite,
        result: LogResult::Ok,
        capability: Capability::Oa,
        ref_: ref_label,
        source: source_name,
        error_code: None,
        size_bytes: Some(size_bytes),
        license,
        store_path: Some(&store_path_relative),
        canonical_digest: canonical_digest.as_deref(),
    })?;
    Ok(())
}
/// Attempts to download one open-access PDF candidate for `doi` from `url`.
///
/// Every attempt is attributed to the `"oa-publisher"` source: a rate-limiter
/// permit is acquired for it, and a `Fetch` row is appended to the provenance
/// log for both success and failure.
///
/// Returns the PDF bytes together with the final URL reported by the HTTP
/// layer.
///
/// # Errors
///
/// * [`HttpError::RedirectDenied`] when an allowlist is configured for the
///   source and the URL's host does not match it; `fetch_pdf` is not called
///   in that case.
/// * Whatever [`HttpError`] `ctx.http.fetch_pdf` reports.
async fn try_fetch_oa_pdf(
doi: &Doi,
url: &url::Url,
ctx: &FetchContext,
) -> Result<(Vec<u8>, url::Url), HttpError> {
const SOURCE: &str = "oa-publisher";
// Held until the function returns.
let _permit = ctx.rate_limiter.acquire(SOURCE).await;
let canonical =
crate::CanonicalRef::new(crate::SourceType::Doi, doi.as_str(), SOURCE, None).digest_hex();
// Pre-fetch host check: reject the URL before calling `fetch_pdf` when its
// host is outside the source's allowlist.
if let Some(allowlist) = ctx.http.source_allowlist(SOURCE) {
// A URL without a host is checked as the empty string.
let host = url
.host_str()
.map(|h| h.to_ascii_lowercase())
.unwrap_or_default();
if !allowlist.matches(&host) {
let e = HttpError::RedirectDenied {
source_key: SOURCE.to_string(),
host: host.clone(),
expected_hosts: allowlist.redirect_hosts.clone(),
};
tracing::info!(
oa_url = %url,
denied_host = %host,
"OA URL host outside oa-publisher allowlist (pre-fetch check, \
docs/REDIRECT_ALLOWLIST.md §1 / issue #145)"
);
// Best effort: a failed log append does not mask the denial.
let _ = ctx.log.append(RowInput {
event: LogEvent::Fetch,
result: LogResult::Err,
capability: Capability::Oa,
ref_: Some(doi.as_str()),
source: Some(SOURCE),
error_code: Some(crate::ErrorCode::NetworkError.as_wire()),
size_bytes: None,
license: None,
store_path: None,
canonical_digest: Some(&canonical),
});
return Err(e);
}
}
match ctx.http.fetch_pdf(SOURCE, url.clone()).await {
Ok((body, final_url)) => {
let size_bytes = body.len() as u64;
// A failed log append is only warned about; the download still counts.
if let Err(e) = ctx.log.append(RowInput {
event: LogEvent::Fetch,
result: LogResult::Ok,
capability: Capability::Oa,
ref_: Some(doi.as_str()),
source: Some(SOURCE),
error_code: None,
size_bytes: Some(size_bytes),
license: None,
store_path: None,
canonical_digest: Some(&canonical),
}) {
tracing::warn!(error = %e, "appending oa-publisher Fetch ok row failed");
}
Ok((body.to_vec(), final_url))
}
Err(e) => {
// Denied redirects and non-PDF bodies are logged at info level;
// every other failure at warn level.
match &e {
HttpError::RedirectDenied { host, .. } => {
tracing::info!(
oa_url = %url,
denied_host = %host,
"OA URL host outside oa-publisher allowlist"
);
}
HttpError::NotAPdf { .. } => {
tracing::info!(
oa_url = %url,
"OA URL did not return a PDF magic byte"
);
}
other => {
tracing::warn!(
oa_url = %url,
error = %other,
"OA PDF fetch failed"
);
}
}
// The provenance row records `NetworkError` for every failure
// variant. Best effort: the append result is ignored.
let _ = ctx.log.append(RowInput {
event: LogEvent::Fetch,
result: LogResult::Err,
capability: Capability::Oa,
ref_: Some(doi.as_str()),
source: Some(SOURCE),
error_code: Some(crate::ErrorCode::NetworkError.as_wire()),
size_bytes: None,
license: None,
store_path: None,
canonical_digest: Some(&canonical),
});
Err(e)
}
}
}
/// Bibliographic fields parsed from a Crossref work message by
/// [`extract_crossref_fields`].
pub(crate) struct CrossrefFields {
/// First entry of the `title` array.
pub(crate) title: Option<String>,
/// Author names from the `author` array; `"family, given"` when both parts
/// are present, otherwise whichever part exists.
pub(crate) authors: Vec<String>,
/// First number of the first `issued.date-parts` entry.
pub(crate) year: Option<i32>,
/// First entry of the `container-title` array.
pub(crate) venue: Option<String>,
/// The `volume` string.
pub(crate) volume: Option<String>,
/// The `issue` string.
pub(crate) issue: Option<String>,
/// The `page` string, passed through `normalize_page_range`.
pub(crate) pages: Option<String>,
/// The `type` string.
pub(crate) type_: Option<String>,
}
pub(crate) fn extract_crossref_fields(msg: &Value) -> CrossrefFields {
let title = msg
.get("title")
.and_then(|v| v.as_array())
.and_then(|arr| arr.first())
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let authors = msg
.get("author")
.and_then(|v| v.as_array())
.map(|arr| {
arr.iter()
.filter_map(|a| {
let family = a.get("family").and_then(|v| v.as_str());
let given = a.get("given").and_then(|v| v.as_str());
match (family, given) {
(Some(f), Some(g)) => Some(format!("{f}, {g}")),
(Some(f), None) => Some(f.to_string()),
(None, Some(g)) => Some(g.to_string()),
_ => None,
}
})
.collect()
})
.unwrap_or_default();
let year = msg
.get("issued")
.and_then(|v| v.get("date-parts"))
.and_then(|v| v.as_array())
.and_then(|arr| arr.first())
.and_then(|v| v.as_array())
.and_then(|arr| arr.first())
.and_then(|v| v.as_i64())
.and_then(|n| i32::try_from(n).ok());
let venue = msg
.get("container-title")
.and_then(|v| v.as_array())
.and_then(|arr| arr.first())
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let type_ = msg
.get("type")
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let volume = msg
.get("volume")
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let issue = msg
.get("issue")
.and_then(|v| v.as_str())
.map(|s| s.to_string());
let pages = msg
.get("page")
.and_then(|v| v.as_str())
.map(normalize_page_range);
CrossrefFields {
title,
authors,
year,
venue,
volume,
issue,
pages,
type_,
}
}
fn extract_oa_url_chain(meta: Option<&Value>) -> Vec<url::Url> {
let meta = match meta {
Some(m) => m,
None => return Vec::new(),
};
let mut out: Vec<url::Url> = Vec::new();
let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
let mut push_unique = |u: url::Url| {
let key = u.as_str().to_string();
if seen.insert(key) {
out.push(u);
}
};
if let Some(best) = meta.get("best_oa_location") {
if let Some(u) = pull_oa_url_from_location(best) {
push_unique(u);
}
}
if let Some(arr) = meta.get("oa_locations").and_then(|v| v.as_array()) {
for loc in arr {
if let Some(u) = pull_oa_url_from_location(loc) {
push_unique(u);
}
}
}
out
}
fn pull_oa_url_from_location(loc: &Value) -> Option<url::Url> {
let candidate = loc
.get("url_for_pdf")
.and_then(|v| v.as_str())
.or_else(|| loc.get("url").and_then(|v| v.as_str()))?;
url::Url::parse(candidate).ok()
}
/// Recovers a version-less arXiv id from an arXiv `/pdf/…` or `/abs/…` URL.
///
/// Only the hosts `arxiv.org`, `www.arxiv.org`, `export.arxiv.org` and
/// `e-print.arxiv.org` are recognised. For `/pdf/` paths a trailing `.pdf` is
/// removed; in both cases trailing slashes and a `vN` version suffix are
/// stripped.
///
/// Returns `None` for other hosts, other paths, or when nothing is left after
/// the prefix (e.g. `https://arxiv.org/abs/`), so callers never receive an
/// empty id.
fn extract_arxiv_id_from_url(url: &url::Url) -> Option<String> {
    let host = url.host_str()?;
    let is_arxiv = matches!(
        host,
        "arxiv.org" | "www.arxiv.org" | "export.arxiv.org" | "e-print.arxiv.org"
    );
    if !is_arxiv {
        return None;
    }

    let path = url.path();
    let raw = if let Some(rest) = path.strip_prefix("/pdf/") {
        let rest = rest.trim_end_matches('/');
        rest.strip_suffix(".pdf").unwrap_or(rest)
    } else {
        path.strip_prefix("/abs/")?.trim_end_matches('/')
    };

    let id = strip_arxiv_version(raw);
    if id.is_empty() {
        None
    } else {
        Some(id.to_string())
    }
}
fn strip_arxiv_version(id: &str) -> &str {
if let Some(v_pos) = id.rfind('v') {
let before_v = id[..v_pos].chars().next_back();
let suffix = &id[v_pos + 1..];
if before_v.is_some_and(|c| c.is_ascii_digit())
&& !suffix.is_empty()
&& suffix.bytes().all(|b| b.is_ascii_digit())
{
return &id[..v_pos];
}
}
id
}
/// Returns the Unpaywall contact email from `DOIGET_UNPAYWALL_EMAIL`, or
/// `fallback_contact` when the variable is unset or not valid Unicode.
fn unpaywall_email_from_env(fallback_contact: &str) -> String {
    match std::env::var("DOIGET_UNPAYWALL_EMAIL") {
        Ok(email) => email,
        Err(_) => fallback_contact.to_string(),
    }
}
/// One entry of a batch fetch: the reference and what fetching it produced.
#[derive(Debug)]
pub struct BatchResultEntry {
/// The reference this entry belongs to (a clone of the input reference).
pub ref_: Ref,
/// The result of fetching `ref_`; failures are recorded here rather than
/// aborting the batch.
pub outcome: Result<FetchPaperOutcome, FetchError>,
}
/// Result of a batch fetch.
#[derive(Debug)]
#[non_exhaustive]
pub struct BatchOutcome {
/// One entry per input reference, in input order.
pub results: Vec<BatchResultEntry>,
}
/// Fetches every reference in `refs`, one after another.
///
/// Individual failures do not abort the batch: each reference's result is
/// recorded in its own [`BatchResultEntry`], in input order.
///
/// # Errors
///
/// [`FetchError::TooManyRefs`] when `refs` holds more than `MAX_BATCH_REFS`
/// entries; nothing is fetched in that case.
pub async fn batch_fetch(
    refs: &[Ref],
    profile: &CapabilityProfile,
    ctx: &FetchContext,
    store: &dyn Store,
    store_root: &Utf8Path,
) -> Result<BatchOutcome, FetchError> {
    let requested = refs.len();
    if requested > MAX_BATCH_REFS {
        return Err(FetchError::TooManyRefs {
            got: requested,
            max: MAX_BATCH_REFS,
        });
    }

    let mut results = Vec::with_capacity(requested);
    for current in refs {
        results.push(BatchResultEntry {
            outcome: fetch_paper(current, profile, ctx, store, store_root).await,
            ref_: current.clone(),
        });
    }
    Ok(BatchOutcome { results })
}
/// Builds the dry-run plan for every reference in `refs`, in input order.
///
/// # Errors
///
/// * [`FetchError::TooManyRefs`] when `refs` holds more than
///   `MAX_BATCH_REFS` entries.
/// * The first error reported by [`try_build_fetch_plan`]; planning stops at
///   that reference.
pub fn batch_fetch_plans(
    refs: &[Ref],
    store_root: &Utf8Path,
) -> Result<Vec<(Ref, FetchPlan)>, FetchError> {
    let requested = refs.len();
    if requested > MAX_BATCH_REFS {
        return Err(FetchError::TooManyRefs {
            got: requested,
            max: MAX_BATCH_REFS,
        });
    }

    let mut plans = Vec::with_capacity(requested);
    for current in refs {
        let plan = try_build_fetch_plan(current, store_root)?;
        plans.push((current.clone(), plan));
    }
    Ok(plans)
}
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
use super::*;
fn crossref_outcome() -> MetadataOnlyOutcome {
MetadataOnlyOutcome {
source: "crossref".to_string(),
resolver_profile: "crossref".to_string(),
license: None,
oa_url: None,
metadata: serde_json::json!({
"title": ["Rigorous results on valence-bond ground states"],
"author": [
{ "family": "Affleck", "given": "Ian" },
{ "family": "Lieb", "given": "Elliott H." },
],
"issued": { "date-parts": [[1988, 6, 1]] },
"container-title": ["Physical Review Letters"],
"publisher": "American Physical Society",
"ISSN": ["0031-9007", "1079-7114"],
"volume": "59",
"issue": "7",
"page": "799-802",
"type": "journal-article",
}),
}
}
#[test]
fn cite_metadata_enriches_from_crossref_envelope() {
let ref_ = Ref::parse("10.1103/PhysRevLett.59.799").unwrap();
let m = cite_metadata(&ref_, &crossref_outcome());
assert_eq!(m.title, "Rigorous results on valence-bond ground states");
assert_eq!(m.authors, vec!["Affleck, Ian", "Lieb, Elliott H."]);
assert_eq!(m.year, Some(1988));
assert_eq!(m.venue.as_deref(), Some("Physical Review Letters"));
assert_eq!(m.publisher.as_deref(), Some("American Physical Society"));
assert_eq!(m.issn.as_deref(), Some("0031-9007"));
assert_eq!(m.volume.as_deref(), Some("59"));
assert_eq!(m.issue.as_deref(), Some("7"));
assert_eq!(m.pages.as_deref(), Some("799--802"));
assert_eq!(m.type_.as_deref(), Some("journal-article"));
}
#[test]
fn cite_metadata_non_crossref_keeps_minimal_baseline() {
let ref_ = Ref::parse("arxiv:2401.12345").unwrap();
let outcome = MetadataOnlyOutcome {
source: "arxiv".to_string(),
resolver_profile: "arxiv".to_string(),
license: Some("arxiv-default".to_string()),
oa_url: None,
metadata: serde_json::json!({ "title": "An arXiv Preprint" }),
};
let m = cite_metadata(&ref_, &outcome);
assert_eq!(m.title, "An arXiv Preprint");
assert_eq!(m.year, None);
assert_eq!(m.venue, None);
assert_eq!(m.publisher, None);
assert_eq!(m.issn, None);
assert!(m.arxiv_id.is_some());
}
#[test]
fn test_extract_arxiv_id_from_url() {
let urls = [
("https://arxiv.org/pdf/1901.12345.pdf", Some("1901.12345")),
("https://arxiv.org/abs/1901.12345", Some("1901.12345")),
("https://arxiv.org/pdf/1901.12345v2.pdf", Some("1901.12345")),
("https://arxiv.org/abs/1901.12345v3", Some("1901.12345")),
(
"https://www.arxiv.org/pdf/cond-mat/9501001.pdf",
Some("cond-mat/9501001"),
),
(
"https://export.arxiv.org/abs/cond-mat/9501001",
Some("cond-mat/9501001"),
),
(
"https://arxiv.org/pdf/cond-mat/9501001v1.pdf",
Some("cond-mat/9501001"),
),
(
"https://e-print.arxiv.org/pdf/2401.12345.pdf",
Some("2401.12345"),
),
("https://example.org/pdf/1901.12345.pdf", None),
];
for (url_str, expected) in urls {
let url = url::Url::parse(url_str).unwrap();
assert_eq!(
extract_arxiv_id_from_url(&url),
expected.map(String::from),
"url: {url_str}"
);
}
}
#[test]
fn test_strip_arxiv_version() {
assert_eq!(strip_arxiv_version("2401.12345v2"), "2401.12345");
assert_eq!(strip_arxiv_version("2401.12345v10"), "2401.12345");
assert_eq!(strip_arxiv_version("2401.12345"), "2401.12345");
assert_eq!(
strip_arxiv_version("cond-mat/9501001v3"),
"cond-mat/9501001"
);
assert_eq!(strip_arxiv_version("quant-phv5"), "quant-phv5");
}
#[test]
fn extract_crossref_oa_url_finds_first_url() {
let msg = serde_json::json!({
"link": [
{"URL": "https://example.org/free.pdf"},
{"URL": "https://example.org/alt.pdf"}
]
});
assert_eq!(
extract_crossref_oa_url(&msg),
Some("https://example.org/free.pdf".to_string())
);
}
#[test]
fn extract_crossref_oa_url_returns_none_when_absent() {
let msg = serde_json::json!({});
assert!(extract_crossref_oa_url(&msg).is_none());
}
#[test]
fn extract_crossref_oa_url_skips_empty_url_strings() {
let msg = serde_json::json!({
"link": [
{"URL": ""},
{"URL": "https://example.org/real.pdf"}
]
});
assert_eq!(
extract_crossref_oa_url(&msg),
Some("https://example.org/real.pdf".to_string())
);
}
#[test]
fn extract_unpaywall_oa_url_prefers_url_for_pdf() {
let meta = serde_json::json!({
"best_oa_location": {
"url_for_pdf": "https://example.org/pdf",
"url": "https://example.org/landing"
}
});
assert_eq!(
extract_unpaywall_oa_url(&meta),
Some("https://example.org/pdf".to_string())
);
}
#[test]
fn extract_unpaywall_oa_url_falls_back_to_url() {
let meta = serde_json::json!({
"best_oa_location": {
"url": "https://example.org/landing"
}
});
assert_eq!(
extract_unpaywall_oa_url(&meta),
Some("https://example.org/landing".to_string())
);
}
#[test]
fn extract_unpaywall_oa_url_returns_none_when_absent() {
let meta = serde_json::json!({});
assert!(extract_unpaywall_oa_url(&meta).is_none());
}
#[test]
fn extract_crossref_fields_parses_minimal_shape() {
let msg = serde_json::json!({
"title": ["Example Title"],
"author": [{ "family": "Smith", "given": "Alice" }],
"issued": { "date-parts": [[2024, 1, 15]] },
"container-title": ["Phys. Rev. X"],
"type": "journal-article"
});
let f = extract_crossref_fields(&msg);
assert_eq!(f.title.as_deref(), Some("Example Title"));
assert_eq!(f.authors, vec!["Smith, Alice".to_string()]);
assert_eq!(f.year, Some(2024));
assert_eq!(f.venue.as_deref(), Some("Phys. Rev. X"));
assert_eq!(f.type_.as_deref(), Some("journal-article"));
}
#[test]
fn extract_crossref_fields_tolerates_missing() {
let f = extract_crossref_fields(&serde_json::json!({}));
assert!(f.title.is_none());
assert!(f.authors.is_empty());
assert!(f.year.is_none());
assert!(f.venue.is_none());
assert!(f.type_.is_none());
}
#[test]
fn extract_oa_url_chain_prefers_best_url_for_pdf() {
let meta = serde_json::json!({
"best_oa_location": {
"url_for_pdf": "https://example.org/pdf",
"url": "https://example.org/landing"
}
});
let chain = extract_oa_url_chain(Some(&meta));
assert_eq!(chain.len(), 1);
assert_eq!(chain[0].as_str(), "https://example.org/pdf");
}
#[test]
fn extract_oa_url_chain_falls_back_to_url_when_url_for_pdf_absent() {
let meta = serde_json::json!({
"best_oa_location": {
"url": "https://example.org/landing"
}
});
let chain = extract_oa_url_chain(Some(&meta));
assert_eq!(chain.len(), 1);
assert_eq!(chain[0].as_str(), "https://example.org/landing");
}
/// Both an empty JSON object and `None` metadata produce an empty chain.
#[test]
fn extract_oa_url_chain_is_empty_when_no_locations() {
    let empty = serde_json::json!({});
    assert_eq!(extract_oa_url_chain(Some(&empty)).len(), 0);
    assert_eq!(extract_oa_url_chain(None).len(), 0);
}
/// The chain starts with the `best_oa_location` URL, followed by the
/// `oa_locations[]` entries in their listed order; the entry repeating the
/// best URL does not appear a second time.
#[test]
fn extract_oa_url_chain_appends_oa_locations_after_best() {
    let unpaywall = serde_json::json!({
        "best_oa_location": {
            "url_for_pdf": "https://publisher.example.org/pdf"
        },
        "oa_locations": [
            {"url_for_pdf": "https://publisher.example.org/pdf"},
            {"url_for_pdf": "https://arxiv.org/pdf/2401.12345"},
            {"url": "https://repo.example.edu/handle/123"}
        ]
    });
    let expected = [
        "https://publisher.example.org/pdf",
        "https://arxiv.org/pdf/2401.12345",
        "https://repo.example.edu/handle/123",
    ];

    let chain = extract_oa_url_chain(Some(&unpaywall));
    let actual: Vec<&str> = chain.iter().map(|u| u.as_str()).collect();

    assert_eq!(
        actual, expected,
        "chain ordering MUST be best_oa_location first, oa_locations[] verbatim after"
    );
}
/// A URL that appears in `best_oa_location` and repeatedly in
/// `oa_locations` is emitted only once.
#[test]
fn extract_oa_url_chain_dedupes_repeated_urls() {
    let unpaywall = serde_json::json!({
        "best_oa_location": {
            "url_for_pdf": "https://example.org/pdf"
        },
        "oa_locations": [
            {"url_for_pdf": "https://example.org/pdf"},
            {"url_for_pdf": "https://example.org/pdf"},
            {"url_for_pdf": "https://arxiv.org/pdf/2401.12345"}
        ]
    });

    let chain = extract_oa_url_chain(Some(&unpaywall));
    let urls: Vec<&str> = chain.iter().map(|u| u.as_str()).collect();

    assert_eq!(
        urls,
        vec!["https://example.org/pdf", "https://arxiv.org/pdf/2401.12345"]
    );
}
/// An `oa_locations` entry whose value is not a URL is left out of the
/// chain; the entries around it are kept.
#[test]
fn extract_oa_url_chain_skips_unparsable_urls() {
    let unpaywall = serde_json::json!({
        "best_oa_location": {
            "url_for_pdf": "https://good.example.org/pdf"
        },
        "oa_locations": [
            {"url_for_pdf": "not a url"},
            {"url_for_pdf": "https://arxiv.org/pdf/2401.12345"}
        ]
    });

    let chain = extract_oa_url_chain(Some(&unpaywall));
    let urls: Vec<&str> = chain.iter().map(|u| u.as_str()).collect();

    assert_eq!(
        urls,
        vec![
            "https://good.example.org/pdf",
            "https://arxiv.org/pdf/2401.12345"
        ]
    );
}
/// `fetch_paper_plan` and `build_fetch_plan` agree on the compared fields
/// for a DOI ref and on the first PDF source key for an arXiv ref.
#[test]
fn fetch_paper_plan_matches_build_fetch_plan() {
    use crate::{ArxivId, Doi};

    let root = Utf8PathBuf::from("/tmp/doiget-test");

    // DOI ref: metadata sources and both target paths must match.
    let doi_ref = Ref::Doi(Doi("10.1234/example".to_string()));
    let via_wrapper = fetch_paper_plan(&doi_ref, &root);
    let via_builder = build_fetch_plan(&doi_ref, &root);
    assert_eq!(via_wrapper.metadata_sources, via_builder.metadata_sources);
    assert_eq!(via_wrapper.target_pdf_path, via_builder.target_pdf_path);
    assert_eq!(
        via_wrapper.target_metadata_path,
        via_builder.target_metadata_path
    );

    // arXiv ref: only the key of the first PDF source is compared.
    let arxiv_ref = Ref::Arxiv(ArxivId("2401.12345".to_string()));
    let arxiv_wrapper = fetch_paper_plan(&arxiv_ref, &root);
    let arxiv_builder = build_fetch_plan(&arxiv_ref, &root);
    assert_eq!(
        arxiv_wrapper.pdf_sources[0].key,
        arxiv_builder.pdf_sources[0].key
    );
}
/// `batch_fetch_plans` returns one `(ref, plan)` pair per input, in input
/// order.
#[test]
fn batch_fetch_plans_returns_plan_per_ref_in_order() {
    use crate::{ArxivId, Doi};

    let root = Utf8PathBuf::from("/tmp/doiget-batch-test");
    let inputs = vec![
        Ref::Doi(Doi("10.1234/alpha".to_string())),
        Ref::Arxiv(ArxivId("2401.12345".to_string())),
    ];

    let plans = batch_fetch_plans(&inputs, &root).expect("under cap returns Ok");
    assert_eq!(plans.len(), 2);

    let doi_entry = &plans[0];
    let arxiv_entry = &plans[1];
    assert!(matches!(doi_entry.0, Ref::Doi(_)));
    assert!(matches!(arxiv_entry.0, Ref::Arxiv(_)));
    assert_eq!(
        doi_entry.1.metadata_sources,
        vec!["crossref", "unpaywall"]
    );
    assert_eq!(arxiv_entry.1.pdf_sources[0].key, "arxiv");
}
/// One ref over `MAX_BATCH_REFS` makes `batch_fetch_plans` fail with
/// `FetchError::TooManyRefs` reporting the actual count and the cap.
#[test]
fn batch_fetch_plans_too_many_refs_returns_err() {
    use crate::Doi;

    let over_cap = MAX_BATCH_REFS + 1;
    let mut refs: Vec<Ref> = Vec::with_capacity(over_cap);
    for i in 0..over_cap {
        refs.push(Ref::Doi(Doi(format!("10.1234/n{}", i))));
    }
    let root = Utf8PathBuf::from("/tmp/doiget-toomany");

    let failure = batch_fetch_plans(&refs, &root).expect_err("over cap returns Err");
    let (got, max) = match failure {
        FetchError::TooManyRefs { got, max } => (got, max),
        other => panic!("expected TooManyRefs, got: {other:?}"),
    };
    assert_eq!(got, over_cap);
    assert_eq!(max, MAX_BATCH_REFS);
}
/// `batch_fetch` given `MAX_BATCH_REFS + 1` refs returns
/// `FetchError::TooManyRefs` with the observed count and the cap.
#[tokio::test]
async fn batch_fetch_too_many_refs_returns_err_before_any_fetch() {
    use crate::http::{tier_1_allowlist, HttpClient};
    use crate::provenance::ProvenanceLog;
    use crate::rate_limiter::RateLimiter;
    use crate::store::FsStore;
    use crate::{Doi, RateLimits};
    use std::sync::Arc;

    // Scratch directory holding both the provenance log and the store.
    let scratch = tempfile::TempDir::new().expect("tempdir");
    let log_path = Utf8Path::from_path(scratch.path())
        .expect("utf-8")
        .join("log.jsonl");
    let store_root = Utf8Path::from_path(scratch.path())
        .expect("utf-8")
        .join("papers");

    let http = Arc::new(HttpClient::new(tier_1_allowlist()).expect("http client"));
    let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
    let log = Arc::new(
        ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
            .expect("provenance log"),
    );
    let ctx = FetchContext {
        http,
        rate_limiter,
        log,
        session_id: "01J0000000000000000000TEST".into(),
        cache_root: None,
    };
    let profile = CapabilityProfile::from_env().expect("clean env");
    let store = FsStore::new(store_root.clone()).expect("fs store");

    let over_cap = MAX_BATCH_REFS + 1;
    let mut refs: Vec<Ref> = Vec::with_capacity(over_cap);
    for i in 0..over_cap {
        refs.push(Ref::Doi(Doi(format!("10.1234/n{}", i))));
    }

    let failure = batch_fetch(&refs, &profile, &ctx, &store, &store_root)
        .await
        .expect_err("over cap returns Err");
    let (got, max) = match failure {
        FetchError::TooManyRefs { got, max } => (got, max),
        other => panic!("expected TooManyRefs, got: {other:?}"),
    };
    assert_eq!(got, over_cap);
    assert_eq!(max, MAX_BATCH_REFS);
}
/// A 200 response whose body is HTML makes `try_fetch_oa_pdf` return
/// `Err(HttpError::NotAPdf { .. })`.
#[tokio::test]
async fn try_fetch_oa_pdf_non_pdf_body_is_err_not_silent_none() {
    use crate::http::HttpClient;
    use crate::provenance::ProvenanceLog;
    use crate::rate_limiter::RateLimiter;
    use crate::{Doi, RateLimits};
    use std::sync::Arc;
    use wiremock::matchers::method;
    use wiremock::{Mock, MockServer, ResponseTemplate};

    // Mock server that answers every GET with an HTML body.
    let server = MockServer::start().await;
    let html_reply =
        ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec());
    Mock::given(method("GET"))
        .respond_with(html_reply)
        .mount(&server)
        .await;

    // The mock server's host is the one allowed for "oa-publisher".
    let server_url = server.uri().parse::<url::Url>().expect("uri");
    let host = server_url.host_str().expect("host").to_string();

    let scratch = tempfile::TempDir::new().expect("tempdir");
    let log_path = Utf8Path::from_path(scratch.path())
        .expect("utf-8")
        .join("log.jsonl");
    let http = Arc::new(HttpClient::new_for_tests_allow_http("oa-publisher", &host));
    let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
    let log = Arc::new(
        ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
            .expect("provenance log"),
    );
    let ctx = FetchContext {
        http,
        rate_limiter,
        log,
        session_id: "01J0000000000000000000TEST".into(),
        cache_root: None,
    };

    let doi = Doi("10.1234/example".to_string());
    let url: url::Url = format!("{}/oa.pdf", server.uri()).parse().expect("url");

    match try_fetch_oa_pdf(&doi, &url, &ctx).await {
        Err(HttpError::NotAPdf { .. }) => {}
        other => panic!("expected Err(NotAPdf), got: {other:?}"),
    }
}
/// An OA URL whose host is not on the `oa-publisher` allowlist must be
/// rejected with `HttpError::RedirectDenied` and no request may reach the
/// server.
///
/// The test then checks three things about the denial:
/// 1. the error's `source_key`, `host` and `expected_hosts` fields;
/// 2. the `DenialContext` obtained from the error via `Into`;
/// 3. that a `fetch`/`err` row with the expected `source`, `error_code`
///    and `ref` was written to the provenance log.
#[tokio::test]
async fn try_fetch_oa_pdf_off_allowlist_host_no_redirect_is_redirect_denied_145() {
    use crate::http::HttpClient;
    use crate::provenance::ProvenanceLog;
    use crate::rate_limiter::RateLimiter;
    use crate::{DenialContext, DenialReason, Doi, RateLimits};
    use std::sync::Arc;
    use wiremock::matchers::method;
    use wiremock::{Mock, MockServer, ResponseTemplate};

    // The server would happily serve a PDF if it were ever contacted.
    let server = MockServer::start().await;
    Mock::given(method("GET"))
        .respond_with(ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7 real pdf".to_vec()))
        .mount(&server)
        .await;

    let td = tempfile::TempDir::new().expect("tempdir");
    let log_path = Utf8Path::from_path(td.path())
        .expect("utf-8")
        .join("log.jsonl");
    // The allowlist names a host that is NOT the mock server's host.
    let ctx = FetchContext {
        http: Arc::new(HttpClient::new_for_tests_allow_http(
            "oa-publisher",
            "allowed-publisher.example.com",
        )),
        rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
        log: Arc::new(
            ProvenanceLog::open(log_path.clone(), "01J0000000000000000000TEST".into())
                .expect("provenance log"),
        ),
        session_id: "01J0000000000000000000TEST".into(),
        cache_root: None,
    };

    let doi = Doi("10.1234/example".to_string());
    let off_host_url: url::Url = format!("{}/oa.pdf", server.uri()).parse().expect("url");
    let res = try_fetch_oa_pdf(&doi, &off_host_url, &ctx).await;
    let err = match res {
        Err(e @ HttpError::RedirectDenied { .. }) => e,
        other => {
            panic!("expected Err(RedirectDenied) from the pre-fetch check, got: {other:?}")
        }
    };

    // (1) Fields carried by the error itself.
    match &err {
        HttpError::RedirectDenied {
            source_key,
            host,
            expected_hosts,
        } => {
            assert_eq!(source_key, "oa-publisher");
            assert_eq!(
                host,
                off_host_url
                    .host_str()
                    .expect("wiremock host")
                    .to_ascii_lowercase()
                    .as_str()
            );
            assert_eq!(
                expected_hosts,
                &vec!["allowed-publisher.example.com".to_string()]
            );
        }
        // The variant was already established by the match above.
        _ => unreachable!(),
    }

    // `received_requests` yields `None` when request recording is disabled.
    // Treating that as "no requests" would let the assertion below pass
    // without checking anything, so fail loudly instead.
    let recorded = server
        .received_requests()
        .await
        .expect("wiremock request recording must be enabled for this assertion");
    assert!(
        recorded.is_empty(),
        "the off-allowlist OA URL must NOT be fetched: the pre-check \
         (REDIRECT_ALLOWLIST.md §1) rejects it before any request is \
         issued; wiremock recorded request(s)",
    );

    // (2) The structured denial context derived from the error.
    let dc: Option<DenialContext> = (&err).into();
    let dc = dc.expect("pre-fetch RedirectDenied -> Some(DenialContext)");
    assert_eq!(dc.reason, DenialReason::RedirectNotInAllowlist);
    assert_eq!(dc.source.as_deref(), Some("oa-publisher"));
    assert_eq!(
        dc.attempted,
        Some(off_host_url.host_str().expect("host").to_ascii_lowercase()),
        "attempted host must be the rejected OA URL host, lowercased — \
         identical to what the redirect closure records",
    );
    assert_eq!(
        dc.expected,
        Some(vec!["allowed-publisher.example.com".to_string()]),
    );

    // (3) The provenance log: find the first row with event=fetch, result=err.
    let log_txt = std::fs::read_to_string(&log_path).expect("read provenance log");
    let fetch_err_row = log_txt
        .lines()
        .filter_map(|l| serde_json::from_str::<serde_json::Value>(l).ok())
        .find(|v| {
            v.get("event").and_then(|e| e.as_str()) == Some("fetch")
                && v.get("result").and_then(|r| r.as_str()) == Some("err")
        })
        .expect("a Fetch/err provenance row was written");
    assert_eq!(
        fetch_err_row.get("source").and_then(|s| s.as_str()),
        Some("oa-publisher"),
    );
    assert_eq!(
        fetch_err_row.get("error_code").and_then(|c| c.as_str()),
        Some("NETWORK_ERROR"),
    );
    assert_eq!(
        fetch_err_row.get("ref").and_then(|r| r.as_str()),
        Some("10.1234/example"),
    );
}
/// When the OA URL's host is the allowed `oa-publisher` host,
/// `try_fetch_oa_pdf` succeeds and returns the served bytes unchanged.
#[tokio::test]
async fn try_fetch_oa_pdf_on_allowlist_host_still_fetches_pdf_no_regression_145() {
    use crate::http::HttpClient;
    use crate::provenance::ProvenanceLog;
    use crate::rate_limiter::RateLimiter;
    use crate::{Doi, RateLimits};
    use std::sync::Arc;
    use wiremock::matchers::method;
    use wiremock::{Mock, MockServer, ResponseTemplate};

    // Mock server that answers every GET with a PDF-looking body.
    let server = MockServer::start().await;
    let pdf_body = b"%PDF-1.7\nhello pdf".to_vec();
    let pdf_reply = ResponseTemplate::new(200).set_body_bytes(pdf_body.clone());
    Mock::given(method("GET"))
        .respond_with(pdf_reply)
        .mount(&server)
        .await;

    // Allow exactly the mock server's host for "oa-publisher".
    let server_url = server.uri().parse::<url::Url>().expect("uri");
    let host = server_url.host_str().expect("host").to_string();

    let scratch = tempfile::TempDir::new().expect("tempdir");
    let log_path = Utf8Path::from_path(scratch.path())
        .expect("utf-8")
        .join("log.jsonl");
    let http = Arc::new(HttpClient::new_for_tests_allow_http("oa-publisher", &host));
    let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
    let log = Arc::new(
        ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
            .expect("provenance log"),
    );
    let ctx = FetchContext {
        http,
        rate_limiter,
        log,
        session_id: "01J0000000000000000000TEST".into(),
        cache_root: None,
    };

    let doi = Doi("10.1234/example".to_string());
    let url: url::Url = format!("{}/oa.pdf", server.uri()).parse().expect("url");

    let fetched = try_fetch_oa_pdf(&doi, &url, &ctx)
        .await
        .expect("on-allowlist OA URL still fetches the PDF");
    let (bytes, _final_url) = fetched;
    assert_eq!(bytes, pdf_body, "PDF bytes must be returned unchanged");
}
/// Two `HttpError::RedirectDenied` values with identical fields convert to
/// equal `DenialContext`s, and that context has the expected field values.
#[test]
fn pre_fetch_denial_produces_byte_identical_denial_context_as_redirect_denied_145() {
    use crate::{DenialContext, DenialReason};

    // Both errors come from the same recipe, so their fields are identical.
    let denied = || HttpError::RedirectDenied {
        source_key: "oa-publisher".to_string(),
        host: "attacker.test".to_string(),
        expected_hosts: vec!["*.springer.com".to_string(), "*.plos.org".to_string()],
    };
    let pre_fetch = denied();
    let redirect_closure = denied();

    let dc_pre = Into::<Option<DenialContext>>::into(&pre_fetch).expect("pre-fetch -> Some");
    let dc_red =
        Into::<Option<DenialContext>>::into(&redirect_closure).expect("redirect -> Some");

    assert_eq!(dc_pre, dc_red);

    // Populated fields.
    assert_eq!(dc_pre.reason, DenialReason::RedirectNotInAllowlist);
    assert_eq!(dc_pre.source.as_deref(), Some("oa-publisher"));
    assert_eq!(dc_pre.attempted.as_deref(), Some("attacker.test"));
    assert_eq!(
        dc_pre.expected,
        Some(vec!["*.springer.com".to_string(), "*.plos.org".to_string()]),
    );

    // Fields that stay unset for this denial.
    assert_eq!(dc_pre.hop_index, None);
    assert_eq!(dc_pre.cap, None);
    assert_eq!(dc_pre.actual, None);
}
/// Shared fixture for the `_139` tests.
///
/// Starts a mock server that answers every GET with a small Crossref-style
/// JSON body, sets `DOIGET_CROSSREF_BASE` to the server's URI, and builds a
/// `FetchContext` plus an `FsStore` rooted at `<tempdir>/papers`.
///
/// Returns `(server, ctx, store, store_root, tempdir)`. The environment
/// variable is NOT cleared here; callers remove it themselves.
async fn md139_harness() -> (
    wiremock::MockServer,
    FetchContext,
    crate::store::FsStore,
    Utf8PathBuf,
    tempfile::TempDir,
) {
    use crate::http::HttpClient;
    use crate::provenance::ProvenanceLog;
    use crate::rate_limiter::RateLimiter;
    use crate::store::FsStore;
    use crate::RateLimits;
    use std::sync::Arc;
    use wiremock::matchers::method;
    use wiremock::{Mock, MockServer, ResponseTemplate};

    let server = MockServer::start().await;
    let crossref_reply = ResponseTemplate::new(200).set_body_string(
        r#"{"status":"ok","message":{"title":["Example Paper"],"author":[{"given":"Ada","family":"Lovelace"}]}}"#,
    );
    Mock::given(method("GET"))
        .respond_with(crossref_reply)
        .mount(&server)
        .await;

    // Point the Crossref resolver at the mock server.
    std::env::set_var("DOIGET_CROSSREF_BASE", server.uri());

    let server_url = server.uri().parse::<url::Url>().expect("uri");
    let host = server_url.host_str().expect("host").to_string();

    let td = tempfile::TempDir::new().expect("tempdir");
    let base = Utf8Path::from_path(td.path()).expect("utf-8");
    let log_path = base.join("log.jsonl");
    let store_root = base.join("papers");

    // Both resolver source keys are allowed to talk to the mock host.
    let http = Arc::new(HttpClient::new_for_tests_allow_http_multi(&[
        ("crossref", &host),
        ("unpaywall", &host),
    ]));
    let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
    let log = Arc::new(
        ProvenanceLog::open(log_path, "01J0000000000000000000TEST".into())
            .expect("provenance log"),
    );
    let ctx = FetchContext {
        http,
        rate_limiter,
        log,
        session_id: "01J0000000000000000000TEST".into(),
        cache_root: None,
    };

    let store = FsStore::new(store_root.clone()).expect("fs store");
    (server, ctx, store, store_root, td)
}
fn metadata_dir_tomls(store_root: &Utf8Path) -> Vec<Utf8PathBuf> {
let md = store_root.join(".metadata");
match std::fs::read_dir(md.as_std_path()) {
Ok(rd) => rd
.filter_map(|e| e.ok())
.filter_map(|e| Utf8PathBuf::from_path_buf(e.path()).ok())
.filter(|p| p.extension() == Some("toml"))
.collect(),
Err(_) => Vec::new(),
}
}
/// `metadata_only_to_store` for a DOI must write exactly one
/// `.metadata/*.toml` whose contents reflect the Crossref response served
/// by `md139_harness`.
///
/// `md139_harness` sets `DOIGET_CROSSREF_BASE`; a drop guard removes it so
/// the variable is cleared even when an assertion or `expect` panics.
#[tokio::test]
#[serial_test::serial]
async fn metadata_only_to_store_writes_metadata_toml_139() {
    /// Removes `DOIGET_CROSSREF_BASE` when dropped, including during unwind.
    struct CrossrefBaseReset;
    impl Drop for CrossrefBaseReset {
        fn drop(&mut self) {
            std::env::remove_var("DOIGET_CROSSREF_BASE");
        }
    }
    // Declared before the harness call so it also covers a harness panic.
    let _reset = CrossrefBaseReset;

    let (_server, ctx, store, store_root, _td) = md139_harness().await;
    let profile = CapabilityProfile::from_env().expect("clean env");
    let ref_ = Ref::Doi(Doi("10.1234/example".to_string()));

    let outcome = metadata_only_to_store(&ref_, &profile, &ctx, &store)
        .await
        .expect("metadata_only_to_store ok");
    assert_eq!(outcome.source, "crossref");

    let tomls = metadata_dir_tomls(&store_root);
    assert_eq!(
        tomls.len(),
        1,
        "exactly one .metadata/*.toml must be written (MCP_TOOLS.md §11 SIDE EFFECT, #139); got {tomls:?}"
    );

    // Parse the written file back and check the persisted fields.
    let body = std::fs::read_to_string(&tomls[0]).expect("read metadata toml");
    let meta: crate::store::Metadata = toml::from_str(&body).expect("parse metadata toml");
    assert_eq!(meta.title, "Example Paper");
    assert_eq!(
        meta.doi.as_ref().map(|d| d.as_str()),
        Some("10.1234/example")
    );
    let ext = meta.doiget.expect("[doiget] table present");
    assert_eq!(ext.source, "crossref");
    assert_eq!(ext.size_bytes, 0, "metadata-only entry has no PDF");
}
/// Neither `resolve_only` nor `metadata_only` may leave a metadata TOML
/// under the store root.
///
/// `md139_harness` sets `DOIGET_CROSSREF_BASE`; a drop guard removes it so
/// the variable is cleared even when an assertion or `expect` panics.
#[tokio::test]
#[serial_test::serial]
async fn resolve_only_and_pure_metadata_only_write_nothing_139() {
    /// Removes `DOIGET_CROSSREF_BASE` when dropped, including during unwind.
    struct CrossrefBaseReset;
    impl Drop for CrossrefBaseReset {
        fn drop(&mut self) {
            std::env::remove_var("DOIGET_CROSSREF_BASE");
        }
    }
    // Declared before the harness call so it also covers a harness panic.
    let _reset = CrossrefBaseReset;

    let (_server, ctx, _store, store_root, _td) = md139_harness().await;
    let profile = CapabilityProfile::from_env().expect("clean env");
    let ref_ = Ref::Doi(Doi("10.1234/example".to_string()));

    let r = resolve_only(&ref_, &profile, &ctx)
        .await
        .expect("resolve_only ok");
    assert_eq!(r.source, "crossref");
    assert!(
        metadata_dir_tomls(&store_root).is_empty(),
        "resolve_only MUST NOT write a metadata TOML (docs/MCP_TOOLS.md §1; #139)"
    );

    let m = metadata_only(&ref_, &profile, &ctx)
        .await
        .expect("metadata_only ok");
    assert_eq!(m.source, "crossref");
    assert!(
        metadata_dir_tomls(&store_root).is_empty(),
        "pure metadata_only MUST NOT write to the store (#139)"
    );
}
/// `metadata_only_to_store` for an arXiv ref must write exactly one
/// `.metadata/*.toml` whose contents reflect the Atom feed served by the
/// mock server.
///
/// The test points `DOIGET_ARXIV_BASE` at the mock server; a drop guard
/// removes the variable so it is cleared even when an assertion or
/// `expect` panics.
#[tokio::test]
#[serial_test::serial]
async fn metadata_only_to_store_arxiv_writes_metadata_toml_139() {
    use crate::http::HttpClient;
    use crate::provenance::ProvenanceLog;
    use crate::rate_limiter::RateLimiter;
    use crate::store::FsStore;
    use crate::RateLimits;
    use std::sync::Arc;
    use wiremock::matchers::method;
    use wiremock::{Mock, MockServer, ResponseTemplate};

    /// Removes `DOIGET_ARXIV_BASE` when dropped, including during unwind.
    struct ArxivBaseReset;
    impl Drop for ArxivBaseReset {
        fn drop(&mut self) {
            std::env::remove_var("DOIGET_ARXIV_BASE");
        }
    }

    let atom = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<entry>
<id>http://arxiv.org/abs/2401.12345v1</id>
<published>2024-01-15T00:00:00Z</published>
<title>Example arXiv Paper Title</title>
<summary>Example abstract.</summary>
<author><name>Jane Doe</name></author>
<category term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
</entry>
</feed>"#;
    let server = MockServer::start().await;
    Mock::given(method("GET"))
        .respond_with(ResponseTemplate::new(200).set_body_string(atom))
        .mount(&server)
        .await;

    // Install the guard before setting the variable so every later failure
    // path clears it.
    let _reset = ArxivBaseReset;
    std::env::set_var("DOIGET_ARXIV_BASE", server.uri());

    let host = server
        .uri()
        .parse::<url::Url>()
        .expect("uri")
        .host_str()
        .expect("host")
        .to_string();
    let td = tempfile::TempDir::new().expect("tempdir");
    let base = Utf8Path::from_path(td.path()).expect("utf-8");
    let store_root = base.join("papers");
    let ctx = FetchContext {
        http: Arc::new(HttpClient::new_for_tests_allow_http("arxiv", &host)),
        rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
        log: Arc::new(
            ProvenanceLog::open(base.join("log.jsonl"), "01J0000000000000000000TEST".into())
                .expect("provenance log"),
        ),
        session_id: "01J0000000000000000000TEST".into(),
        cache_root: None,
    };
    let store = FsStore::new(store_root.clone()).expect("fs store");
    let profile = CapabilityProfile::from_env().expect("clean env");
    let ref_ = Ref::Arxiv(crate::ArxivId::parse("2401.12345").expect("arxiv id"));

    let outcome = metadata_only_to_store(&ref_, &profile, &ctx, &store)
        .await
        .expect("metadata_only_to_store (arxiv) ok");
    assert_eq!(outcome.source, "arxiv");

    let tomls = metadata_dir_tomls(&store_root);
    assert_eq!(
        tomls.len(),
        1,
        "arXiv metadata-only must write one TOML; got {tomls:?}"
    );

    // Parse the written file back and check the persisted fields.
    let meta: crate::store::Metadata =
        toml::from_str(&std::fs::read_to_string(&tomls[0]).expect("read")).expect("parse");
    assert_eq!(meta.title, "Example arXiv Paper Title");
    assert_eq!(
        meta.arxiv_id.as_ref().map(|a| a.as_str()),
        Some("2401.12345")
    );
    assert!(meta.doi.is_none(), "arXiv entry has no DOI");
    let ext = meta.doiget.expect("[doiget] table");
    assert_eq!(ext.source, "arxiv");
    assert_eq!(ext.license, "arxiv-default");
}
/// Table of inputs for `extract_metadata_title`: a plain string, a
/// one-element array, a missing key, a blank string, an empty array, an
/// array whose first entry is blank, and an array with only blank entries.
#[test]
fn extract_metadata_title_handles_string_array_missing_blank() {
    use serde_json::json;

    let cases = vec![
        (json!({"title": "Hello"}), Some("Hello")),
        (json!({"title": ["Real Title"]}), Some("Real Title")),
        (json!({"x": 1}), None),
        (json!({"title": " "}), None),
        (json!({"title": []}), None),
        (json!({"title": [" ", "Real Title"]}), Some("Real Title")),
        (json!({"title": [" ", ""]}), None),
    ];

    for (input, want) in &cases {
        let got = extract_metadata_title(input);
        assert_eq!(got.as_deref(), *want);
    }
}
/// Table of inputs for `extract_metadata_authors`: a string list under
/// `authors`, `author` objects with given+family, family only, or `name`,
/// `z_authors` objects, a missing key, and an empty list.
#[test]
fn extract_metadata_authors_handles_each_resolver_shape() {
    use serde_json::json;

    let cases: Vec<(serde_json::Value, Vec<&str>)> = vec![
        (
            json!({"authors": ["Jane Doe", "John Roe"]}),
            vec!["Jane Doe", "John Roe"],
        ),
        (
            json!({"author": [{"given": "Ada", "family": "Lovelace"}]}),
            vec!["Ada Lovelace"],
        ),
        (json!({"author": [{"family": "Onsager"}]}), vec!["Onsager"]),
        (json!({"author": [{"name": "K. Wilson"}]}), vec!["K. Wilson"]),
        (
            json!({"z_authors": [{"given": "L", "family": "Kadanoff"}]}),
            vec!["L Kadanoff"],
        ),
        (json!({"x": 1}), vec![]),
        (json!({"authors": []}), vec![]),
    ];

    for (input, want) in &cases {
        let got = extract_metadata_authors(input);
        assert_eq!(got, *want);
    }
}
}