use async_trait::async_trait;
use serde::Deserialize;
use url::Url;
use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
use crate::source::{FetchContext, FetchError, FetchResult, Source};
use crate::{CapabilityProfile, Ref};
/// Base URL that `UnpaywallSource::new` parses and uses as the request root;
/// `request_url` appends the DOI to it as one extra path segment.
const DEFAULT_BASE: &str = "https://api.unpaywall.org/v2";
/// A `Source` implementation that looks DOIs up through the Unpaywall HTTP API.
///
/// Each request URL is built from `base` plus the DOI, with `contact_email`
/// attached as the `email` query parameter.
#[derive(Clone, Debug)]
pub struct UnpaywallSource {
/// API root; the DOI is appended to this as a path segment.
base: Url,
/// Value sent as the `email` query parameter on every request.
contact_email: String,
}
impl UnpaywallSource {
    /// Creates a source that talks to the default API root (`DEFAULT_BASE`).
    ///
    /// `contact_email` is sent as the `email` query parameter on every request.
    pub fn new(contact_email: String) -> Self {
        // The base is a compile-time literal, so a parse failure here is a
        // programming error rather than a runtime condition.
        #[allow(clippy::expect_used)]
        let base = Url::parse(DEFAULT_BASE).expect("hard-coded base URL is valid");
        Self::with_base(base, contact_email)
    }

    /// Creates a source that talks to a caller-chosen API root.
    ///
    /// `base` may be given with or without a trailing slash; see
    /// `request_url` for how the DOI is appended.
    pub fn with_base(base: Url, contact_email: String) -> Self {
        Self {
            base,
            contact_email,
        }
    }

    /// Builds the lookup URL for `doi`: `<base>/<doi>?email=<contact_email>`.
    ///
    /// The DOI is pushed as a single path segment, so any `/` inside it is
    /// percent-encoded instead of creating extra segments.
    ///
    /// # Errors
    ///
    /// Returns `FetchError::SourceSchema` when the base URL is a
    /// cannot-be-a-base URL and therefore has no path segments to extend.
    fn request_url(&self, doi: &crate::Doi) -> Result<Url, FetchError> {
        let mut url = self.base.clone();
        url.path_segments_mut()
            .map_err(|()| FetchError::SourceSchema {
                hint: "unpaywall base URL is cannot-be-a-base".into(),
            })?
            // A base such as `…/v2/` ends in an empty segment; drop it first so
            // the result is `…/v2/<doi>` rather than `…/v2//<doi>`.
            .pop_if_empty()
            .push(doi.as_str());
        url.query_pairs_mut()
            .append_pair("email", &self.contact_email);
        Ok(url)
    }
}
#[async_trait]
impl Source for UnpaywallSource {
    /// Key under which this source identifies itself to the rate limiter, the
    /// HTTP client, the provenance log and in `FetchResult::source`.
    fn name(&self) -> &str {
        "unpaywall"
    }

    /// Only DOI references are eligible; the capability profile is not consulted.
    fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
        match ref_ {
            Ref::Doi(_) => true,
            Ref::Arxiv(_) => false,
        }
    }

    /// Looks `ref_` up and returns the parsed record as metadata.
    ///
    /// No PDF bytes are returned: `pdf_bytes` is always `None`. The license is
    /// taken from `best_oa_location.license`, or `"unknown"` when either is
    /// absent. One `LogEvent::Fetch` row is appended to the provenance log on
    /// success.
    ///
    /// # Errors
    ///
    /// * `FetchError::NotEligible` when `ref_` is not a DOI.
    /// * `FetchError::SourceSchema` when the request URL cannot be built or
    ///   the response body does not deserialize into `UnpaywallWork`.
    /// * Whatever the HTTP client or the provenance log report, propagated
    ///   unchanged through `?`.
    async fn fetch(
        &self,
        ref_: &Ref,
        _profile: &CapabilityProfile,
        ctx: &FetchContext,
    ) -> Result<FetchResult, FetchError> {
        let source_name = self.name();

        let doi = match ref_ {
            Ref::Doi(doi) => doi,
            Ref::Arxiv(_) => {
                return Err(FetchError::NotEligible {
                    source_key: "unpaywall".into(),
                })
            }
        };

        // Hold the permit for the rest of the call so the request itself is
        // covered by the rate limit.
        let _permit = ctx.rate_limiter.acquire(source_name).await;

        let endpoint = self.request_url(doi)?;
        let (body, final_url) = ctx.http.fetch_bytes(source_name, endpoint).await?;

        let work: UnpaywallWork = match serde_json::from_slice(&body) {
            Ok(parsed) => parsed,
            Err(e) => {
                return Err(FetchError::SourceSchema {
                    hint: format!("unpaywall returned non-JSON: {e}"),
                })
            }
        };

        let license = work
            .best_oa_location
            .as_ref()
            .and_then(|loc| loc.license.as_deref())
            .unwrap_or("unknown")
            .to_string();

        let canonical = ref_.promote(source_name, None).digest_hex();

        let row = RowInput {
            event: LogEvent::Fetch,
            result: LogResult::Ok,
            capability: Capability::Oa,
            ref_: Some(doi.as_str()),
            source: Some(source_name),
            error_code: None,
            size_bytes: Some(body.len() as u64),
            license: Some(&license),
            store_path: None,
            canonical_digest: Some(&canonical),
        };
        ctx.log.append(row)?;

        // Serialization failure degrades to JSON null instead of failing the fetch.
        let metadata = serde_json::to_value(&work).unwrap_or(serde_json::Value::Null);

        Ok(FetchResult {
            source: source_name.to_string(),
            license,
            pdf_bytes: None,
            final_url: Some(final_url),
            metadata_json: Some(metadata),
        })
    }
}
/// The fields of an Unpaywall response body that this module reads.
///
/// `fetch` deserializes the response into this type and re-serializes it as
/// `FetchResult::metadata_json`. `doi` and `is_oa` carry no serde default, so
/// they must be present; the remaining fields fall back to their defaults
/// when missing.
#[derive(Debug, Deserialize, serde::Serialize)]
struct UnpaywallWork {
/// DOI as echoed back in the response.
doi: String,
/// Open-access flag reported by the service.
is_oa: bool,
/// Title, when the response carries one.
#[serde(default)]
title: Option<String>,
/// Location whose `license` `fetch` reports; `None` yields the "unknown" license.
#[serde(default)]
best_oa_location: Option<UnpaywallOaLocation>,
/// All reported locations; empty when the field is missing.
#[serde(default)]
oa_locations: Vec<UnpaywallOaLocation>,
}
/// One location entry inside an `UnpaywallWork`; every field is optional.
#[derive(Debug, Deserialize, serde::Serialize, Clone)]
struct UnpaywallOaLocation {
/// The entry's `url` value, if any.
#[serde(default)]
url: Option<String>,
/// The entry's `url_for_pdf` value, if any.
#[serde(default)]
url_for_pdf: Option<String>,
/// License string for this location; `fetch` substitutes "unknown" when absent.
#[serde(default)]
license: Option<String>,
}
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    //! Tests for `UnpaywallSource`, run against a local wiremock server.
    //!
    //! The mock setup and the "build context, build source, fetch the test
    //! DOI" sequence live in `mount_ok` and `fetch_test_doi` so that each test
    //! only states what is specific to it.

    use super::*;

    use std::sync::Arc;

    use camino::Utf8PathBuf;
    use tempfile::TempDir;
    use wiremock::matchers::{method, path, query_param};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    use crate::http::HttpClient;
    use crate::provenance::{LogRow, ProvenanceLog};
    use crate::rate_limiter::RateLimiter;
    use crate::source::FetchContext;
    use crate::{ArxivId, CapabilityProfile, Doi, RateLimits};

    const TEST_EMAIL: &str = "alice@example.org";
    const TEST_DOI: &str = "10.1234/example";
    /// `TEST_DOI` as it appears in the request path: the `/` is percent-encoded
    /// because the DOI is pushed as a single path segment.
    const TEST_DOI_ENCODED: &str = "10.1234%2Fexample";

    /// Location of the provenance log inside `td`; shared by the context
    /// builder and the test that reads the log back, so the two cannot drift.
    fn log_path_in(td: &TempDir) -> Utf8PathBuf {
        Utf8PathBuf::try_from(td.path().to_path_buf())
            .expect("temp dir path must be UTF-8")
            .join("test.jsonl")
    }

    /// Builds a `FetchContext` whose HTTP client may reach `host` over plain
    /// HTTP and whose provenance log lives in a fresh temp dir.
    ///
    /// The `TempDir` is returned so the caller keeps the directory alive.
    fn build_test_context(host: &str) -> (TempDir, FetchContext) {
        let td = TempDir::new().expect("tempdir");
        let log_path = log_path_in(&td);
        let http = Arc::new(HttpClient::new_for_tests_allow_http("unpaywall", host));
        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
        let session_id = "01J0000000000000000000TEST".to_string();
        let log = Arc::new(
            ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
        );
        (
            td,
            FetchContext {
                http,
                rate_limiter,
                log,
                session_id,
            },
        )
    }

    /// Extracts the host component of `uri`.
    fn host_of(uri: &str) -> String {
        uri.parse::<Url>()
            .expect("valid uri")
            .host_str()
            .expect("has host")
            .to_string()
    }

    /// API base (`<server>/v2`) pointing at the mock server.
    fn base_of(server_uri: &str) -> Url {
        format!("{}/v2", server_uri).parse().expect("valid base")
    }

    /// A successful response with a `cc-by` best OA location.
    fn ok_response_body() -> serde_json::Value {
        serde_json::json!({
            "doi": TEST_DOI,
            "is_oa": true,
            "title": "Example",
            "best_oa_location": {
                "url": "https://example.org/free.pdf",
                "license": "cc-by"
            }
        })
    }

    /// Mounts `GET /v2/<encoded DOI>?email=<TEST_EMAIL>` answering 200 with `body`.
    async fn mount_ok(server: &MockServer, body: serde_json::Value) {
        Mock::given(method("GET"))
            .and(path(format!("/v2/{}", TEST_DOI_ENCODED)))
            .and(query_param("email", TEST_EMAIL))
            .respond_with(ResponseTemplate::new(200).set_body_json(body))
            .mount(server)
            .await;
    }

    /// Fetches `TEST_DOI` from a source pointed at `server`.
    ///
    /// Returns the temp dir holding the provenance log together with the raw
    /// fetch outcome, so callers can assert on success or failure.
    async fn fetch_test_doi(server: &MockServer) -> (TempDir, Result<FetchResult, FetchError>) {
        let server_uri = server.uri();
        let (td, ctx) = build_test_context(&host_of(&server_uri));
        let source = UnpaywallSource::with_base(base_of(&server_uri), TEST_EMAIL.to_string());
        let profile = CapabilityProfile::from_env().expect("profile");
        let doi_ref = Ref::Doi(Doi(TEST_DOI.to_string()));
        let outcome = source.fetch(&doi_ref, &profile, &ctx).await;
        (td, outcome)
    }

    #[test]
    fn unpaywall_can_serve_returns_true_for_doi() {
        let s = UnpaywallSource::new(TEST_EMAIL.to_string());
        let profile = CapabilityProfile::from_env().expect("profile");
        let r = Ref::Doi(Doi(TEST_DOI.to_string()));
        assert!(s.can_serve(&profile, &r));
    }

    #[test]
    fn unpaywall_can_serve_returns_false_for_arxiv() {
        let s = UnpaywallSource::new(TEST_EMAIL.to_string());
        let profile = CapabilityProfile::from_env().expect("profile");
        let r = Ref::Arxiv(ArxivId("2401.12345".to_string()));
        assert!(!s.can_serve(&profile, &r));
    }

    #[tokio::test]
    async fn unpaywall_fetch_returns_oa_metadata() {
        let server = MockServer::start().await;
        mount_ok(&server, ok_response_body()).await;

        let (_td, outcome) = fetch_test_doi(&server).await;
        let res = outcome.expect("fetch ok");

        assert_eq!(res.source, "unpaywall");
        assert!(res.final_url.is_some());
        let meta = res.metadata_json.expect("metadata present");
        let parsed: UnpaywallWork = serde_json::from_value(meta).expect("metadata round-trips");
        assert!(parsed.is_oa);
        assert_eq!(parsed.doi, TEST_DOI);
    }

    #[tokio::test]
    async fn unpaywall_extracts_license_from_best_oa_location() {
        let server = MockServer::start().await;
        mount_ok(&server, ok_response_body()).await;

        let (_td, outcome) = fetch_test_doi(&server).await;
        let res = outcome.expect("fetch ok");

        assert_eq!(res.license, "cc-by");
    }

    #[tokio::test]
    async fn unpaywall_falls_back_to_unknown_license() {
        let body = serde_json::json!({
            "doi": TEST_DOI,
            "is_oa": true,
            "best_oa_location": {
                "url": "https://example.org/free.pdf",
                "license": serde_json::Value::Null
            }
        });
        let server = MockServer::start().await;
        mount_ok(&server, body).await;

        let (_td, outcome) = fetch_test_doi(&server).await;
        let res = outcome.expect("fetch ok");

        assert_eq!(res.license, "unknown");
    }

    #[tokio::test]
    async fn unpaywall_with_arxiv_ref_errors_not_eligible() {
        // No mock server: an ineligible ref must be rejected before any request.
        let (_td, ctx) = build_test_context("127.0.0.1");
        let s = UnpaywallSource::new(TEST_EMAIL.to_string());
        let profile = CapabilityProfile::from_env().expect("profile");
        let r = Ref::Arxiv(ArxivId("2401.12345".to_string()));
        let err = s
            .fetch(&r, &profile, &ctx)
            .await
            .expect_err("arxiv must be ineligible");
        match err {
            FetchError::NotEligible { source_key } => {
                assert_eq!(source_key, "unpaywall");
            }
            other => panic!("expected NotEligible, got {:?}", other),
        }
    }

    #[tokio::test]
    async fn unpaywall_writes_log_row_with_license() {
        let server = MockServer::start().await;
        mount_ok(&server, ok_response_body()).await;

        let (td, outcome) = fetch_test_doi(&server).await;
        let _res = outcome.expect("fetch ok");

        let raw = std::fs::read_to_string(log_path_in(&td)).expect("read log");
        let rows: Vec<LogRow> = raw
            .lines()
            .filter(|l| !l.is_empty())
            .map(|l| serde_json::from_str::<LogRow>(l).expect("valid LogRow"))
            .collect();
        let fetch_rows: Vec<&LogRow> = rows.iter().filter(|r| r.event == LogEvent::Fetch).collect();
        assert_eq!(
            fetch_rows.len(),
            1,
            "expected one Fetch row, got {:?}",
            rows
        );
        let row = fetch_rows[0];
        assert_eq!(row.result, LogResult::Ok);
        assert_eq!(row.license.as_deref(), Some("cc-by"));
        assert_eq!(row.source.as_deref(), Some("unpaywall"));
        assert_eq!(row.ref_.as_deref(), Some(TEST_DOI));
    }

    #[test]
    fn unpaywall_email_is_in_query_string() {
        let s = UnpaywallSource::new(TEST_EMAIL.to_string());
        let doi = Doi(TEST_DOI.to_string());
        let url = s.request_url(&doi).expect("url builds");
        let pair = url
            .query_pairs()
            .find(|(k, _)| k == "email")
            .expect("email pair present");
        assert_eq!(pair.1, TEST_EMAIL, "decoded email must match: {:?}", pair);
    }

    #[tokio::test]
    async fn unpaywall_404_maps_to_http_error() {
        let server = MockServer::start().await;
        // Mounted by hand: this mock answers 404 and does not constrain the query.
        Mock::given(method("GET"))
            .and(path(format!("/v2/{}", TEST_DOI_ENCODED)))
            .respond_with(ResponseTemplate::new(404))
            .mount(&server)
            .await;

        let (_td, outcome) = fetch_test_doi(&server).await;
        let err = outcome.expect_err("404 must error");

        match err {
            FetchError::Http(_) => {}
            other => panic!("expected FetchError::Http, got {:?}", other),
        }
    }
}