1use async_trait::async_trait;
9use serde::Deserialize;
10use url::Url;
11
12use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
13use crate::source::{FetchContext, FetchError, FetchResult, Source};
14use crate::{CapabilityProfile, Ref};
15
/// Base URL that [`UnpaywallSource::new`] parses and uses for every request.
const DEFAULT_BASE: &str = "https://api.unpaywall.org/v2";
17
18#[derive(Clone, Debug)]
20pub struct UnpaywallSource {
21 base: Url,
22 contact_email: String,
23}
24
25impl UnpaywallSource {
26 pub fn new(contact_email: String) -> Self {
29 #[allow(clippy::expect_used)]
34 let base = Url::parse(DEFAULT_BASE).expect("hard-coded base URL is valid");
35 Self {
36 base,
37 contact_email,
38 }
39 }
40
41 pub fn with_base(base: Url, contact_email: String) -> Self {
48 Self {
49 base,
50 contact_email,
51 }
52 }
53
54 fn request_url(&self, doi: &crate::Doi) -> Result<Url, FetchError> {
55 let mut url = self.base.clone();
58 url.path_segments_mut()
61 .map_err(|()| FetchError::SourceSchema {
62 hint: "unpaywall base URL is cannot-be-a-base".into(),
63 })?
64 .push(doi.as_str()); url.query_pairs_mut()
66 .append_pair("email", &self.contact_email);
67 Ok(url)
68 }
69}
70
71#[async_trait]
72impl Source for UnpaywallSource {
73 fn name(&self) -> &str {
74 "unpaywall"
75 }
76
77 fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
78 matches!(ref_, Ref::Doi(_))
79 }
80
81 async fn fetch(
82 &self,
83 ref_: &Ref,
84 _profile: &CapabilityProfile,
85 ctx: &FetchContext,
86 ) -> Result<FetchResult, FetchError> {
87 let doi = match ref_ {
88 Ref::Doi(d) => d,
89 Ref::Arxiv(_) => {
90 return Err(FetchError::NotEligible {
91 source_key: "unpaywall".into(),
92 });
93 }
94 };
95
96 let _permit = ctx.rate_limiter.acquire(self.name()).await;
97
98 let url = self.request_url(doi)?;
99 let (body, final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
100
101 let work: UnpaywallWork =
102 serde_json::from_slice(&body).map_err(|e| FetchError::SourceSchema {
103 hint: format!("unpaywall returned non-JSON: {e}"),
104 })?;
105
106 let license = work
110 .best_oa_location
111 .as_ref()
112 .and_then(|loc| loc.license.clone())
113 .unwrap_or_else(|| "unknown".to_string());
114
115 let canonical = ref_.promote(self.name(), None).digest_hex();
118 ctx.log.append(RowInput {
119 event: LogEvent::Fetch,
120 result: LogResult::Ok,
121 capability: Capability::Oa,
122 ref_: Some(doi.as_str()),
123 source: Some(self.name()),
124 error_code: None,
125 size_bytes: Some(body.len() as u64),
126 license: Some(&license),
127 store_path: None,
128 canonical_digest: Some(&canonical),
129 })?;
130
131 Ok(FetchResult {
141 source: self.name().to_string(),
142 license,
143 pdf_bytes: None,
144 final_url: Some(final_url),
145 metadata_json: Some(serde_json::to_value(&work).unwrap_or(serde_json::Value::Null)),
146 })
147 }
148}
149
150#[derive(Debug, Deserialize, serde::Serialize)]
153struct UnpaywallWork {
154 doi: String,
155 is_oa: bool,
156 #[serde(default)]
162 oa_status: Option<String>,
163 #[serde(default)]
164 title: Option<String>,
165 #[serde(default)]
166 best_oa_location: Option<UnpaywallOaLocation>,
167 #[serde(default)]
168 oa_locations: Vec<UnpaywallOaLocation>,
169}
170
171#[derive(Debug, Deserialize, serde::Serialize, Clone)]
172struct UnpaywallOaLocation {
173 #[serde(default)]
174 url: Option<String>,
175 #[serde(default)]
176 url_for_pdf: Option<String>,
177 #[serde(default)]
178 license: Option<String>,
179}
180
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    //! Unit tests for [`UnpaywallSource`]. The HTTP-level tests talk to a
    //! local `wiremock` server instead of the real API.

    use super::*;

    use std::sync::Arc;

    use camino::Utf8PathBuf;
    use tempfile::TempDir;
    use wiremock::matchers::{method, path, query_param};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    use crate::http::HttpClient;
    use crate::provenance::{LogRow, ProvenanceLog};
    use crate::rate_limiter::RateLimiter;
    use crate::source::FetchContext;
    use crate::{ArxivId, CapabilityProfile, Doi, RateLimits};

    const TEST_EMAIL: &str = "alice@example.org";
    const TEST_DOI: &str = "10.1234/example";
    const TEST_DOI_ENCODED: &str = "10.1234%2Fexample";

    /// Builds a [`FetchContext`] whose HTTP client is created for `host` and
    /// whose provenance log is `test.jsonl` inside a fresh temporary
    /// directory. The directory guard is returned so the caller keeps it alive.
    fn build_test_context(host: &str) -> (TempDir, FetchContext) {
        let td = TempDir::new().expect("tempdir");
        let log_path = Utf8PathBuf::try_from(td.path().to_path_buf())
            .expect("temp dir path must be UTF-8")
            .join("test.jsonl");

        let http = Arc::new(HttpClient::new_for_tests_allow_http("unpaywall", host));
        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
        let session_id = "01J0000000000000000000TEST".to_string();
        let log = ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens");

        let ctx = FetchContext {
            http,
            rate_limiter,
            log: Arc::new(log),
            session_id,
            cache_root: None,
        };
        (td, ctx)
    }

    /// Extracts the host component of `uri`.
    fn host_of(uri: &str) -> String {
        let parsed = Url::parse(uri).expect("valid uri");
        parsed.host_str().expect("has host").to_owned()
    }

    /// Returns `<server_uri>/v2` as a parsed URL.
    fn base_of(server_uri: &str) -> Url {
        Url::parse(&format!("{server_uri}/v2")).expect("valid base")
    }

    /// A minimal successful response: open access with a `cc-by` license.
    fn ok_response_body() -> serde_json::Value {
        serde_json::json!({
            "doi": TEST_DOI,
            "is_oa": true,
            "title": "Example",
            "best_oa_location": { "url": "https://example.org/free.pdf", "license": "cc-by" }
        })
    }

    /// Starts a mock server that answers the expected GET for the test DOI
    /// (matching path and `email` query parameter) with `body` and status 200.
    async fn server_with_json(body: serde_json::Value) -> MockServer {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path(format!("/v2/{TEST_DOI_ENCODED}")))
            .and(query_param("email", TEST_EMAIL))
            .respond_with(ResponseTemplate::new(200).set_body_json(body))
            .mount(&server)
            .await;
        server
    }

    /// Everything a fetch test needs, wired to one mock server.
    ///
    /// `ctx` is declared before `td` so the context (and its provenance log)
    /// is dropped before the temporary directory that contains the log file.
    struct Harness {
        source: UnpaywallSource,
        profile: CapabilityProfile,
        ctx: FetchContext,
        td: TempDir,
    }

    impl Harness {
        /// Points a fresh context and source at `server`.
        fn for_server(server: &MockServer) -> Self {
            let server_uri = server.uri();
            let (td, ctx) = build_test_context(&host_of(&server_uri));
            Self {
                source: UnpaywallSource::with_base(base_of(&server_uri), TEST_EMAIL.to_string()),
                profile: CapabilityProfile::from_env().expect("profile"),
                ctx,
                td,
            }
        }

        /// Runs `fetch` for [`TEST_DOI`] and hands back the raw outcome.
        async fn fetch_test_doi(&self) -> Result<FetchResult, FetchError> {
            let doi_ref = Ref::Doi(Doi(TEST_DOI.to_string()));
            self.source.fetch(&doi_ref, &self.profile, &self.ctx).await
        }
    }

    #[test]
    fn unpaywall_can_serve_returns_true_for_doi() {
        let source = UnpaywallSource::new(TEST_EMAIL.to_string());
        let profile = CapabilityProfile::from_env().expect("profile");
        let doi_ref = Ref::Doi(Doi(TEST_DOI.to_string()));
        assert!(source.can_serve(&profile, &doi_ref));
    }

    #[test]
    fn unpaywall_can_serve_returns_false_for_arxiv() {
        let source = UnpaywallSource::new(TEST_EMAIL.to_string());
        let profile = CapabilityProfile::from_env().expect("profile");
        let arxiv_ref = Ref::Arxiv(ArxivId("2401.12345".to_string()));
        assert!(!source.can_serve(&profile, &arxiv_ref));
    }

    #[tokio::test]
    async fn unpaywall_fetch_returns_oa_metadata() {
        let server = server_with_json(ok_response_body()).await;
        let harness = Harness::for_server(&server);

        let fetched = harness.fetch_test_doi().await.expect("fetch ok");
        assert_eq!(fetched.source, "unpaywall");
        assert!(fetched.final_url.is_some());

        let metadata = fetched.metadata_json.expect("metadata present");
        let work: UnpaywallWork = serde_json::from_value(metadata).expect("metadata round-trips");
        assert!(work.is_oa);
        assert_eq!(work.doi, TEST_DOI);
    }

    #[tokio::test]
    async fn unpaywall_extracts_license_from_best_oa_location() {
        let server = server_with_json(ok_response_body()).await;
        let harness = Harness::for_server(&server);

        let fetched = harness.fetch_test_doi().await.expect("fetch ok");
        assert_eq!(fetched.license, "cc-by");
    }

    #[tokio::test]
    async fn unpaywall_surfaces_oa_status_in_metadata() {
        let server = server_with_json(serde_json::json!({
            "doi": TEST_DOI,
            "is_oa": true,
            "oa_status": "gold",
            "best_oa_location": { "url": "https://example.org/free.pdf", "license": "cc-by" }
        }))
        .await;
        let harness = Harness::for_server(&server);

        let fetched = harness.fetch_test_doi().await.expect("fetch ok");
        let metadata = fetched.metadata_json.expect("metadata present");
        assert_eq!(
            metadata.get("oa_status").and_then(|v| v.as_str()),
            Some("gold")
        );
    }

    #[tokio::test]
    async fn unpaywall_falls_back_to_unknown_license() {
        let server = server_with_json(serde_json::json!({
            "doi": TEST_DOI,
            "is_oa": true,
            "best_oa_location": {
                "url": "https://example.org/free.pdf",
                "license": serde_json::Value::Null
            }
        }))
        .await;
        let harness = Harness::for_server(&server);

        let fetched = harness.fetch_test_doi().await.expect("fetch ok");
        assert_eq!(fetched.license, "unknown");
    }

    #[tokio::test]
    async fn unpaywall_with_arxiv_ref_errors_not_eligible() {
        // No mock server: the request must be rejected before any HTTP call.
        let (_td, ctx) = build_test_context("127.0.0.1");
        let source = UnpaywallSource::new(TEST_EMAIL.to_string());
        let profile = CapabilityProfile::from_env().expect("profile");
        let arxiv_ref = Ref::Arxiv(ArxivId("2401.12345".to_string()));

        let err = source
            .fetch(&arxiv_ref, &profile, &ctx)
            .await
            .expect_err("arxiv must be ineligible");
        match err {
            FetchError::NotEligible { source_key } => assert_eq!(source_key, "unpaywall"),
            other => panic!("expected NotEligible, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn unpaywall_writes_log_row_with_license() {
        let server = server_with_json(ok_response_body()).await;
        let harness = Harness::for_server(&server);

        let _fetched = harness.fetch_test_doi().await.expect("fetch ok");

        // Same location `build_test_context` opened the provenance log at.
        let log_path = Utf8PathBuf::try_from(harness.td.path().to_path_buf())
            .expect("temp path utf-8")
            .join("test.jsonl");
        let raw = std::fs::read_to_string(&log_path).expect("read log");
        let rows: Vec<LogRow> = raw
            .lines()
            .filter(|line| !line.is_empty())
            .map(|line| serde_json::from_str::<LogRow>(line).expect("valid LogRow"))
            .collect();

        let fetch_rows: Vec<&LogRow> = rows
            .iter()
            .filter(|row| row.event == LogEvent::Fetch)
            .collect();
        assert_eq!(fetch_rows.len(), 1, "expected one Fetch row, got {rows:?}");

        let row = fetch_rows[0];
        assert_eq!(row.result, LogResult::Ok);
        assert_eq!(row.license.as_deref(), Some("cc-by"));
        assert_eq!(row.source.as_deref(), Some("unpaywall"));
        assert_eq!(row.ref_.as_deref(), Some(TEST_DOI));
    }

    #[test]
    fn unpaywall_email_is_in_query_string() {
        let source = UnpaywallSource::new(TEST_EMAIL.to_string());
        let doi = Doi(TEST_DOI.to_string());

        let url = source.request_url(&doi).expect("url builds");
        let pair = url
            .query_pairs()
            .find(|(key, _)| key == "email")
            .expect("email pair present");
        assert_eq!(pair.1, TEST_EMAIL, "decoded email must match: {pair:?}");
    }

    #[tokio::test]
    async fn unpaywall_404_maps_to_http_error() {
        // Mounted by hand: unlike the success mocks, this one does not
        // require the `email` query parameter.
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path(format!("/v2/{TEST_DOI_ENCODED}")))
            .respond_with(ResponseTemplate::new(404))
            .mount(&server)
            .await;
        let harness = Harness::for_server(&server);

        let err = harness
            .fetch_test_doi()
            .await
            .expect_err("404 must error");
        assert!(
            matches!(err, FetchError::Http(_)),
            "expected FetchError::Http, got {err:?}"
        );
    }
}