Skip to main content

doiget_core/sources/
unpaywall.rs

//! Unpaywall source — OA URL discovery + license metadata for a given DOI.
//!
//! Spec: docs/SOURCES.md §4 Unpaywall. Free public API; the polite pool
//! requires `email=<contact>` in the URL query. The `email` is set per
//! `[network] unpaywall_email` in `config.toml` (Phase 1: caller-injected
//! via `UnpaywallSource::new`).
7
8use async_trait::async_trait;
9use serde::Deserialize;
10use url::Url;
11
12use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
13use crate::source::{FetchContext, FetchError, FetchResult, Source};
14use crate::{CapabilityProfile, Ref};
15
/// Root of the production Unpaywall REST API (v2). `UnpaywallSource::new`
/// parses this into the base URL that the DOI path segment is appended to.
const DEFAULT_BASE: &str = "https://api.unpaywall.org/v2";
17
18/// Unpaywall Source impl.
19#[derive(Clone, Debug)]
20pub struct UnpaywallSource {
21    base: Url,
22    contact_email: String,
23}
24
25impl UnpaywallSource {
26    /// Production constructor. The `contact_email` is REQUIRED for the polite
27    /// pool — Unpaywall returns 403 without it.
28    pub fn new(contact_email: String) -> Self {
29        // `DEFAULT_BASE` is a compile-time const string with a valid HTTPS
30        // URL syntax; `Url::parse` on it cannot fail at runtime. The local
31        // `allow` is the documented exception to the workspace `expect_used`
32        // lint (see `rate_limiter.rs::acquire`).
33        #[allow(clippy::expect_used)]
34        let base = Url::parse(DEFAULT_BASE).expect("hard-coded base URL is valid");
35        Self {
36            base,
37            contact_email,
38        }
39    }
40
41    /// Construct with an arbitrary base URL.
42    ///
43    /// The orchestrator (`doiget-cli::commands::fetch`) uses this to honor
44    /// the `DOIGET_UNPAYWALL_BASE` env var, which lets integration tests
45    /// point the source at a wiremock origin without compile-time gates.
46    /// Production callers use [`UnpaywallSource::new`].
47    pub fn with_base(base: Url, contact_email: String) -> Self {
48        Self {
49            base,
50            contact_email,
51        }
52    }
53
54    fn request_url(&self, doi: &crate::Doi) -> Result<Url, FetchError> {
55        // The path layout is `/v2/<DOI>`. Unpaywall accepts the bare DOI
56        // (no `doi:` scheme); `Doi::as_str()` already strips it.
57        let mut url = self.base.clone();
58        // `path_segments_mut` properly URL-encodes each segment, including the
59        // forward slash inside the DOI suffix.
60        url.path_segments_mut()
61            .map_err(|()| FetchError::SourceSchema {
62                hint: "unpaywall base URL is cannot-be-a-base".into(),
63            })?
64            .push(doi.as_str()); // single-push so the `/` in the DOI is encoded
65        url.query_pairs_mut()
66            .append_pair("email", &self.contact_email);
67        Ok(url)
68    }
69}
70
71#[async_trait]
72impl Source for UnpaywallSource {
73    fn name(&self) -> &str {
74        "unpaywall"
75    }
76
77    fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
78        matches!(ref_, Ref::Doi(_))
79    }
80
81    async fn fetch(
82        &self,
83        ref_: &Ref,
84        _profile: &CapabilityProfile,
85        ctx: &FetchContext,
86    ) -> Result<FetchResult, FetchError> {
87        let doi = match ref_ {
88            Ref::Doi(d) => d,
89            Ref::Arxiv(_) => {
90                return Err(FetchError::NotEligible {
91                    source_key: "unpaywall".into(),
92                });
93            }
94        };
95
96        let _permit = ctx.rate_limiter.acquire(self.name()).await;
97
98        let url = self.request_url(doi)?;
99        let (body, final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
100
101        let work: UnpaywallWork =
102            serde_json::from_slice(&body).map_err(|e| FetchError::SourceSchema {
103                hint: format!("unpaywall returned non-JSON: {e}"),
104            })?;
105
106        // Resolve a license string from `best_oa_location.license`, falling back
107        // to "unknown" if absent. Spec: docs/STORE.md §2 — `license` is always
108        // a string (use "unknown" when not provided).
109        let license = work
110            .best_oa_location
111            .as_ref()
112            .and_then(|loc| loc.license.clone())
113            .unwrap_or_else(|| "unknown".to_string());
114
115        // ADR-0021 §1 canonical-digest under the "unpaywall" resolver
116        // profile. Distinct from a Crossref attempt for the same DOI.
117        let canonical = ref_.promote(self.name(), None).digest_hex();
118        ctx.log.append(RowInput {
119            event: LogEvent::Fetch,
120            result: LogResult::Ok,
121            capability: Capability::Oa,
122            ref_: Some(doi.as_str()),
123            source: Some(self.name()),
124            error_code: None,
125            size_bytes: Some(body.len() as u64),
126            license: Some(&license),
127            store_path: None,
128            canonical_digest: Some(&canonical),
129        })?;
130
131        // Note: this source returns metadata only; the actual PDF fetch
132        // from the discovered OA URL is the orchestrator's job
133        // (`crate::orchestrator::try_fetch_oa_pdf`, called from
134        // `fetch_paper_doi`). That leg runs the OA URL through the
135        // `oa-publisher` per-publisher allowlist BOTH as a pre-fetch host
136        // check (issue #145; `docs/REDIRECT_ALLOWLIST.md` §1 — applied to
137        // the metadata-discovered URL before the fetch is issued) and on
138        // every redirect hop via the per-source redirect closure in
139        // `crate::http`. See ARCHITECTURE.md §6.
140        Ok(FetchResult {
141            source: self.name().to_string(),
142            license,
143            pdf_bytes: None,
144            final_url: Some(final_url),
145            metadata_json: Some(serde_json::to_value(&work).unwrap_or(serde_json::Value::Null)),
146        })
147    }
148}
149
150/// Subset of the Unpaywall work record. We deserialize loosely — extra fields
151/// are ignored (no `deny_unknown_fields`) so future API additions don't break.
152#[derive(Debug, Deserialize, serde::Serialize)]
153struct UnpaywallWork {
154    doi: String,
155    is_oa: bool,
156    /// Unpaywall's OA classification: `gold` / `green` / `hybrid` /
157    /// `bronze` / `closed`. Surfaced to the caller as `oa_status` for OA
158    /// transparency (#281 item 4) so an agent can tell a paywalled
159    /// (`closed`) work from an openly-available one. Captured into
160    /// `metadata_json` (the orchestrator reads it from there).
161    #[serde(default)]
162    oa_status: Option<String>,
163    #[serde(default)]
164    title: Option<String>,
165    #[serde(default)]
166    best_oa_location: Option<UnpaywallOaLocation>,
167    #[serde(default)]
168    oa_locations: Vec<UnpaywallOaLocation>,
169}
170
171#[derive(Debug, Deserialize, serde::Serialize, Clone)]
172struct UnpaywallOaLocation {
173    #[serde(default)]
174    url: Option<String>,
175    #[serde(default)]
176    url_for_pdf: Option<String>,
177    #[serde(default)]
178    license: Option<String>,
179}
180
181// ---------------------------------------------------------------------------
182// Tests
183// ---------------------------------------------------------------------------
184
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    use std::sync::Arc;

    use camino::Utf8PathBuf;
    use tempfile::TempDir;
    use wiremock::matchers::{method, path, query_param};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    use crate::http::HttpClient;
    use crate::provenance::{LogRow, ProvenanceLog};
    use crate::rate_limiter::RateLimiter;
    use crate::source::FetchContext;
    use crate::{ArxivId, CapabilityProfile, Doi, RateLimits};

    const TEST_EMAIL: &str = "alice@example.org";
    const TEST_DOI: &str = "10.1234/example";
    /// `TEST_DOI` as it travels on the wire once
    /// `path_segments_mut().push(...)` has percent-encoded it. Wiremock's
    /// `path` matcher compares against the encoded request path, so the
    /// mocks match on this form.
    const TEST_DOI_ENCODED: &str = "10.1234%2Fexample";

    /// Assemble a `FetchContext` whose `HttpClient` tolerates a plain-HTTP
    /// initial leg to the wiremock origin. Redirect handling is untouched
    /// (HTTPS-only + allowlist) — only the *initial* connection is relaxed.
    /// The returned `TempDir` owns the directory holding the provenance log.
    fn build_test_context(host: &str) -> (TempDir, FetchContext) {
        let td = TempDir::new().expect("tempdir");
        let log_path = Utf8PathBuf::try_from(td.path().to_path_buf())
            .expect("temp dir path must be UTF-8")
            .join("test.jsonl");
        let session_id = String::from("01J0000000000000000000TEST");

        let ctx = FetchContext {
            http: Arc::new(HttpClient::new_for_tests_allow_http("unpaywall", host)),
            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
            log: Arc::new(
                ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
            ),
            session_id,
            cache_root: None,
        };
        (td, ctx)
    }

    /// Host component of `uri`.
    fn host_of(uri: &str) -> String {
        let parsed = Url::parse(uri).expect("valid uri");
        parsed.host_str().expect("has host").to_owned()
    }

    /// Base URL for a source aimed at the mock server.
    fn base_of(server_uri: &str) -> Url {
        // Wiremock roots at `/` while Unpaywall lives at `/v2/<DOI>`; putting
        // `/v2` into the base makes `request_url`'s single pushed DOI
        // segment land on the right path.
        Url::parse(&format!("{server_uri}/v2")).expect("valid base")
    }

    fn ok_response_body() -> serde_json::Value {
        serde_json::json!({
            "doi": TEST_DOI,
            "is_oa": true,
            "title": "Example",
            "best_oa_location": {
                "url": "https://example.org/free.pdf",
                "license": "cc-by"
            }
        })
    }

    /// Start a mock server that answers `GET /v2/<encoded TEST_DOI>` carrying
    /// `email=TEST_EMAIL` with `reply`.
    async fn start_server(reply: ResponseTemplate) -> MockServer {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path(format!("/v2/{}", TEST_DOI_ENCODED)))
            .and(query_param("email", TEST_EMAIL))
            .respond_with(reply)
            .mount(&server)
            .await;
        server
    }

    /// Fetch `TEST_DOI` once through a source pointed at `server`. The
    /// tempdir guard and the context are handed back so the caller keeps
    /// them alive for the rest of the test.
    async fn fetch_test_doi(
        server: &MockServer,
    ) -> (TempDir, FetchContext, Result<FetchResult, FetchError>) {
        let origin = server.uri();
        let (td, ctx) = build_test_context(&host_of(&origin));
        let source = UnpaywallSource::with_base(base_of(&origin), TEST_EMAIL.to_string());
        let profile = CapabilityProfile::from_env().expect("profile");
        let doi_ref = Ref::Doi(Doi(TEST_DOI.to_string()));

        let outcome = source.fetch(&doi_ref, &profile, &ctx).await;
        (td, ctx, outcome)
    }

    #[test]
    fn unpaywall_can_serve_returns_true_for_doi() {
        let source = UnpaywallSource::new(TEST_EMAIL.to_string());
        let profile = CapabilityProfile::from_env().expect("profile");
        let doi_ref = Ref::Doi(Doi(TEST_DOI.to_string()));
        assert!(source.can_serve(&profile, &doi_ref));
    }

    #[test]
    fn unpaywall_can_serve_returns_false_for_arxiv() {
        let source = UnpaywallSource::new(TEST_EMAIL.to_string());
        let profile = CapabilityProfile::from_env().expect("profile");
        let arxiv_ref = Ref::Arxiv(ArxivId("2401.12345".to_string()));
        assert!(!source.can_serve(&profile, &arxiv_ref));
    }

    #[tokio::test]
    async fn unpaywall_fetch_returns_oa_metadata() {
        let server =
            start_server(ResponseTemplate::new(200).set_body_json(ok_response_body())).await;
        let (_td, _ctx, outcome) = fetch_test_doi(&server).await;

        let res = outcome.expect("fetch ok");
        assert_eq!(res.source, "unpaywall");
        assert!(res.final_url.is_some());
        let meta = res.metadata_json.expect("metadata present");
        let parsed: UnpaywallWork = serde_json::from_value(meta).expect("metadata round-trips");
        assert!(parsed.is_oa);
        assert_eq!(parsed.doi, TEST_DOI);
    }

    #[tokio::test]
    async fn unpaywall_extracts_license_from_best_oa_location() {
        let server =
            start_server(ResponseTemplate::new(200).set_body_json(ok_response_body())).await;
        let (_td, _ctx, outcome) = fetch_test_doi(&server).await;

        let res = outcome.expect("fetch ok");
        assert_eq!(res.license, "cc-by");
    }

    #[tokio::test]
    async fn unpaywall_surfaces_oa_status_in_metadata() {
        // OA transparency (#281 item 4): the work's `oa_status` must
        // round-trip into `metadata_json` so the orchestrator can surface it.
        let body = serde_json::json!({
            "doi": TEST_DOI,
            "is_oa": true,
            "oa_status": "gold",
            "best_oa_location": { "url": "https://example.org/free.pdf", "license": "cc-by" }
        });
        let server = start_server(ResponseTemplate::new(200).set_body_json(body)).await;
        let (_td, _ctx, outcome) = fetch_test_doi(&server).await;

        let res = outcome.expect("fetch ok");
        let meta = res.metadata_json.expect("metadata present");
        assert_eq!(meta.get("oa_status").and_then(|v| v.as_str()), Some("gold"));
    }

    #[tokio::test]
    async fn unpaywall_falls_back_to_unknown_license() {
        let body = serde_json::json!({
            "doi": TEST_DOI,
            "is_oa": true,
            "best_oa_location": {
                "url": "https://example.org/free.pdf",
                "license": serde_json::Value::Null
            }
        });
        let server = start_server(ResponseTemplate::new(200).set_body_json(body)).await;
        let (_td, _ctx, outcome) = fetch_test_doi(&server).await;

        let res = outcome.expect("fetch ok");
        assert_eq!(res.license, "unknown");
    }

    #[tokio::test]
    async fn unpaywall_with_arxiv_ref_errors_not_eligible() {
        // No mock is mounted: the ref-kind gate fires first, so the network
        // must never be reached.
        let (_td, ctx) = build_test_context("127.0.0.1");
        let source = UnpaywallSource::new(TEST_EMAIL.to_string());
        let profile = CapabilityProfile::from_env().expect("profile");
        let arxiv_ref = Ref::Arxiv(ArxivId("2401.12345".to_string()));

        let err = source
            .fetch(&arxiv_ref, &profile, &ctx)
            .await
            .expect_err("arxiv must be ineligible");
        match err {
            FetchError::NotEligible { source_key } => {
                assert_eq!(source_key, "unpaywall");
            }
            other => panic!("expected NotEligible, got {:?}", other),
        }
    }

    #[tokio::test]
    async fn unpaywall_writes_log_row_with_license() {
        let server =
            start_server(ResponseTemplate::new(200).set_body_json(ok_response_body())).await;
        let (td, _ctx, outcome) = fetch_test_doi(&server).await;
        let _res = outcome.expect("fetch ok");

        // Parse every JSON-Lines row of the log file, then assert that the
        // Fetch row is present with license="cc-by".
        let log_path = Utf8PathBuf::try_from(td.path().to_path_buf())
            .expect("temp path utf-8")
            .join("test.jsonl");
        let raw = std::fs::read_to_string(&log_path).expect("read log");
        let mut rows: Vec<LogRow> = Vec::new();
        for line in raw.lines() {
            if line.is_empty() {
                continue;
            }
            rows.push(serde_json::from_str::<LogRow>(line).expect("valid LogRow"));
        }

        let fetch_rows: Vec<&LogRow> = rows
            .iter()
            .filter(|row| row.event == LogEvent::Fetch)
            .collect();
        assert_eq!(
            fetch_rows.len(),
            1,
            "expected one Fetch row, got {:?}",
            rows
        );
        let row = fetch_rows[0];
        assert_eq!(row.result, LogResult::Ok);
        assert_eq!(row.license.as_deref(), Some("cc-by"));
        assert_eq!(row.source.as_deref(), Some("unpaywall"));
        assert_eq!(row.ref_.as_deref(), Some(TEST_DOI));
    }

    #[test]
    fn unpaywall_email_is_in_query_string() {
        // White-box: call `request_url` directly and check that the email is
        // serialized into the query. The wiremock-driven tests cover the same
        // property through their `query_param("email", ...)` matcher; this
        // one pins the contract without booting an HTTP server.
        // `query_pairs_mut().append_pair` percent-encodes the `@`, hence the
        // comparison against the decoded form from `query_pairs()`.
        let source = UnpaywallSource::new(TEST_EMAIL.to_string());
        let url = source
            .request_url(&Doi(TEST_DOI.to_string()))
            .expect("url builds");
        let mut pairs = url.query_pairs();
        let pair = pairs
            .find(|(key, _)| key == "email")
            .expect("email pair present");
        assert_eq!(pair.1, TEST_EMAIL, "decoded email must match: {:?}", pair);
    }

    #[tokio::test]
    async fn unpaywall_404_maps_to_http_error() {
        // Mounted inline: unlike `start_server`, this route does not require
        // the `email` query parameter.
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path(format!("/v2/{}", TEST_DOI_ENCODED)))
            .respond_with(ResponseTemplate::new(404))
            .mount(&server)
            .await;

        let (_td, _ctx, outcome) = fetch_test_doi(&server).await;
        let err = outcome.expect_err("404 must error");
        assert!(
            matches!(err, FetchError::Http(_)),
            "expected FetchError::Http, got {:?}",
            err
        );
    }
}