Skip to main content

doiget_core/sources/
crossref.rs

1//! Crossref source — DOI metadata + OA URL discovery via `link[]` array.
2//!
3//! Spec: docs/SOURCES.md §4 (Crossref). No auth; polite-pool User-Agent
4//! contact email is REQUIRED — see [`CrossrefSource::new`].
5
6use async_trait::async_trait;
7use serde::Deserialize;
8use url::Url;
9
10use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
11use crate::source::{FetchContext, FetchError, FetchResult, Source};
12use crate::{CapabilityProfile, Ref};
13
/// Origin of the public Crossref REST API.
///
/// [`CrossrefSource::new`] parses this constant to obtain its base URL.
/// Callers that need a different origin (for example a wiremock server in
/// tests) go through [`CrossrefSource::with_base`] instead. The value is
/// fixed by `docs/SOURCES.md` §4.
const DEFAULT_BASE: &str = "https://api.crossref.org";
17
18/// Crossref [`Source`] impl — DOI → metadata; OA URL via `message.link[]`.
19///
20/// See `docs/SOURCES.md` §4 for the access policy (no auth, polite pool).
21#[derive(Clone, Debug)]
22pub struct CrossrefSource {
23    /// API base URL. Production constructor pins this to
24    /// `https://api.crossref.org`; the [`with_base`](Self::with_base)
25    /// test-only constructor lets wiremock substitute an `http://127.0.0.1:N`
26    /// origin.
27    base: Url,
28    /// Polite-pool contact email per `docs/SOURCES.md` §4 Crossref.
29    /// Concretely formatted into the `User-Agent` header by [`crate::http::HttpClient`].
30    /// (Phase 1: caller injects via [`CrossrefSource::new`]; CLI / config wiring
31    /// lands in a follow-up PR.)
32    #[allow(dead_code)]
33    contact_email: String,
34}
35
36impl CrossrefSource {
37    /// Production constructor: hard-codes `https://api.crossref.org` as the
38    /// base URL. The `contact_email` value is appended to the polite-pool
39    /// User-Agent (config plumbing arrives in a later PR — see
40    /// `docs/SOURCES.md` §4).
41    #[must_use]
42    pub fn new(contact_email: String) -> Self {
43        Self {
44            // The hard-coded constant is a known-valid URL; the `expect`
45            // here is the documented exception to the workspace
46            // `expect_used` lint (it can never fire in practice).
47            #[allow(clippy::expect_used)]
48            base: Url::parse(DEFAULT_BASE).expect("hard-coded base URL is valid"),
49            contact_email,
50        }
51    }
52
53    /// Construct with an arbitrary base URL.
54    ///
55    /// The orchestrator (`doiget-cli::commands::fetch`) uses this to honor
56    /// the `DOIGET_CROSSREF_BASE` env var, which lets integration tests point
57    /// the source at a wiremock origin without compile-time gates. Production
58    /// callers use [`CrossrefSource::new`].
59    pub fn with_base(base: Url, contact_email: String) -> Self {
60        Self {
61            base,
62            contact_email,
63        }
64    }
65
66    /// Build the `/works/{doi}` URL for the configured base. Returns
67    /// [`FetchError::SourceSchema`] if joining the path produces an invalid
68    /// URL (only possible if the base URL is malformed — should never happen
69    /// in production).
70    fn request_url(&self, doi: &crate::Doi) -> Result<Url, FetchError> {
71        // Crossref accepts the bare DOI (no `doi:` scheme). `Doi::as_str()`
72        // already returns it without the scheme. The `/` inside the suffix
73        // is URL-encoded by `reqwest` when the request is built; wiremock
74        // sees the decoded path on its `path()` matcher.
75        let path = format!("/works/{}", doi.as_str());
76        self.base.join(&path).map_err(|e| FetchError::SourceSchema {
77            hint: format!("crossref URL construction failed: {e}"),
78        })
79    }
80}
81
82#[async_trait]
83impl Source for CrossrefSource {
84    fn name(&self) -> &str {
85        "crossref"
86    }
87
88    fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
89        matches!(ref_, Ref::Doi(_))
90    }
91
92    async fn fetch(
93        &self,
94        ref_: &Ref,
95        _profile: &CapabilityProfile,
96        ctx: &FetchContext,
97    ) -> Result<FetchResult, FetchError> {
98        let doi = match ref_ {
99            Ref::Doi(d) => d,
100            Ref::Arxiv(_) => {
101                return Err(FetchError::NotEligible {
102                    source_key: "crossref".into(),
103                });
104            }
105        };
106
107        // Step 1: rate limiter (politeness — `docs/SOURCES.md` §6).
108        let _permit = ctx.rate_limiter.acquire(self.name()).await;
109
110        // Step 2: HTTP fetch. Body is JSON; the `PDF_MAX_BYTES` size cap in
111        // `HttpClient` applies. Crossref responses are well under 100 MB
112        // even for bibliographically rich DOIs.
113        let url = self.request_url(doi)?;
114        let (body, final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
115
116        // Step 3: parse the response envelope. Crossref wraps the work
117        // record in a top-level `{ "status": "ok", "message": { ... } }`
118        // envelope (per <https://api.crossref.org/swagger-ui/index.html>).
119        let envelope: CrossrefEnvelope =
120            serde_json::from_slice(&body).map_err(|e| FetchError::SourceSchema {
121                hint: format!("crossref returned non-JSON: {e}"),
122            })?;
123        if envelope.status != "ok" {
124            return Err(FetchError::SourceSchema {
125                hint: format!("crossref status = {}", envelope.status),
126            });
127        }
128
129        // Step 4: log the fetch event (`docs/PROVENANCE_LOG.md` §3).
130        // ADR-0021 §1 canonical-digest: promote the ref under the
131        // "crossref" resolver profile (no version — Crossref does not
132        // expose a per-call version token in Phase 1).
133        let canonical = ref_.promote(self.name(), None).digest_hex();
134        ctx.log.append(RowInput {
135            event: LogEvent::Fetch,
136            result: LogResult::Ok,
137            capability: Capability::Oa,
138            ref_: Some(doi.as_str()),
139            source: Some(self.name()),
140            error_code: None,
141            size_bytes: Some(body.len() as u64),
142            license: None,
143            store_path: None,
144            canonical_digest: Some(&canonical),
145        })?;
146
147        Ok(FetchResult {
148            source: self.name().to_string(),
149            license: "unknown".into(),
150            // Crossref is metadata; PDF retrieval is the job of Unpaywall /
151            // publisher sources (Phase 1+ sibling PRs).
152            pdf_bytes: None,
153            final_url: Some(final_url),
154            metadata_json: Some(envelope.message),
155        })
156    }
157}
158
159/// Top-level Crossref response envelope. Only `status` and `message` are
160/// load-bearing here; `message-type`, `message-version`, etc. are ignored.
161#[derive(Debug, Deserialize)]
162struct CrossrefEnvelope {
163    status: String,
164    message: serde_json::Value,
165}
166
167// ---------------------------------------------------------------------------
168// Tests
169// ---------------------------------------------------------------------------
170
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    use std::sync::Arc;

    use camino::Utf8PathBuf;
    use tempfile::TempDir;
    use wiremock::matchers::{method, path};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    use crate::http::HttpClient;
    use crate::provenance::ProvenanceLog;
    use crate::rate_limiter::RateLimiter;
    use crate::{ArxivId, CapabilityProfile, Doi, RateLimits, Ref};

    /// DOI used by every test that needs one.
    const EXAMPLE_DOI: &str = "10.1234/example";
    /// Request path the source is expected to hit for [`EXAMPLE_DOI`].
    const EXAMPLE_WORKS_PATH: &str = "/works/10.1234/example";
    /// Minimal well-formed Crossref envelope.
    const OK_ENVELOPE: &str = r#"{"status":"ok","message":{"title":["Example"]}}"#;
    /// File name of the provenance log inside the per-test tempdir.
    const LOG_FILE_NAME: &str = "test.jsonl";

    /// Build a `FetchContext` whose [`HttpClient`] allows the wiremock
    /// `http://` origin under the `crossref` source key, plus a
    /// tempdir-backed `ProvenanceLog`. Returns the tempdir so the caller
    /// keeps it alive for the duration of the test.
    fn build_test_context(wiremock_host: &str) -> (TempDir, FetchContext) {
        let td = TempDir::new().expect("tempdir");
        // Workspace lints ban `std::path::PathBuf`; convert via camino.
        let log_dir =
            Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
        let log_path = log_dir.join(LOG_FILE_NAME);

        // Use the test-only constructor that relaxes `https_only` for the
        // initial leg so wiremock (which serves over plain HTTP) can be
        // reached. Redirect closure still rejects http:// targets — see
        // `http.rs::build_client_allow_http`.
        let http = Arc::new(HttpClient::new_for_tests_allow_http(
            "crossref",
            wiremock_host,
        ));
        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
        let session_id = "01J0000000000000000000TEST".to_string();
        let log = Arc::new(
            ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
        );

        (
            td,
            FetchContext {
                http,
                rate_limiter,
                log,
                session_id,
            },
        )
    }

    /// Extract the host string of a wiremock server's URI.
    fn server_host(server: &MockServer) -> String {
        server
            .uri()
            .parse::<Url>()
            .expect("wiremock uri parses")
            .host_str()
            .expect("wiremock uri has host")
            .to_string()
    }

    /// Build a [`CrossrefSource`] pointing at the given wiremock URI.
    fn crossref_for(server: &MockServer) -> CrossrefSource {
        let base = server.uri().parse::<Url>().expect("wiremock uri parses");
        CrossrefSource::with_base(base, "test@example.org".to_string())
    }

    /// Mount a `GET /works/10.1234/example` mock that answers with
    /// `template`.
    async fn mount_example_works(server: &MockServer, template: ResponseTemplate) {
        Mock::given(method("GET"))
            .and(path(EXAMPLE_WORKS_PATH))
            .respond_with(template)
            .mount(server)
            .await;
    }

    /// The [`Ref`] for [`EXAMPLE_DOI`].
    fn example_doi_ref() -> Ref {
        Ref::Doi(Doi::parse(EXAMPLE_DOI).unwrap())
    }

    /// An arXiv [`Ref`], which Crossref must refuse.
    fn example_arxiv_ref() -> Ref {
        Ref::Arxiv(ArxivId::parse("2401.12345").unwrap())
    }

    /// Capability profile read from the (assumed clean) test environment.
    fn test_profile() -> CapabilityProfile {
        CapabilityProfile::from_env().expect("clean env")
    }

    #[test]
    fn crossref_can_serve_returns_true_for_doi() {
        let s = CrossrefSource::new("test@example.org".into());
        assert!(s.can_serve(&test_profile(), &example_doi_ref()));
    }

    #[test]
    fn crossref_can_serve_returns_false_for_arxiv() {
        let s = CrossrefSource::new("test@example.org".into());
        assert!(!s.can_serve(&test_profile(), &example_arxiv_ref()));
    }

    #[tokio::test]
    async fn crossref_fetch_returns_envelope_message() {
        let server = MockServer::start().await;
        mount_example_works(
            &server,
            ResponseTemplate::new(200).set_body_string(OK_ENVELOPE),
        )
        .await;

        let host = server_host(&server);
        let s = crossref_for(&server);
        let (_td, ctx) = build_test_context(&host);
        let profile = test_profile();
        let r = example_doi_ref();

        let res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");
        assert_eq!(res.source, "crossref");
        assert_eq!(
            res.metadata_json,
            Some(serde_json::json!({ "title": ["Example"] })),
        );
        assert!(res.pdf_bytes.is_none());
        assert!(res.final_url.is_some());
    }

    #[tokio::test]
    async fn crossref_fetch_with_arxiv_ref_errors_not_eligible() {
        // wiremock not needed: the arxiv branch short-circuits before any
        // outbound call. Construct the source with a dummy base, and pass
        // a dummy allowlist host since fetch never reaches the HTTP layer.
        let s = CrossrefSource::with_base(
            Url::parse("http://127.0.0.1:1/").unwrap(),
            "test@example.org".into(),
        );
        let (_td, ctx) = build_test_context("127.0.0.1");
        let profile = test_profile();
        let r = example_arxiv_ref();

        let err = s.fetch(&r, &profile, &ctx).await.expect_err("not eligible");
        match err {
            FetchError::NotEligible { source_key } => {
                assert_eq!(source_key, "crossref");
            }
            other => panic!("expected NotEligible, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn crossref_fetch_writes_log_row() {
        let server = MockServer::start().await;
        mount_example_works(
            &server,
            ResponseTemplate::new(200).set_body_string(OK_ENVELOPE),
        )
        .await;

        let host = server_host(&server);
        let s = crossref_for(&server);
        // The tempdir is read back below, so it gets a real (non-underscore)
        // name here.
        let (td, ctx) = build_test_context(&host);
        let profile = test_profile();
        let r = example_doi_ref();

        let _res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");

        // Reopen the log file as raw JSON Lines and assert the single row's
        // semantic fields. We deliberately don't reach into ProvenanceLog
        // internals — the public read path is "parse the JSONL by line".
        let log_path = td.path().join(LOG_FILE_NAME);
        let raw = std::fs::read_to_string(&log_path).expect("log file readable");
        let lines: Vec<&str> = raw.lines().filter(|l| !l.is_empty()).collect();
        assert_eq!(lines.len(), 1, "expected exactly one row, got {lines:?}");
        let row: serde_json::Value = serde_json::from_str(lines[0]).expect("row is valid JSON");
        assert_eq!(row["event"], "fetch");
        assert_eq!(row["result"], "ok");
        assert_eq!(row["source"], "crossref");
        assert_eq!(row["ref"], EXAMPLE_DOI);
    }

    #[tokio::test]
    async fn crossref_404_maps_to_http_error() {
        let server = MockServer::start().await;
        mount_example_works(&server, ResponseTemplate::new(404)).await;

        let host = server_host(&server);
        let s = crossref_for(&server);
        let (_td, ctx) = build_test_context(&host);
        let profile = test_profile();
        let r = example_doi_ref();

        let err = s.fetch(&r, &profile, &ctx).await.expect_err("404 errors");
        match err {
            FetchError::Http(_) => {}
            other => panic!("expected Http(_) on 404, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn crossref_non_ok_status_field_errors_source_schema() {
        let server = MockServer::start().await;
        mount_example_works(
            &server,
            ResponseTemplate::new(200).set_body_string(r#"{"status":"error","message":{}}"#),
        )
        .await;

        let host = server_host(&server);
        let s = crossref_for(&server);
        let (_td, ctx) = build_test_context(&host);
        let profile = test_profile();
        let r = example_doi_ref();

        let err = s
            .fetch(&r, &profile, &ctx)
            .await
            .expect_err("non-ok status errors");
        match err {
            FetchError::SourceSchema { hint } => {
                assert!(
                    hint.contains("status"),
                    "expected status mention in hint, got {hint}"
                );
            }
            other => panic!("expected SourceSchema, got {other:?}"),
        }
    }
}