Skip to main content

doiget_core/sources/
crossref.rs

1//! Crossref source — DOI metadata + OA URL discovery via `link[]` array.
2//!
3//! Spec: docs/SOURCES.md §4 (Crossref). No auth; polite-pool User-Agent
4//! contact email is REQUIRED — see [`CrossrefSource::new`].
5
6use std::collections::HashSet;
7
8use async_trait::async_trait;
9use serde::Deserialize;
10use url::Url;
11
12use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
13use crate::source::{FetchContext, FetchError, FetchResult, Source};
14use crate::{CapabilityProfile, Ref};
15
/// Origin of the production Crossref REST API.
///
/// Hard-coded per `docs/SOURCES.md` §4 and parsed by [`CrossrefSource::new`];
/// tests point the source at a wiremock origin through
/// [`CrossrefSource::with_base`] instead.
const DEFAULT_BASE: &str = "https://api.crossref.org";
19
/// Inclusive lower bound on the token-overlap score a search hit needs in
/// order to be returned by [`CrossrefSource::resolve_citation`].
///
/// The score is the fraction of distinct query tokens that also occur in the
/// candidate's text, so `0.5` means at least half of them must match.
const MIN_CITATION_SCORE: f64 = 0.5;
23
24/// Crossref [`Source`] impl — DOI → metadata; OA URL via `message.link[]`.
25///
26/// See `docs/SOURCES.md` §4 for the access policy (no auth, polite pool).
27#[derive(Clone, Debug)]
28pub struct CrossrefSource {
29    /// API base URL. Production constructor pins this to
30    /// `https://api.crossref.org`; the [`with_base`](Self::with_base)
31    /// test-only constructor lets wiremock substitute an `http://127.0.0.1:N`
32    /// origin.
33    base: Url,
34    /// Polite-pool contact email per `docs/SOURCES.md` §4 Crossref.
35    /// Concretely formatted into the `User-Agent` header by [`crate::http::HttpClient`].
36    /// (Phase 1: caller injects via [`CrossrefSource::new`]; CLI / config wiring
37    /// lands in a follow-up PR.)
38    #[allow(dead_code)]
39    contact_email: String,
40}
41
42impl CrossrefSource {
43    /// Production constructor: hard-codes `https://api.crossref.org` as the
44    /// base URL. The `contact_email` value is appended to the polite-pool
45    /// User-Agent (config plumbing arrives in a later PR — see
46    /// `docs/SOURCES.md` §4).
47    #[must_use]
48    pub fn new(contact_email: String) -> Self {
49        Self {
50            // The hard-coded constant is a known-valid URL; the `expect`
51            // here is the documented exception to the workspace
52            // `expect_used` lint (it can never fire in practice).
53            #[allow(clippy::expect_used)]
54            base: Url::parse(DEFAULT_BASE).expect("hard-coded base URL is valid"),
55            contact_email,
56        }
57    }
58
59    /// Construct with an arbitrary base URL.
60    ///
61    /// The orchestrator (`doiget-cli::commands::fetch`) uses this to honor
62    /// the `DOIGET_CROSSREF_BASE` env var, which lets integration tests point
63    /// the source at a wiremock origin without compile-time gates. Production
64    /// callers use [`CrossrefSource::new`].
65    pub fn with_base(base: Url, contact_email: String) -> Self {
66        Self {
67            base,
68            contact_email,
69        }
70    }
71
72    /// Build the `/works/{doi}` URL for the configured base. Returns
73    /// [`FetchError::SourceSchema`] if joining the path produces an invalid
74    /// URL (only possible if the base URL is malformed — should never happen
75    /// in production).
76    fn request_url(&self, doi: &crate::Doi) -> Result<Url, FetchError> {
77        // Crossref accepts the bare DOI (no `doi:` scheme). `Doi::as_str()`
78        // already returns it without the scheme. The `/` inside the suffix
79        // is URL-encoded by `reqwest` when the request is built; wiremock
80        // sees the decoded path on its `path()` matcher.
81        let path = format!("/works/{}", doi.as_str());
82        self.base.join(&path).map_err(|e| FetchError::SourceSchema {
83            hint: format!("crossref URL construction failed: {e}"),
84        })
85    }
86
87    /// Resolves a free-form bibliographic citation string to ranked DOI candidates.
88    pub async fn resolve_citation(
89        &self,
90        query: &str,
91        rows: u8,
92        ctx: &FetchContext,
93    ) -> Result<Vec<crate::ResolvedCandidate>, FetchError> {
94        // 1. Rate limiter
95        let _permit = ctx.rate_limiter.acquire(self.name()).await;
96
97        // 2. Build works query URL
98        // /works?query.bibliographic=<query>&rows=<rows>&mailto=<email>
99        let mut url = self
100            .base
101            .join("/works")
102            .map_err(|e| FetchError::SourceSchema {
103                hint: format!("crossref resolve_citation URL construction failed: {e}"),
104            })?;
105        url.query_pairs_mut()
106            .append_pair("query.bibliographic", query)
107            .append_pair("rows", &rows.to_string())
108            .append_pair("mailto", &self.contact_email);
109
110        // 3. HTTP fetch
111        let (body, _final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
112
113        // 4. Parse JSON
114        let envelope: serde_json::Value =
115            serde_json::from_slice(&body).map_err(|e| FetchError::SourceSchema {
116                hint: format!("crossref returned non-JSON for search: {e}"),
117            })?;
118
119        let items = envelope
120            .get("message")
121            .and_then(|m| m.get("items"))
122            .and_then(|i| i.as_array())
123            .ok_or_else(|| FetchError::SourceSchema {
124                hint: "crossref response missing message.items".to_string(),
125            })?;
126
127        // 5. Tokenize query (unique tokens, sorted)
128        let query_tokens = {
129            let mut t: Vec<String> = query
130                .split(|c: char| !c.is_alphanumeric())
131                .map(|s| s.to_lowercase())
132                .filter(|s| !s.is_empty())
133                .collect();
134            t.sort();
135            t.dedup();
136            t
137        };
138
139        if query_tokens.is_empty() {
140            return Ok(Vec::new());
141        }
142
143        let mut candidates = Vec::new();
144
145        for item in items {
146            let doi = match item.get("DOI").and_then(|v| v.as_str()) {
147                Some(d) => d.to_string(),
148                None => continue,
149            };
150
151            let fields = crate::orchestrator::extract_crossref_fields(item);
152
153            // Construct search text from candidate
154            let mut candidate_text = String::new();
155            if let Some(t) = &fields.title {
156                candidate_text.push_str(&t.to_lowercase());
157                candidate_text.push(' ');
158            }
159            if let Some(author) = fields.authors.first() {
160                candidate_text.push_str(&author.to_lowercase());
161                candidate_text.push(' ');
162            }
163            if let Some(v) = &fields.venue {
164                candidate_text.push_str(&v.to_lowercase());
165                candidate_text.push(' ');
166            }
167            if let Some(y) = fields.year {
168                candidate_text.push_str(&y.to_string());
169                candidate_text.push(' ');
170            }
171
172            // Simple tokenize of candidate into a HashSet for O(1) lookup.
173            let candidate_tokens: HashSet<String> = candidate_text
174                .split(|c: char| !c.is_alphanumeric())
175                .map(|s| s.to_lowercase())
176                .filter(|s| !s.is_empty())
177                .collect();
178
179            let matched = query_tokens
180                .iter()
181                .filter(|q| candidate_tokens.contains(*q))
182                .count();
183
184            let score = matched as f64 / query_tokens.len() as f64;
185
186            if score >= MIN_CITATION_SCORE {
187                let first_author = fields.authors.first().cloned().unwrap_or_default();
188                candidates.push(crate::ResolvedCandidate {
189                    doi,
190                    title: fields.title.unwrap_or_default(),
191                    author: first_author,
192                    year: fields.year,
193                    score,
194                    source: "crossref".to_string(),
195                });
196            }
197        }
198
199        // 6. Sort candidates by score descending
200        candidates.sort_by(|a, b| {
201            b.score
202                .partial_cmp(&a.score)
203                .unwrap_or(std::cmp::Ordering::Equal)
204        });
205
206        Ok(candidates)
207    }
208}
209
210#[async_trait]
211impl Source for CrossrefSource {
212    fn name(&self) -> &str {
213        "crossref"
214    }
215
216    fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
217        matches!(ref_, Ref::Doi(_))
218    }
219
220    async fn fetch(
221        &self,
222        ref_: &Ref,
223        _profile: &CapabilityProfile,
224        ctx: &FetchContext,
225    ) -> Result<FetchResult, FetchError> {
226        let doi = match ref_ {
227            Ref::Doi(d) => d,
228            Ref::Arxiv(_) => {
229                return Err(FetchError::NotEligible {
230                    source_key: "crossref".into(),
231                });
232            }
233        };
234
235        // Step 1: rate limiter (politeness — `docs/SOURCES.md` §6).
236        let _permit = ctx.rate_limiter.acquire(self.name()).await;
237
238        // Step 2: HTTP fetch. Body is JSON; the `PDF_MAX_BYTES` size cap in
239        // `HttpClient` applies. Crossref responses are well under 100 MB
240        // even for bibliographically rich DOIs.
241        let url = self.request_url(doi)?;
242        let (body, final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
243
244        // Step 3: parse the response envelope. Crossref wraps the work
245        // record in a top-level `{ "status": "ok", "message": { ... } }`
246        // envelope (per <https://api.crossref.org/swagger-ui/index.html>).
247        let envelope: CrossrefEnvelope =
248            serde_json::from_slice(&body).map_err(|e| FetchError::SourceSchema {
249                hint: format!("crossref returned non-JSON: {e}"),
250            })?;
251        if envelope.status != "ok" {
252            return Err(FetchError::SourceSchema {
253                hint: format!("crossref status = {}", envelope.status),
254            });
255        }
256
257        // Step 4: log the fetch event (`docs/PROVENANCE_LOG.md` §3).
258        // ADR-0021 §1 canonical-digest: promote the ref under the
259        // "crossref" resolver profile (no version — Crossref does not
260        // expose a per-call version token in Phase 1).
261        let canonical = ref_.promote(self.name(), None).digest_hex();
262        ctx.log.append(RowInput {
263            event: LogEvent::Fetch,
264            result: LogResult::Ok,
265            capability: Capability::Oa,
266            ref_: Some(doi.as_str()),
267            source: Some(self.name()),
268            error_code: None,
269            size_bytes: Some(body.len() as u64),
270            license: None,
271            store_path: None,
272            canonical_digest: Some(&canonical),
273        })?;
274
275        Ok(FetchResult {
276            source: self.name().to_string(),
277            license: "unknown".into(),
278            // Crossref is metadata; PDF retrieval is the job of Unpaywall /
279            // publisher sources (Phase 1+ sibling PRs).
280            pdf_bytes: None,
281            final_url: Some(final_url),
282            metadata_json: Some(envelope.message),
283        })
284    }
285}
286
/// Top-level Crossref response envelope, as deserialized by
/// `CrossrefSource::fetch`.
///
/// Only `status` and `message` are load-bearing here; other keys such as
/// `message-type` and `message-version` are not declared and therefore
/// ignored during deserialization. Neither field has a serde default, so a
/// body lacking one of them fails to deserialize.
#[derive(Debug, Deserialize)]
struct CrossrefEnvelope {
    /// Outcome marker; `fetch` rejects anything other than `"ok"`.
    status: String,
    /// The work record itself, handed back verbatim as
    /// `FetchResult::metadata_json`.
    message: serde_json::Value,
}
294
295// ---------------------------------------------------------------------------
296// Tests
297// ---------------------------------------------------------------------------
298
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    use std::sync::Arc;

    use camino::Utf8PathBuf;
    use tempfile::TempDir;
    use wiremock::matchers::{method, path};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    use crate::http::HttpClient;
    use crate::provenance::ProvenanceLog;
    use crate::rate_limiter::RateLimiter;
    use crate::{ArxivId, CapabilityProfile, Doi, RateLimits, Ref};

    /// DOI used by every fetch test.
    const EXAMPLE_DOI: &str = "10.1234/example";
    /// Request path the source is expected to hit for [`EXAMPLE_DOI`].
    const EXAMPLE_WORK_PATH: &str = "/works/10.1234/example";
    /// Minimal successful `/works/{doi}` response body.
    const OK_ENVELOPE: &str = r#"{"status":"ok","message":{"title":["Example"]}}"#;
    /// Polite-pool contact handed to every source under test.
    const CONTACT: &str = "test@example.org";
    /// File name of the provenance log inside the per-test tempdir.
    const LOG_FILE: &str = "test.jsonl";
    /// Title of the matching item in the citation-search fixture.
    const ONSAGER_TITLE: &str = "Lars Onsager, Crystal Statistics. I. A Two-Dimensional Model with an Order-Disorder Transition";

    /// Build a `FetchContext` whose [`HttpClient`] allows the wiremock
    /// `http://` origin under the `crossref` source key, plus a
    /// tempdir-backed `ProvenanceLog`. The tempdir is returned so the caller
    /// keeps it alive for the duration of the test.
    fn build_test_context(wiremock_host: &str) -> (TempDir, FetchContext) {
        let tempdir = TempDir::new().expect("tempdir");
        // Workspace lints ban `std::path::PathBuf`; convert via camino.
        let log_path = Utf8PathBuf::try_from(tempdir.path().to_path_buf())
            .expect("temp dir path must be UTF-8")
            .join(LOG_FILE);
        let session_id = String::from("01J0000000000000000000TEST");

        let ctx = FetchContext {
            // Test-only constructor that relaxes `https_only` for the initial
            // leg so wiremock (plain HTTP) can be reached. The redirect
            // closure still rejects http:// targets — see
            // `http.rs::build_client_allow_http`.
            http: Arc::new(HttpClient::new_for_tests_allow_http(
                "crossref",
                wiremock_host,
            )),
            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
            log: Arc::new(
                ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
            ),
            session_id,
            cache_root: None,
        };

        (tempdir, ctx)
    }

    /// Extract the host string of a wiremock server's URI.
    fn server_host(server: &MockServer) -> String {
        let uri = Url::parse(&server.uri()).expect("wiremock uri parses");
        uri.host_str().expect("wiremock uri has host").to_owned()
    }

    /// Build a [`CrossrefSource`] pointing at the given wiremock URI.
    fn crossref_for(server: &MockServer) -> CrossrefSource {
        let base = Url::parse(&server.uri()).expect("wiremock uri parses");
        CrossrefSource::with_base(base, CONTACT.to_owned())
    }

    /// Source, tempdir and context all wired to `server`.
    fn wired_to(server: &MockServer) -> (CrossrefSource, TempDir, FetchContext) {
        let (tempdir, ctx) = build_test_context(&server_host(server));
        (crossref_for(server), tempdir, ctx)
    }

    /// Mount a `GET route` mock answering with `response`.
    async fn mount_get(server: &MockServer, route: &str, response: ResponseTemplate) {
        Mock::given(method("GET"))
            .and(path(route))
            .respond_with(response)
            .mount(server)
            .await;
    }

    /// Capability profile read from the (assumed clean) test environment.
    fn clean_profile() -> CapabilityProfile {
        CapabilityProfile::from_env().expect("clean env")
    }

    /// DOI reference for [`EXAMPLE_DOI`].
    fn example_doi_ref() -> Ref {
        Ref::Doi(Doi::parse(EXAMPLE_DOI).unwrap())
    }

    /// An arXiv reference, which Crossref must decline.
    fn example_arxiv_ref() -> Ref {
        Ref::Arxiv(ArxivId::parse("2401.12345").unwrap())
    }

    #[test]
    fn crossref_can_serve_returns_true_for_doi() {
        let source = CrossrefSource::new(CONTACT.into());
        let profile = clean_profile();
        let target = example_doi_ref();
        assert!(source.can_serve(&profile, &target));
    }

    #[test]
    fn crossref_can_serve_returns_false_for_arxiv() {
        let source = CrossrefSource::new(CONTACT.into());
        let profile = clean_profile();
        let target = example_arxiv_ref();
        assert!(!source.can_serve(&profile, &target));
    }

    #[tokio::test]
    async fn crossref_fetch_returns_envelope_message() {
        let server = MockServer::start().await;
        mount_get(
            &server,
            EXAMPLE_WORK_PATH,
            ResponseTemplate::new(200).set_body_string(OK_ENVELOPE),
        )
        .await;

        let (source, _tempdir, ctx) = wired_to(&server);
        let profile = clean_profile();
        let target = example_doi_ref();

        let fetched = source
            .fetch(&target, &profile, &ctx)
            .await
            .expect("fetch ok");
        assert_eq!(fetched.source, "crossref");
        assert_eq!(
            fetched.metadata_json,
            Some(serde_json::json!({ "title": ["Example"] })),
        );
        assert!(fetched.pdf_bytes.is_none());
        assert!(fetched.final_url.is_some());
    }

    #[tokio::test]
    async fn crossref_fetch_with_arxiv_ref_errors_not_eligible() {
        // No wiremock needed: the arxiv branch short-circuits before any
        // outbound call, so a dummy base and a dummy allowlist host suffice.
        let source = CrossrefSource::with_base(
            Url::parse("http://127.0.0.1:1/").unwrap(),
            CONTACT.into(),
        );
        let (_tempdir, ctx) = build_test_context("127.0.0.1");
        let profile = clean_profile();
        let target = example_arxiv_ref();

        let err = source
            .fetch(&target, &profile, &ctx)
            .await
            .expect_err("not eligible");
        match err {
            FetchError::NotEligible { source_key } => assert_eq!(source_key, "crossref"),
            other => panic!("expected NotEligible, got {:?}", other),
        }
    }

    #[tokio::test]
    async fn crossref_fetch_writes_log_row() {
        let server = MockServer::start().await;
        mount_get(
            &server,
            EXAMPLE_WORK_PATH,
            ResponseTemplate::new(200).set_body_string(OK_ENVELOPE),
        )
        .await;

        let (source, tempdir, ctx) = wired_to(&server);
        let profile = clean_profile();
        let target = example_doi_ref();

        let _fetched = source
            .fetch(&target, &profile, &ctx)
            .await
            .expect("fetch ok");

        // Read the log back as raw JSON Lines and check the single row's
        // semantic fields. ProvenanceLog internals are deliberately not
        // touched — the public read path is "parse the JSONL by line".
        let log_path = tempdir.path().join(LOG_FILE);
        let raw = std::fs::read_to_string(&log_path).expect("log file readable");
        let lines: Vec<&str> = raw.lines().filter(|l| !l.is_empty()).collect();
        assert_eq!(lines.len(), 1, "expected exactly one row, got {:?}", lines);

        let row: serde_json::Value = serde_json::from_str(lines[0]).expect("row is valid JSON");
        let expected = [
            ("event", "fetch"),
            ("result", "ok"),
            ("source", "crossref"),
            ("ref", EXAMPLE_DOI),
        ];
        for (key, value) in expected {
            assert_eq!(row[key], value);
        }
    }

    #[tokio::test]
    async fn crossref_404_maps_to_http_error() {
        let server = MockServer::start().await;
        mount_get(&server, EXAMPLE_WORK_PATH, ResponseTemplate::new(404)).await;

        let (source, _tempdir, ctx) = wired_to(&server);
        let profile = clean_profile();
        let target = example_doi_ref();

        let err = source
            .fetch(&target, &profile, &ctx)
            .await
            .expect_err("404 errors");
        match err {
            FetchError::Http(_) => {}
            other => panic!("expected Http(_) on 404, got {:?}", other),
        }
    }

    #[tokio::test]
    async fn crossref_non_ok_status_field_errors_source_schema() {
        let server = MockServer::start().await;
        mount_get(
            &server,
            EXAMPLE_WORK_PATH,
            ResponseTemplate::new(200).set_body_string(r#"{"status":"error","message":{}}"#),
        )
        .await;

        let (source, _tempdir, ctx) = wired_to(&server);
        let profile = clean_profile();
        let target = example_doi_ref();

        let err = source
            .fetch(&target, &profile, &ctx)
            .await
            .expect_err("non-ok status errors");
        match err {
            FetchError::SourceSchema { hint } => {
                assert!(
                    hint.contains("status"),
                    "expected status mention in hint, got {hint}"
                );
            }
            other => panic!("expected SourceSchema, got {:?}", other),
        }
    }

    #[tokio::test]
    async fn test_resolve_citation_success() {
        let server = MockServer::start().await;
        let search_response = serde_json::json!({
            "status": "ok",
            "message": {
                "items": [
                    {
                        "DOI": "10.1000/xyz123",
                        "title": [ONSAGER_TITLE],
                        "author": [
                            {"family": "Onsager", "given": "Lars"}
                        ],
                        "issued": {
                            "date-parts": [[1944, 2, 1]]
                        },
                        "container-title": ["Physical Review"]
                    },
                    {
                        "DOI": "10.1000/unrelated",
                        "title": ["Some Unrelated Paper"],
                        "author": [
                            {"family": "Smith", "given": "John"}
                        ],
                        "issued": {
                            "date-parts": [[2020]]
                        }
                    }
                ]
            }
        });
        mount_get(
            &server,
            "/works",
            ResponseTemplate::new(200).set_body_json(search_response),
        )
        .await;

        let (source, _tempdir, ctx) = wired_to(&server);

        let candidates = source
            .resolve_citation("Onsager 1944", 2, &ctx)
            .await
            .expect("resolve ok");

        // "Onsager 1944" tokenizes to ["onsager", "1944"]. The first item
        // contains both (author family name and issued year) and scores 1.0;
        // the second contains neither, scores 0.0 and is filtered out.
        assert_eq!(candidates.len(), 1);
        let best = &candidates[0];
        assert_eq!(best.doi, "10.1000/xyz123");
        assert_eq!(best.title, ONSAGER_TITLE);
        assert_eq!(best.author, "Onsager, Lars");
        assert_eq!(best.year, Some(1944));
        assert_eq!(best.score, 1.0);
    }
}