Skip to main content

doiget_core/sources/
crossref.rs

1//! Crossref source — DOI metadata + OA URL discovery via `link[]` array.
2//!
3//! Spec: docs/SOURCES.md §4 (Crossref). No auth; polite-pool User-Agent
4//! contact email is REQUIRED — see [`CrossrefSource::new`].
5
6use std::collections::HashSet;
7
8use async_trait::async_trait;
9use serde::Deserialize;
10use url::Url;
11
12use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
13use crate::source::{FetchContext, FetchError, FetchResult, Source};
14use crate::{CapabilityProfile, Ref};
15
/// Origin of the production Crossref REST API, fixed by `docs/SOURCES.md`
/// §4. Tests substitute a wiremock origin through
/// [`CrossrefSource::with_base`].
const DEFAULT_BASE: &str = "https://api.crossref.org";

/// Inclusive lower bound on a candidate's score for it to be returned by
/// [`CrossrefSource::resolve_citation`]. The score is the fraction of
/// distinct query tokens that also occur in the candidate's metadata.
const MIN_CITATION_SCORE: f64 = 0.5;
23
24/// Crossref [`Source`] impl — DOI → metadata; OA URL via `message.link[]`.
25///
26/// See `docs/SOURCES.md` §4 for the access policy (no auth, polite pool).
27#[derive(Clone, Debug)]
28pub struct CrossrefSource {
29    /// API base URL. Production constructor pins this to
30    /// `https://api.crossref.org`; the [`with_base`](Self::with_base)
31    /// test-only constructor lets wiremock substitute an `http://127.0.0.1:N`
32    /// origin.
33    base: Url,
34    /// Polite-pool contact email per `docs/SOURCES.md` §4 Crossref.
35    /// Concretely formatted into the `User-Agent` header by [`crate::http::HttpClient`].
36    /// (Phase 1: caller injects via [`CrossrefSource::new`]; CLI / config wiring
37    /// lands in a follow-up PR.)
38    #[allow(dead_code)]
39    contact_email: String,
40}
41
42impl CrossrefSource {
43    /// Production constructor: hard-codes `https://api.crossref.org` as the
44    /// base URL. The `contact_email` value is appended to the polite-pool
45    /// User-Agent (config plumbing arrives in a later PR — see
46    /// `docs/SOURCES.md` §4).
47    #[must_use]
48    pub fn new(contact_email: String) -> Self {
49        Self {
50            // The hard-coded constant is a known-valid URL; the `expect`
51            // here is the documented exception to the workspace
52            // `expect_used` lint (it can never fire in practice).
53            #[allow(clippy::expect_used)]
54            base: Url::parse(DEFAULT_BASE).expect("hard-coded base URL is valid"),
55            contact_email,
56        }
57    }
58
59    /// Construct with an arbitrary base URL.
60    ///
61    /// The orchestrator (`doiget-cli::commands::fetch`) uses this to honor
62    /// the `DOIGET_CROSSREF_BASE` env var, which lets integration tests point
63    /// the source at a wiremock origin without compile-time gates. Production
64    /// callers use [`CrossrefSource::new`].
65    pub fn with_base(base: Url, contact_email: String) -> Self {
66        Self {
67            base,
68            contact_email,
69        }
70    }
71
72    /// Build the `/works/{doi}` URL for the configured base. Returns
73    /// [`FetchError::SourceSchema`] if joining the path produces an invalid
74    /// URL (only possible if the base URL is malformed — should never happen
75    /// in production).
76    fn request_url(&self, doi: &crate::Doi) -> Result<Url, FetchError> {
77        // Crossref accepts the bare DOI (no `doi:` scheme). `Doi::as_str()`
78        // already returns it without the scheme. The `/` inside the suffix
79        // is URL-encoded by `reqwest` when the request is built; wiremock
80        // sees the decoded path on its `path()` matcher.
81        let path = format!("/works/{}", doi.as_str());
82        self.base.join(&path).map_err(|e| FetchError::SourceSchema {
83            hint: format!("crossref URL construction failed: {e}"),
84        })
85    }
86
87    /// Resolves a free-form bibliographic citation string to ranked DOI candidates.
88    pub async fn resolve_citation(
89        &self,
90        query: &str,
91        rows: u8,
92        ctx: &FetchContext,
93    ) -> Result<Vec<crate::ResolvedCandidate>, FetchError> {
94        // 1. Rate limiter
95        let _permit = ctx.rate_limiter.acquire(self.name()).await;
96
97        // 2. Build works query URL
98        // /works?query.bibliographic=<query>&rows=<rows>&mailto=<email>
99        let mut url = self
100            .base
101            .join("/works")
102            .map_err(|e| FetchError::SourceSchema {
103                hint: format!("crossref resolve_citation URL construction failed: {e}"),
104            })?;
105        url.query_pairs_mut()
106            .append_pair("query.bibliographic", query)
107            .append_pair("rows", &rows.to_string())
108            .append_pair("mailto", &self.contact_email);
109
110        // 3. HTTP fetch
111        let (body, _final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
112
113        // 4. Parse JSON
114        let envelope: serde_json::Value =
115            serde_json::from_slice(&body).map_err(|e| FetchError::SourceSchema {
116                hint: format!("crossref returned non-JSON for search: {e}"),
117            })?;
118
119        let items = envelope
120            .get("message")
121            .and_then(|m| m.get("items"))
122            .and_then(|i| i.as_array())
123            .ok_or_else(|| FetchError::SourceSchema {
124                hint: "crossref response missing message.items".to_string(),
125            })?;
126
127        // 5. Tokenize query (unique tokens, sorted)
128        let query_tokens = {
129            let mut t: Vec<String> = query
130                .split(|c: char| !c.is_alphanumeric())
131                .map(|s| s.to_lowercase())
132                .filter(|s| !s.is_empty())
133                .collect();
134            t.sort();
135            t.dedup();
136            t
137        };
138
139        if query_tokens.is_empty() {
140            return Ok(Vec::new());
141        }
142
143        let mut candidates = Vec::new();
144
145        for item in items {
146            let doi = match item.get("DOI").and_then(|v| v.as_str()) {
147                Some(d) => d.to_string(),
148                None => continue,
149            };
150
151            let fields = crate::orchestrator::extract_crossref_fields(item);
152
153            // Construct search text from candidate
154            let mut candidate_text = String::new();
155            if let Some(t) = &fields.title {
156                candidate_text.push_str(&t.to_lowercase());
157                candidate_text.push(' ');
158            }
159            // Score against ALL authors, not just the first, so a citation
160            // naming several authors (e.g. "Bulla Costi Pruschke 2008") still
161            // matches instead of being dropped below the threshold — #372.
162            for author in &fields.authors {
163                candidate_text.push_str(&author.to_lowercase());
164                candidate_text.push(' ');
165            }
166            if let Some(v) = &fields.venue {
167                candidate_text.push_str(&v.to_lowercase());
168                candidate_text.push(' ');
169            }
170            if let Some(y) = fields.year {
171                candidate_text.push_str(&y.to_string());
172                candidate_text.push(' ');
173            }
174
175            // Simple tokenize of candidate into a HashSet for O(1) lookup.
176            let candidate_tokens: HashSet<String> = candidate_text
177                .split(|c: char| !c.is_alphanumeric())
178                .map(|s| s.to_lowercase())
179                .filter(|s| !s.is_empty())
180                .collect();
181
182            let matched = query_tokens
183                .iter()
184                .filter(|q| candidate_tokens.contains(*q))
185                .count();
186
187            let score = matched as f64 / query_tokens.len() as f64;
188
189            if score >= MIN_CITATION_SCORE {
190                let first_author = fields.authors.first().cloned().unwrap_or_default();
191                candidates.push(crate::ResolvedCandidate {
192                    doi,
193                    title: fields.title.unwrap_or_default(),
194                    author: first_author,
195                    year: fields.year,
196                    score,
197                    source: "crossref".to_string(),
198                });
199            }
200        }
201
202        // 6. Sort candidates by score descending
203        candidates.sort_by(|a, b| {
204            b.score
205                .partial_cmp(&a.score)
206                .unwrap_or(std::cmp::Ordering::Equal)
207        });
208
209        Ok(candidates)
210    }
211}
212
213#[async_trait]
214impl Source for CrossrefSource {
215    fn name(&self) -> &str {
216        "crossref"
217    }
218
219    fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
220        matches!(ref_, Ref::Doi(_))
221    }
222
223    async fn fetch(
224        &self,
225        ref_: &Ref,
226        _profile: &CapabilityProfile,
227        ctx: &FetchContext,
228    ) -> Result<FetchResult, FetchError> {
229        let doi = match ref_ {
230            Ref::Doi(d) => d,
231            Ref::Arxiv(_) => {
232                return Err(FetchError::NotEligible {
233                    source_key: "crossref".into(),
234                });
235            }
236        };
237
238        // Step 1: rate limiter (politeness — `docs/SOURCES.md` §6).
239        let _permit = ctx.rate_limiter.acquire(self.name()).await;
240
241        // Step 2: HTTP fetch. Body is JSON; the `PDF_MAX_BYTES` size cap in
242        // `HttpClient` applies. Crossref responses are well under 100 MB
243        // even for bibliographically rich DOIs.
244        let url = self.request_url(doi)?;
245        let (body, final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
246
247        // Step 3: parse the response envelope. Crossref wraps the work
248        // record in a top-level `{ "status": "ok", "message": { ... } }`
249        // envelope (per <https://api.crossref.org/swagger-ui/index.html>).
250        let envelope: CrossrefEnvelope =
251            serde_json::from_slice(&body).map_err(|e| FetchError::SourceSchema {
252                hint: format!("crossref returned non-JSON: {e}"),
253            })?;
254        if envelope.status != "ok" {
255            return Err(FetchError::SourceSchema {
256                hint: format!("crossref status = {}", envelope.status),
257            });
258        }
259
260        // Step 4: log the fetch event (`docs/PROVENANCE_LOG.md` §3).
261        // ADR-0021 §1 canonical-digest: promote the ref under the
262        // "crossref" resolver profile (no version — Crossref does not
263        // expose a per-call version token in Phase 1).
264        let canonical = ref_.promote(self.name(), None).digest_hex();
265        ctx.log.append(RowInput {
266            event: LogEvent::Fetch,
267            result: LogResult::Ok,
268            capability: Capability::Oa,
269            ref_: Some(doi.as_str()),
270            source: Some(self.name()),
271            error_code: None,
272            size_bytes: Some(body.len() as u64),
273            license: None,
274            store_path: None,
275            canonical_digest: Some(&canonical),
276        })?;
277
278        Ok(FetchResult {
279            source: self.name().to_string(),
280            license: "unknown".into(),
281            // Crossref is metadata; PDF retrieval is the job of Unpaywall /
282            // publisher sources (Phase 1+ sibling PRs).
283            pdf_bytes: None,
284            final_url: Some(final_url),
285            metadata_json: Some(envelope.message),
286        })
287    }
288}
289
290/// Top-level Crossref response envelope. Only `status` and `message` are
291/// load-bearing here; `message-type`, `message-version`, etc. are ignored.
292#[derive(Debug, Deserialize)]
293struct CrossrefEnvelope {
294    status: String,
295    message: serde_json::Value,
296}
297
298// ---------------------------------------------------------------------------
299// Tests
300// ---------------------------------------------------------------------------
301
302#[cfg(test)]
303#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
304mod tests {
305    use super::*;
306
307    use std::sync::Arc;
308
309    use camino::Utf8PathBuf;
310    use tempfile::TempDir;
311    use wiremock::matchers::{method, path};
312    use wiremock::{Mock, MockServer, ResponseTemplate};
313
314    use crate::http::HttpClient;
315    use crate::provenance::ProvenanceLog;
316    use crate::rate_limiter::RateLimiter;
317    use crate::{ArxivId, CapabilityProfile, Doi, RateLimits, Ref};
318
319    /// Build a `FetchContext` whose [`HttpClient`] allows the wiremock
320    /// `http://` origin under the `crossref` source key, plus a
321    /// tempdir-backed `ProvenanceLog`. Returns the tempdir so the caller
322    /// keeps it alive for the duration of the test.
323    fn build_test_context(wiremock_host: &str) -> (TempDir, FetchContext) {
324        let td = TempDir::new().expect("tempdir");
325        // Workspace lints ban `std::path::PathBuf`; convert via camino.
326        let log_dir =
327            Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
328        let log_path = log_dir.join("test.jsonl");
329
330        // Use the test-only constructor that relaxes `https_only` for the
331        // initial leg so wiremock (which serves over plain HTTP) can be
332        // reached. Redirect closure still rejects http:// targets — see
333        // `http.rs::build_client_allow_http`.
334        let http = Arc::new(HttpClient::new_for_tests_allow_http(
335            "crossref",
336            wiremock_host,
337        ));
338        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
339        let session_id = "01J0000000000000000000TEST".to_string();
340        let log = Arc::new(
341            ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
342        );
343
344        (
345            td,
346            FetchContext {
347                http,
348                rate_limiter,
349                log,
350                session_id,
351                cache_root: None,
352            },
353        )
354    }
355
356    /// Extract the host string of a wiremock server's URI.
357    fn server_host(server: &MockServer) -> String {
358        server
359            .uri()
360            .parse::<Url>()
361            .expect("wiremock uri parses")
362            .host_str()
363            .expect("wiremock uri has host")
364            .to_string()
365    }
366
367    /// Build a [`CrossrefSource`] pointing at the given wiremock URI.
368    fn crossref_for(server: &MockServer) -> CrossrefSource {
369        let base = server.uri().parse::<Url>().expect("wiremock uri parses");
370        CrossrefSource::with_base(base, "test@example.org".to_string())
371    }
372
373    #[test]
374    fn crossref_can_serve_returns_true_for_doi() {
375        let s = CrossrefSource::new("test@example.org".into());
376        let profile = CapabilityProfile::from_env().expect("clean env");
377        let r = Ref::Doi(Doi::parse("10.1234/example").unwrap());
378        assert!(s.can_serve(&profile, &r));
379    }
380
381    #[test]
382    fn crossref_can_serve_returns_false_for_arxiv() {
383        let s = CrossrefSource::new("test@example.org".into());
384        let profile = CapabilityProfile::from_env().expect("clean env");
385        let r = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());
386        assert!(!s.can_serve(&profile, &r));
387    }
388
389    #[tokio::test]
390    async fn crossref_fetch_returns_envelope_message() {
391        let server = MockServer::start().await;
392        Mock::given(method("GET"))
393            .and(path("/works/10.1234/example"))
394            .respond_with(
395                ResponseTemplate::new(200)
396                    .set_body_string(r#"{"status":"ok","message":{"title":["Example"]}}"#),
397            )
398            .mount(&server)
399            .await;
400
401        let host = server_host(&server);
402        let s = crossref_for(&server);
403        let (_td, ctx) = build_test_context(&host);
404        let profile = CapabilityProfile::from_env().expect("clean env");
405        let r = Ref::Doi(Doi::parse("10.1234/example").unwrap());
406
407        let res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");
408        assert_eq!(res.source, "crossref");
409        assert_eq!(
410            res.metadata_json,
411            Some(serde_json::json!({ "title": ["Example"] })),
412        );
413        assert!(res.pdf_bytes.is_none());
414        assert!(res.final_url.is_some());
415    }
416
417    #[tokio::test]
418    async fn crossref_fetch_with_arxiv_ref_errors_not_eligible() {
419        // wiremock not needed: the arxiv branch short-circuits before any
420        // outbound call. Construct the source with a dummy base, and pass
421        // a dummy allowlist host since fetch never reaches the HTTP layer.
422        let s = CrossrefSource::with_base(
423            Url::parse("http://127.0.0.1:1/").unwrap(),
424            "test@example.org".into(),
425        );
426        let (_td, ctx) = build_test_context("127.0.0.1");
427        let profile = CapabilityProfile::from_env().expect("clean env");
428        let r = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());
429
430        let err = s.fetch(&r, &profile, &ctx).await.expect_err("not eligible");
431        match err {
432            FetchError::NotEligible { source_key } => {
433                assert_eq!(source_key, "crossref");
434            }
435            other => panic!("expected NotEligible, got {:?}", other),
436        }
437    }
438
439    #[tokio::test]
440    async fn crossref_fetch_writes_log_row() {
441        let server = MockServer::start().await;
442        Mock::given(method("GET"))
443            .and(path("/works/10.1234/example"))
444            .respond_with(
445                ResponseTemplate::new(200)
446                    .set_body_string(r#"{"status":"ok","message":{"title":["Example"]}}"#),
447            )
448            .mount(&server)
449            .await;
450
451        let host = server_host(&server);
452        let s = crossref_for(&server);
453        let (_td, ctx) = build_test_context(&host);
454        let profile = CapabilityProfile::from_env().expect("clean env");
455        let r = Ref::Doi(Doi::parse("10.1234/example").unwrap());
456
457        let _res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");
458
459        // Reopen the log file as raw JSON Lines and assert the single row's
460        // semantic fields. We deliberately don't reach into ProvenanceLog
461        // internals — the public read path is "parse the JSONL by line".
462        let log_path = _td.path().join("test.jsonl");
463        let raw = std::fs::read_to_string(&log_path).expect("log file readable");
464        let lines: Vec<&str> = raw.lines().filter(|l| !l.is_empty()).collect();
465        assert_eq!(lines.len(), 1, "expected exactly one row, got {:?}", lines);
466        let row: serde_json::Value = serde_json::from_str(lines[0]).expect("row is valid JSON");
467        assert_eq!(row["event"], "fetch");
468        assert_eq!(row["result"], "ok");
469        assert_eq!(row["source"], "crossref");
470        assert_eq!(row["ref"], "10.1234/example");
471    }
472
473    #[tokio::test]
474    async fn crossref_404_maps_to_http_error() {
475        let server = MockServer::start().await;
476        Mock::given(method("GET"))
477            .and(path("/works/10.1234/example"))
478            .respond_with(ResponseTemplate::new(404))
479            .mount(&server)
480            .await;
481
482        let host = server_host(&server);
483        let s = crossref_for(&server);
484        let (_td, ctx) = build_test_context(&host);
485        let profile = CapabilityProfile::from_env().expect("clean env");
486        let r = Ref::Doi(Doi::parse("10.1234/example").unwrap());
487
488        let err = s.fetch(&r, &profile, &ctx).await.expect_err("404 errors");
489        match err {
490            FetchError::Http(_) => {}
491            other => panic!("expected Http(_) on 404, got {:?}", other),
492        }
493    }
494
495    #[tokio::test]
496    async fn crossref_non_ok_status_field_errors_source_schema() {
497        let server = MockServer::start().await;
498        Mock::given(method("GET"))
499            .and(path("/works/10.1234/example"))
500            .respond_with(
501                ResponseTemplate::new(200).set_body_string(r#"{"status":"error","message":{}}"#),
502            )
503            .mount(&server)
504            .await;
505
506        let host = server_host(&server);
507        let s = crossref_for(&server);
508        let (_td, ctx) = build_test_context(&host);
509        let profile = CapabilityProfile::from_env().expect("clean env");
510        let r = Ref::Doi(Doi::parse("10.1234/example").unwrap());
511
512        let err = s
513            .fetch(&r, &profile, &ctx)
514            .await
515            .expect_err("non-ok status errors");
516        match err {
517            FetchError::SourceSchema { hint } => {
518                assert!(
519                    hint.contains("status"),
520                    "expected status mention in hint, got {hint}"
521                );
522            }
523            other => panic!("expected SourceSchema, got {:?}", other),
524        }
525    }
526
527    #[tokio::test]
528    async fn test_resolve_citation_success() {
529        let server = MockServer::start().await;
530        let mock_body = serde_json::json!({
531            "status": "ok",
532            "message": {
533                "items": [
534                    {
535                        "DOI": "10.1000/xyz123",
536                        "title": ["Lars Onsager, Crystal Statistics. I. A Two-Dimensional Model with an Order-Disorder Transition"],
537                        "author": [
538                            {"family": "Onsager", "given": "Lars"}
539                        ],
540                        "issued": {
541                            "date-parts": [[1944, 2, 1]]
542                        },
543                        "container-title": ["Physical Review"]
544                    },
545                    {
546                        "DOI": "10.1000/unrelated",
547                        "title": ["Some Unrelated Paper"],
548                        "author": [
549                            {"family": "Smith", "given": "John"}
550                        ],
551                        "issued": {
552                            "date-parts": [[2020]]
553                        }
554                    }
555                ]
556            }
557        });
558
559        Mock::given(method("GET"))
560            .and(path("/works"))
561            .respond_with(ResponseTemplate::new(200).set_body_json(mock_body))
562            .mount(&server)
563            .await;
564
565        let host = server_host(&server);
566        let s = crossref_for(&server);
567        let (_td, ctx) = build_test_context(&host);
568
569        let candidates = s
570            .resolve_citation("Onsager 1944", 2, &ctx)
571            .await
572            .expect("resolve ok");
573
574        // The query "Onsager 1944" has tokens ["onsager", "1944"].
575        // The first candidate has both "onsager" (author family) and "1944" (issued year). Score is 1.0.
576        // The second candidate has neither. Score is 0.0, filtered out.
577        assert_eq!(candidates.len(), 1);
578        let cand = &candidates[0];
579        assert_eq!(cand.doi, "10.1000/xyz123");
580        assert_eq!(cand.title, "Lars Onsager, Crystal Statistics. I. A Two-Dimensional Model with an Order-Disorder Transition");
581        assert_eq!(cand.author, "Onsager, Lars");
582        assert_eq!(cand.year, Some(1944));
583        assert_eq!(cand.score, 1.0);
584    }
585
586    #[tokio::test]
587    async fn resolve_citation_matches_non_first_authors() {
588        // #372: candidate scoring text must include ALL authors, not just the
589        // first. With the old first-author-only text, "Costi Pruschke 2008"
590        // (2nd / 3rd authors + year) matched only the year (1/3 = 0.33) and was
591        // filtered out; now all authors match (3/3 = 1.0).
592        let server = MockServer::start().await;
593        let mock_body = serde_json::json!({
594            "status": "ok",
595            "message": {
596                "items": [
597                    {
598                        "DOI": "10.1103/RevModPhys.80.395",
599                        "title": ["Numerical renormalization group method for quantum impurity systems"],
600                        "author": [
601                            {"family": "Bulla", "given": "Ralf"},
602                            {"family": "Costi", "given": "Theo A."},
603                            {"family": "Pruschke", "given": "Thomas"}
604                        ],
605                        "issued": { "date-parts": [[2008, 4, 2]] },
606                        "container-title": ["Reviews of Modern Physics"]
607                    }
608                ]
609            }
610        });
611
612        Mock::given(method("GET"))
613            .and(path("/works"))
614            .respond_with(ResponseTemplate::new(200).set_body_json(mock_body))
615            .mount(&server)
616            .await;
617
618        let host = server_host(&server);
619        let s = crossref_for(&server);
620        let (_td, ctx) = build_test_context(&host);
621
622        let candidates = s
623            .resolve_citation("Costi Pruschke 2008", 5, &ctx)
624            .await
625            .expect("resolve ok");
626
627        assert_eq!(candidates.len(), 1);
628        assert_eq!(candidates[0].doi, "10.1103/RevModPhys.80.395");
629        assert_eq!(candidates[0].score, 1.0);
630    }
631}