1use std::collections::HashSet;
7
8use async_trait::async_trait;
9use serde::Deserialize;
10use url::Url;
11
12use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
13use crate::source::{FetchContext, FetchError, FetchResult, Source};
14use crate::{CapabilityProfile, Ref};
15
/// Base URL that `CrossrefSource::new` parses and uses when no explicit base is supplied.
const DEFAULT_BASE: &str = "https://api.crossref.org";
19
/// Minimum score a search hit needs for `resolve_citation` to return it.
///
/// The score is the number of distinct query tokens found among the candidate's
/// tokens divided by the number of distinct query tokens, so `0.5` keeps hits that
/// match at least half of the query.
const MIN_CITATION_SCORE: f64 = 0.5;
23
24#[derive(Clone, Debug)]
28pub struct CrossrefSource {
29 base: Url,
34 #[allow(dead_code)]
39 contact_email: String,
40}
41
42impl CrossrefSource {
43 #[must_use]
48 pub fn new(contact_email: String) -> Self {
49 Self {
50 #[allow(clippy::expect_used)]
54 base: Url::parse(DEFAULT_BASE).expect("hard-coded base URL is valid"),
55 contact_email,
56 }
57 }
58
59 pub fn with_base(base: Url, contact_email: String) -> Self {
66 Self {
67 base,
68 contact_email,
69 }
70 }
71
72 fn request_url(&self, doi: &crate::Doi) -> Result<Url, FetchError> {
77 let path = format!("/works/{}", doi.as_str());
82 self.base.join(&path).map_err(|e| FetchError::SourceSchema {
83 hint: format!("crossref URL construction failed: {e}"),
84 })
85 }
86
87 pub async fn resolve_citation(
89 &self,
90 query: &str,
91 rows: u8,
92 ctx: &FetchContext,
93 ) -> Result<Vec<crate::ResolvedCandidate>, FetchError> {
94 let _permit = ctx.rate_limiter.acquire(self.name()).await;
96
97 let mut url = self
100 .base
101 .join("/works")
102 .map_err(|e| FetchError::SourceSchema {
103 hint: format!("crossref resolve_citation URL construction failed: {e}"),
104 })?;
105 url.query_pairs_mut()
106 .append_pair("query.bibliographic", query)
107 .append_pair("rows", &rows.to_string())
108 .append_pair("mailto", &self.contact_email);
109
110 let (body, _final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
112
113 let envelope: serde_json::Value =
115 serde_json::from_slice(&body).map_err(|e| FetchError::SourceSchema {
116 hint: format!("crossref returned non-JSON for search: {e}"),
117 })?;
118
119 let items = envelope
120 .get("message")
121 .and_then(|m| m.get("items"))
122 .and_then(|i| i.as_array())
123 .ok_or_else(|| FetchError::SourceSchema {
124 hint: "crossref response missing message.items".to_string(),
125 })?;
126
127 let query_tokens = {
129 let mut t: Vec<String> = query
130 .split(|c: char| !c.is_alphanumeric())
131 .map(|s| s.to_lowercase())
132 .filter(|s| !s.is_empty())
133 .collect();
134 t.sort();
135 t.dedup();
136 t
137 };
138
139 if query_tokens.is_empty() {
140 return Ok(Vec::new());
141 }
142
143 let mut candidates = Vec::new();
144
145 for item in items {
146 let doi = match item.get("DOI").and_then(|v| v.as_str()) {
147 Some(d) => d.to_string(),
148 None => continue,
149 };
150
151 let fields = crate::orchestrator::extract_crossref_fields(item);
152
153 let mut candidate_text = String::new();
155 if let Some(t) = &fields.title {
156 candidate_text.push_str(&t.to_lowercase());
157 candidate_text.push(' ');
158 }
159 for author in &fields.authors {
163 candidate_text.push_str(&author.to_lowercase());
164 candidate_text.push(' ');
165 }
166 if let Some(v) = &fields.venue {
167 candidate_text.push_str(&v.to_lowercase());
168 candidate_text.push(' ');
169 }
170 if let Some(y) = fields.year {
171 candidate_text.push_str(&y.to_string());
172 candidate_text.push(' ');
173 }
174
175 let candidate_tokens: HashSet<String> = candidate_text
177 .split(|c: char| !c.is_alphanumeric())
178 .map(|s| s.to_lowercase())
179 .filter(|s| !s.is_empty())
180 .collect();
181
182 let matched = query_tokens
183 .iter()
184 .filter(|q| candidate_tokens.contains(*q))
185 .count();
186
187 let score = matched as f64 / query_tokens.len() as f64;
188
189 if score >= MIN_CITATION_SCORE {
190 let first_author = fields.authors.first().cloned().unwrap_or_default();
191 candidates.push(crate::ResolvedCandidate {
192 doi,
193 title: fields.title.unwrap_or_default(),
194 author: first_author,
195 year: fields.year,
196 score,
197 source: "crossref".to_string(),
198 });
199 }
200 }
201
202 candidates.sort_by(|a, b| {
204 b.score
205 .partial_cmp(&a.score)
206 .unwrap_or(std::cmp::Ordering::Equal)
207 });
208
209 Ok(candidates)
210 }
211}
212
213#[async_trait]
214impl Source for CrossrefSource {
215 fn name(&self) -> &str {
216 "crossref"
217 }
218
219 fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
220 matches!(ref_, Ref::Doi(_))
221 }
222
223 async fn fetch(
224 &self,
225 ref_: &Ref,
226 _profile: &CapabilityProfile,
227 ctx: &FetchContext,
228 ) -> Result<FetchResult, FetchError> {
229 let doi = match ref_ {
230 Ref::Doi(d) => d,
231 Ref::Arxiv(_) => {
232 return Err(FetchError::NotEligible {
233 source_key: "crossref".into(),
234 });
235 }
236 };
237
238 let _permit = ctx.rate_limiter.acquire(self.name()).await;
240
241 let url = self.request_url(doi)?;
245 let (body, final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
246
247 let envelope: CrossrefEnvelope =
251 serde_json::from_slice(&body).map_err(|e| FetchError::SourceSchema {
252 hint: format!("crossref returned non-JSON: {e}"),
253 })?;
254 if envelope.status != "ok" {
255 return Err(FetchError::SourceSchema {
256 hint: format!("crossref status = {}", envelope.status),
257 });
258 }
259
260 let canonical = ref_.promote(self.name(), None).digest_hex();
265 ctx.log.append(RowInput {
266 event: LogEvent::Fetch,
267 result: LogResult::Ok,
268 capability: Capability::Oa,
269 ref_: Some(doi.as_str()),
270 source: Some(self.name()),
271 error_code: None,
272 size_bytes: Some(body.len() as u64),
273 license: None,
274 store_path: None,
275 canonical_digest: Some(&canonical),
276 })?;
277
278 Ok(FetchResult {
279 source: self.name().to_string(),
280 license: "unknown".into(),
281 pdf_bytes: None,
284 final_url: Some(final_url),
285 metadata_json: Some(envelope.message),
286 })
287 }
288}
289
/// Top-level JSON object that `fetch` deserializes from a single-work response body.
#[derive(Debug, Deserialize)]
struct CrossrefEnvelope {
    /// Status string; `fetch` rejects the response unless it equals `"ok"`.
    status: String,
    /// Payload kept as untyped JSON and returned by `fetch` as `metadata_json`.
    message: serde_json::Value,
}
297
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    use std::sync::Arc;

    use camino::Utf8PathBuf;
    use tempfile::TempDir;
    use wiremock::matchers::{method, path};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    use crate::http::HttpClient;
    use crate::provenance::ProvenanceLog;
    use crate::rate_limiter::RateLimiter;
    use crate::{ArxivId, CapabilityProfile, Doi, RateLimits, Ref};

    /// File name of the provenance log inside each test's temporary directory.
    const LOG_FILE_NAME: &str = "test.jsonl";
    /// DOI used by the single-work tests.
    const EXAMPLE_DOI: &str = "10.1234/example";
    /// Request path the source is expected to hit for [`EXAMPLE_DOI`].
    const EXAMPLE_WORK_PATH: &str = "/works/10.1234/example";
    /// Minimal successful single-work response body.
    const EXAMPLE_OK_BODY: &str = r#"{"status":"ok","message":{"title":["Example"]}}"#;

    /// Builds a `FetchContext` whose HTTP client may talk plain HTTP to `wiremock_host`
    /// and whose provenance log lives in a fresh temporary directory.
    ///
    /// The `TempDir` is returned so the caller keeps the directory alive for the test.
    fn build_test_context(wiremock_host: &str) -> (TempDir, FetchContext) {
        let td = TempDir::new().expect("tempdir");
        let log_dir =
            Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
        let log_path = log_dir.join(LOG_FILE_NAME);

        let http = Arc::new(HttpClient::new_for_tests_allow_http(
            "crossref",
            wiremock_host,
        ));
        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
        let session_id = "01J0000000000000000000TEST".to_string();
        let log = Arc::new(
            ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
        );

        (
            td,
            FetchContext {
                http,
                rate_limiter,
                log,
                session_id,
                cache_root: None,
            },
        )
    }

    /// Extracts the host component of the mock server's URI.
    fn server_host(server: &MockServer) -> String {
        server
            .uri()
            .parse::<Url>()
            .expect("wiremock uri parses")
            .host_str()
            .expect("wiremock uri has host")
            .to_string()
    }

    /// Builds a `CrossrefSource` whose base URL is the mock server.
    fn crossref_for(server: &MockServer) -> CrossrefSource {
        let base = server.uri().parse::<Url>().expect("wiremock uri parses");
        CrossrefSource::with_base(base, "test@example.org".to_string())
    }

    /// Source, temp dir, and context wired to `server`.
    fn harness(server: &MockServer) -> (CrossrefSource, TempDir, FetchContext) {
        let (td, ctx) = build_test_context(&server_host(server));
        (crossref_for(server), td, ctx)
    }

    /// Registers a `GET route` mock on `server` that answers with `response`.
    async fn mount_get(server: &MockServer, route: &str, response: ResponseTemplate) {
        Mock::given(method("GET"))
            .and(path(route))
            .respond_with(response)
            .mount(server)
            .await;
    }

    fn clean_profile() -> CapabilityProfile {
        CapabilityProfile::from_env().expect("clean env")
    }

    fn example_doi_ref() -> Ref {
        Ref::Doi(Doi::parse(EXAMPLE_DOI).unwrap())
    }

    fn example_arxiv_ref() -> Ref {
        Ref::Arxiv(ArxivId::parse("2401.12345").unwrap())
    }

    #[test]
    fn crossref_can_serve_returns_true_for_doi() {
        let s = CrossrefSource::new("test@example.org".into());
        assert!(s.can_serve(&clean_profile(), &example_doi_ref()));
    }

    #[test]
    fn crossref_can_serve_returns_false_for_arxiv() {
        let s = CrossrefSource::new("test@example.org".into());
        assert!(!s.can_serve(&clean_profile(), &example_arxiv_ref()));
    }

    #[tokio::test]
    async fn crossref_fetch_returns_envelope_message() {
        let server = MockServer::start().await;
        mount_get(
            &server,
            EXAMPLE_WORK_PATH,
            ResponseTemplate::new(200).set_body_string(EXAMPLE_OK_BODY),
        )
        .await;

        let (s, _td, ctx) = harness(&server);
        let profile = clean_profile();
        let r = example_doi_ref();

        let res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");
        assert_eq!(res.source, "crossref");
        assert_eq!(
            res.metadata_json,
            Some(serde_json::json!({ "title": ["Example"] })),
        );
        assert!(res.pdf_bytes.is_none());
        assert!(res.final_url.is_some());
    }

    #[tokio::test]
    async fn crossref_fetch_with_arxiv_ref_errors_not_eligible() {
        // No server is started; the arXiv ref must be rejected before any request is made.
        let s = CrossrefSource::with_base(
            Url::parse("http://127.0.0.1:1/").unwrap(),
            "test@example.org".into(),
        );
        let (_td, ctx) = build_test_context("127.0.0.1");
        let profile = clean_profile();
        let r = example_arxiv_ref();

        let err = s.fetch(&r, &profile, &ctx).await.expect_err("not eligible");
        match err {
            FetchError::NotEligible { source_key } => {
                assert_eq!(source_key, "crossref");
            }
            other => panic!("expected NotEligible, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn crossref_fetch_writes_log_row() {
        let server = MockServer::start().await;
        mount_get(
            &server,
            EXAMPLE_WORK_PATH,
            ResponseTemplate::new(200).set_body_string(EXAMPLE_OK_BODY),
        )
        .await;

        // `td` is read below, so it is deliberately not underscore-prefixed here.
        let (s, td, ctx) = harness(&server);
        let profile = clean_profile();
        let r = example_doi_ref();

        let _res = s.fetch(&r, &profile, &ctx).await.expect("fetch ok");

        let log_path = td.path().join(LOG_FILE_NAME);
        let raw = std::fs::read_to_string(&log_path).expect("log file readable");
        let lines: Vec<&str> = raw.lines().filter(|l| !l.is_empty()).collect();
        assert_eq!(lines.len(), 1, "expected exactly one row, got {lines:?}");
        let row: serde_json::Value = serde_json::from_str(lines[0]).expect("row is valid JSON");
        assert_eq!(row["event"], "fetch");
        assert_eq!(row["result"], "ok");
        assert_eq!(row["source"], "crossref");
        assert_eq!(row["ref"], "10.1234/example");
    }

    #[tokio::test]
    async fn crossref_404_maps_to_http_error() {
        let server = MockServer::start().await;
        mount_get(&server, EXAMPLE_WORK_PATH, ResponseTemplate::new(404)).await;

        let (s, _td, ctx) = harness(&server);
        let profile = clean_profile();
        let r = example_doi_ref();

        let err = s.fetch(&r, &profile, &ctx).await.expect_err("404 errors");
        match err {
            FetchError::Http(_) => {}
            other => panic!("expected Http(_) on 404, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn crossref_non_ok_status_field_errors_source_schema() {
        let server = MockServer::start().await;
        mount_get(
            &server,
            EXAMPLE_WORK_PATH,
            ResponseTemplate::new(200).set_body_string(r#"{"status":"error","message":{}}"#),
        )
        .await;

        let (s, _td, ctx) = harness(&server);
        let profile = clean_profile();
        let r = example_doi_ref();

        let err = s
            .fetch(&r, &profile, &ctx)
            .await
            .expect_err("non-ok status errors");
        match err {
            FetchError::SourceSchema { hint } => {
                assert!(
                    hint.contains("status"),
                    "expected status mention in hint, got {hint}"
                );
            }
            other => panic!("expected SourceSchema, got {other:?}"),
        }
    }

    #[tokio::test]
    async fn test_resolve_citation_success() {
        let server = MockServer::start().await;
        let mock_body = serde_json::json!({
            "status": "ok",
            "message": {
                "items": [
                    {
                        "DOI": "10.1000/xyz123",
                        "title": ["Lars Onsager, Crystal Statistics. I. A Two-Dimensional Model with an Order-Disorder Transition"],
                        "author": [
                            {"family": "Onsager", "given": "Lars"}
                        ],
                        "issued": {
                            "date-parts": [[1944, 2, 1]]
                        },
                        "container-title": ["Physical Review"]
                    },
                    {
                        "DOI": "10.1000/unrelated",
                        "title": ["Some Unrelated Paper"],
                        "author": [
                            {"family": "Smith", "given": "John"}
                        ],
                        "issued": {
                            "date-parts": [[2020]]
                        }
                    }
                ]
            }
        });
        mount_get(
            &server,
            "/works",
            ResponseTemplate::new(200).set_body_json(mock_body),
        )
        .await;

        let (s, _td, ctx) = harness(&server);

        let candidates = s
            .resolve_citation("Onsager 1944", 2, &ctx)
            .await
            .expect("resolve ok");

        // The unrelated item shares no token with the query and must be filtered out.
        assert_eq!(candidates.len(), 1);
        let cand = &candidates[0];
        assert_eq!(cand.doi, "10.1000/xyz123");
        assert_eq!(cand.title, "Lars Onsager, Crystal Statistics. I. A Two-Dimensional Model with an Order-Disorder Transition");
        assert_eq!(cand.author, "Onsager, Lars");
        assert_eq!(cand.year, Some(1944));
        assert_eq!(cand.score, 1.0);
    }

    #[tokio::test]
    async fn resolve_citation_matches_non_first_authors() {
        let server = MockServer::start().await;
        let mock_body = serde_json::json!({
            "status": "ok",
            "message": {
                "items": [
                    {
                        "DOI": "10.1103/RevModPhys.80.395",
                        "title": ["Numerical renormalization group method for quantum impurity systems"],
                        "author": [
                            {"family": "Bulla", "given": "Ralf"},
                            {"family": "Costi", "given": "Theo A."},
                            {"family": "Pruschke", "given": "Thomas"}
                        ],
                        "issued": { "date-parts": [[2008, 4, 2]] },
                        "container-title": ["Reviews of Modern Physics"]
                    }
                ]
            }
        });
        mount_get(
            &server,
            "/works",
            ResponseTemplate::new(200).set_body_json(mock_body),
        )
        .await;

        let (s, _td, ctx) = harness(&server);

        // The query names only the second and third authors.
        let candidates = s
            .resolve_citation("Costi Pruschke 2008", 5, &ctx)
            .await
            .expect("resolve ok");

        assert_eq!(candidates.len(), 1);
        assert_eq!(candidates[0].doi, "10.1103/RevModPhys.80.395");
        assert_eq!(candidates[0].score, 1.0);
    }
}