1use async_trait::async_trait;
39use bytes::Bytes;
40use quick_xml::events::Event;
41use quick_xml::Reader;
42use serde_json::{json, Value};
43use url::Url;
44
45use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
46use crate::source::{FetchContext, FetchError, FetchResult, Source};
47use crate::{ArxivId, CapabilityProfile, Ref};
48
/// Origin that PDF download URLs are resolved against when the source is
/// built with [`ArxivSource::new`].
const PDF_BASE: &str = "https://arxiv.org";

/// Origin that the `/api/query` metadata URL is resolved against when the
/// source is built with [`ArxivSource::new`].
const META_BASE: &str = "https://export.arxiv.org";
62
63#[derive(Clone, Debug)]
66pub struct ArxivSource {
67 base: Url,
69 meta_base: Url,
71}
72
73impl ArxivSource {
74 pub fn new() -> Self {
77 #[allow(clippy::expect_used)]
82 let base = Url::parse(PDF_BASE).expect("hard-coded PDF base URL is valid");
83 #[allow(clippy::expect_used)]
84 let meta_base = Url::parse(META_BASE).expect("hard-coded meta base URL is valid");
85 Self { base, meta_base }
86 }
87
88 pub fn with_base(base: Url) -> Self {
97 Self {
98 meta_base: base.clone(),
99 base,
100 }
101 }
102
103 fn pdf_url(&self, id: &ArxivId) -> Result<Url, FetchError> {
115 let path = format!("/pdf/{}.pdf", id.as_str());
116 self.base.join(&path).map_err(|e| FetchError::SourceSchema {
117 hint: format!("arxiv URL construction failed: {e}"),
118 })
119 }
120
121 fn metadata_url(&self, id: &ArxivId) -> Result<Url, FetchError> {
134 let mut url = self
135 .meta_base
136 .join("/api/query")
137 .map_err(|e| FetchError::SourceSchema {
138 hint: format!("arxiv metadata URL construction failed: {e}"),
139 })?;
140 url.query_pairs_mut().append_pair("id_list", id.as_str());
141 Ok(url)
142 }
143
144 pub async fn fetch_metadata_only(
160 &self,
161 id: &ArxivId,
162 ctx: &FetchContext,
163 ) -> Result<Value, FetchError> {
164 let _permit = ctx.rate_limiter.acquire(self.name()).await;
166
167 let url = self.metadata_url(id)?;
168 let (body, _final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
169 let metadata = parse_atom_feed(&body)?;
170
171 let canonical =
175 crate::CanonicalRef::new(crate::SourceType::Arxiv, id.as_str(), self.name(), None)
176 .digest_hex();
177 ctx.log.append(RowInput {
178 event: LogEvent::Fetch,
179 result: LogResult::Ok,
180 capability: Capability::Metadata,
185 ref_: Some(id.as_str()),
186 source: Some(self.name()),
187 error_code: None,
188 size_bytes: Some(body.len() as u64),
189 license: Some("arxiv-default"),
190 store_path: None,
191 canonical_digest: Some(&canonical),
192 })?;
193
194 Ok(metadata)
195 }
196}
197
198impl Default for ArxivSource {
199 fn default() -> Self {
200 Self::new()
201 }
202}
203
204#[async_trait]
205impl Source for ArxivSource {
206 fn name(&self) -> &str {
207 "arxiv"
208 }
209
210 fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
211 matches!(ref_, Ref::Arxiv(_))
212 }
213
214 async fn fetch(
215 &self,
216 ref_: &Ref,
217 _profile: &CapabilityProfile,
218 ctx: &FetchContext,
219 ) -> Result<FetchResult, FetchError> {
220 let id = match ref_ {
224 Ref::Arxiv(a) => a,
225 Ref::Doi(_) => {
226 return Err(FetchError::NotEligible {
227 source_key: "arxiv".into(),
228 });
229 }
230 };
231
232 let _permit = ctx.rate_limiter.acquire(self.name()).await;
235
236 let metadata_json = match self.metadata_url(id) {
248 Ok(meta_url) => match ctx.http.fetch_bytes(self.name(), meta_url).await {
249 Ok((bytes, _final)) => match parse_atom_feed(&bytes) {
250 Ok(v) => Some(v),
251 Err(e) => {
252 tracing::warn!(
253 arxiv_id = %id.as_str(),
254 error = %e,
255 "arxiv Atom feed parse failed; continuing with PDF-only fetch"
256 );
257 None
258 }
259 },
260 Err(e) => {
261 tracing::warn!(
262 arxiv_id = %id.as_str(),
263 error = %e,
264 "arxiv Atom feed fetch failed; continuing with PDF-only fetch"
265 );
266 None
267 }
268 },
269 Err(e) => {
270 tracing::warn!(
271 arxiv_id = %id.as_str(),
272 error = %e,
273 "arxiv metadata URL construction failed; continuing with PDF-only fetch"
274 );
275 None
276 }
277 };
278
279 let url = self.pdf_url(id)?;
281
282 let (body, final_url): (Bytes, Url) = ctx.http.fetch_pdf(self.name(), url).await?;
286
287 let canonical = ref_.promote(self.name(), None).digest_hex();
294 ctx.log.append(RowInput {
295 event: LogEvent::Fetch,
296 result: LogResult::Ok,
297 capability: Capability::Oa,
298 ref_: Some(id.as_str()),
299 source: Some(self.name()),
300 error_code: None,
301 size_bytes: Some(body.len() as u64),
302 license: Some("arxiv-default"),
308 store_path: None,
309 canonical_digest: Some(&canonical),
310 })?;
311
312 Ok(FetchResult {
313 source: self.name().to_string(),
314 license: "arxiv-default".into(),
315 pdf_bytes: Some(body),
316 final_url: Some(final_url),
317 metadata_json,
318 })
319 }
320}
321
322pub(crate) fn parse_atom_feed(xml: &[u8]) -> Result<Value, FetchError> {
361 let mut reader = Reader::from_reader(xml);
362 let config = reader.config_mut();
363 config.trim_text(true);
364
365 let mut in_entry = false;
368 let mut saw_entry = false;
369 let mut depth = 0_i32; let mut title: Option<String> = None;
374 let mut abstract_: Option<String> = None;
375 let mut published: Option<String> = None;
376 let mut updated: Option<String> = None;
377 let mut authors: Vec<String> = Vec::new();
378 let mut categories: Vec<String> = Vec::new();
379 let mut doi: Option<String> = None;
384 let mut journal_ref: Option<String> = None;
385
386 #[derive(Clone, Copy)]
389 enum Target {
390 Title,
391 Summary,
392 Published,
393 Updated,
394 AuthorName,
395 Doi,
396 JournalRef,
397 }
398 let mut target: Option<Target> = None;
399 let mut in_author = false;
400 let mut buf: Vec<u8> = Vec::new();
401
402 loop {
403 match reader.read_event_into(&mut buf) {
404 Ok(Event::Start(e)) => {
405 let name_bytes = e.name();
406 let local = local_name(name_bytes.as_ref());
407 if !in_entry {
408 if local == b"entry" {
409 in_entry = true;
410 saw_entry = true;
411 depth = 0;
412 }
413 buf.clear();
414 continue;
415 }
416 depth += 1;
417 if depth == 1 {
419 match local {
420 b"title" => target = Some(Target::Title),
421 b"summary" => target = Some(Target::Summary),
422 b"published" => target = Some(Target::Published),
423 b"updated" => target = Some(Target::Updated),
424 b"doi" => target = Some(Target::Doi),
428 b"journal_ref" => target = Some(Target::JournalRef),
429 b"author" => {
430 in_author = true;
431 authors.push(String::new());
432 }
433 _ => {}
434 }
435 } else if depth == 2 && in_author && local == b"name" {
436 target = Some(Target::AuthorName);
437 }
438 buf.clear();
439 }
440 Ok(Event::Empty(e)) => {
441 let name_bytes = e.name();
442 let local = local_name(name_bytes.as_ref());
443 if in_entry && depth == 0 && local == b"category" {
444 for attr in e.attributes().flatten() {
446 if attr.key.as_ref() == b"term" {
447 if let Ok(v) = attr.normalized_value(quick_xml::XmlVersion::Explicit1_0)
453 {
454 categories.push(v.into_owned());
455 }
456 }
457 }
458 }
459 buf.clear();
460 }
461 Ok(Event::Text(t)) => {
462 if let Some(tg) = target {
463 if let Some(s) = t.decode().ok().and_then(|raw| {
468 quick_xml::escape::unescape(&raw)
469 .ok()
470 .map(|c| c.into_owned())
471 }) {
472 match tg {
473 Target::Title => title.get_or_insert_with(String::new).push_str(&s),
474 Target::Summary => {
475 abstract_.get_or_insert_with(String::new).push_str(&s)
476 }
477 Target::Published => {
478 published.get_or_insert_with(String::new).push_str(&s)
479 }
480 Target::Updated => updated.get_or_insert_with(String::new).push_str(&s),
481 Target::Doi => doi.get_or_insert_with(String::new).push_str(&s),
482 Target::JournalRef => {
483 journal_ref.get_or_insert_with(String::new).push_str(&s)
484 }
485 Target::AuthorName => {
486 if let Some(last) = authors.last_mut() {
487 last.push_str(&s);
488 }
489 }
490 }
491 }
492 }
493 buf.clear();
494 }
495 Ok(Event::End(e)) => {
496 if !in_entry {
497 buf.clear();
498 continue;
499 }
500 let name_bytes = e.name();
501 let local = local_name(name_bytes.as_ref());
502 if depth == 0 && local == b"entry" {
503 break;
507 }
508 depth -= 1;
509 if depth == 0 {
510 if local == b"author" {
511 in_author = false;
512 if let Some(last) = authors.last() {
514 if last.is_empty() {
515 authors.pop();
516 }
517 }
518 }
519 target = None;
520 } else if depth == 1 && in_author && local == b"name" {
521 target = None;
522 }
523 buf.clear();
524 }
525 Ok(Event::Eof) => break,
526 Err(e) => {
527 return Err(FetchError::SourceSchema {
528 hint: format!("arxiv Atom XML parse error: {e}"),
529 });
530 }
531 _ => {
533 buf.clear();
534 }
535 }
536 }
537
538 if !saw_entry {
539 return Err(FetchError::NotFound {
544 hint: "arxiv Atom feed had no <entry> element (unknown id?)".into(),
545 });
546 }
547
548 let mut obj = serde_json::Map::new();
551 if let Some(t) = title {
552 let trimmed = t.trim().to_string();
553 if !trimmed.is_empty() {
554 obj.insert("title".into(), Value::String(trimmed));
555 }
556 }
557 if let Some(a) = abstract_ {
558 let trimmed = a.trim().to_string();
559 if !trimmed.is_empty() {
560 obj.insert("abstract".into(), Value::String(trimmed));
561 }
562 }
563 if !authors.is_empty() {
564 obj.insert(
565 "authors".into(),
566 Value::Array(authors.into_iter().map(Value::String).collect()),
567 );
568 }
569 if let Some(p) = published {
570 let trimmed = p.trim().to_string();
571 if !trimmed.is_empty() {
572 obj.insert("published".into(), Value::String(trimmed));
573 }
574 }
575 if let Some(u) = updated {
576 let trimmed = u.trim().to_string();
577 if !trimmed.is_empty() {
578 obj.insert("updated".into(), Value::String(trimmed));
579 }
580 }
581 if let Some(d) = doi {
592 let trimmed = d.trim().to_string();
593 if !trimmed.is_empty() {
594 obj.insert("doi".into(), Value::String(trimmed));
595 }
596 }
597 if let Some(j) = journal_ref {
598 let trimmed = j.trim().to_string();
599 if !trimmed.is_empty() {
600 obj.insert("journal_ref".into(), Value::String(trimmed));
601 }
602 }
603 if !categories.is_empty() {
604 obj.insert(
605 "categories".into(),
606 Value::Array(categories.into_iter().map(Value::String).collect()),
607 );
608 }
609 Ok(json!(obj))
610}
611
/// Returns the part of a qualified XML name after its last `:`.
///
/// A name without a colon is returned unchanged.
fn local_name(qname: &[u8]) -> &[u8] {
    // `rsplit` always yields at least one piece, and the first piece it
    // yields is the one after the final separator.
    qname
        .rsplit(|&byte| byte == b':')
        .next()
        .unwrap_or(qname)
}
624
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    use std::sync::Arc;

    use camino::Utf8PathBuf;
    use tempfile::TempDir;
    use wiremock::matchers::{method, path};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    use crate::http::{HttpClient, HttpError};
    use crate::provenance::{LogRow, ProvenanceLog};
    use crate::rate_limiter::RateLimiter;
    use crate::source::FetchContext;
    use crate::{ArxivId, CapabilityProfile, Doi, RateLimits, Ref};

    const TEST_SESSION_ID: &str = "01J0000000000000000000TEST";

    /// Builds a `FetchContext` whose HTTP client is set up for
    /// `wiremock_host` and whose provenance log lives in a fresh temp dir.
    /// The `TempDir` is returned so the caller can keep it alive.
    fn build_test_context(wiremock_host: &str) -> (TempDir, FetchContext) {
        let td = TempDir::new().expect("tempdir");
        let log_path = Utf8PathBuf::try_from(td.path().to_path_buf())
            .expect("temp dir path must be UTF-8")
            .join("test.jsonl");

        let session_id = TEST_SESSION_ID.to_string();
        let http = Arc::new(HttpClient::new_for_tests_allow_http("arxiv", wiremock_host));
        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
        let log = Arc::new(
            ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
        );

        let ctx = FetchContext {
            http,
            rate_limiter,
            log,
            session_id,
            cache_root: None,
        };
        (td, ctx)
    }

    /// Reads every non-empty line of the log file as a `LogRow`.
    fn read_rows(path: &camino::Utf8Path) -> Vec<LogRow> {
        let contents = std::fs::read_to_string(path).expect("read log");
        let mut rows = Vec::new();
        for line in contents.lines() {
            if line.is_empty() {
                continue;
            }
            rows.push(serde_json::from_str::<LogRow>(line).expect("valid LogRow"));
        }
        rows
    }

    fn profile() -> CapabilityProfile {
        CapabilityProfile::from_env().expect("Phase 0 stub profile")
    }

    /// Host component of the mock server's URI.
    fn host_of(server: &MockServer) -> String {
        let uri = server.uri().parse::<Url>().unwrap();
        uri.host_str().unwrap().to_string()
    }

    /// An `ArxivSource` with both legs pointed at the mock server.
    fn source_for(server: &MockServer) -> ArxivSource {
        ArxivSource::with_base(server.uri().parse().unwrap())
    }

    /// Mounts a `GET <route>` mock that answers with `response`.
    async fn mount_get(server: &MockServer, route: &str, response: ResponseTemplate) {
        Mock::given(method("GET"))
            .and(path(route))
            .respond_with(response)
            .mount(server)
            .await;
    }

    #[test]
    fn arxiv_can_serve_returns_true_for_arxiv() {
        let source = ArxivSource::new();
        let reference = Ref::Arxiv(ArxivId::parse("2401.12345").expect("valid id"));
        assert!(source.can_serve(&profile(), &reference));
    }

    #[test]
    fn production_metadata_url_uses_export_host_pdf_uses_arxiv() {
        let source = ArxivSource::new();
        let id = ArxivId::parse("1706.03762").expect("valid id");

        let meta = source.metadata_url(&id).expect("meta url");
        assert_eq!(meta.host_str(), Some("export.arxiv.org"));
        assert_eq!(meta.path(), "/api/query");

        let pdf = source.pdf_url(&id).expect("pdf url");
        assert_eq!(pdf.host_str(), Some("arxiv.org"));
    }

    #[test]
    fn with_base_shares_one_origin_for_both_legs() {
        let source = ArxivSource::with_base("http://127.0.0.1:9999".parse().expect("url"));
        let id = ArxivId::parse("2401.12345").expect("valid id");
        assert_eq!(
            source.metadata_url(&id).expect("meta").host_str(),
            source.pdf_url(&id).expect("pdf").host_str()
        );
    }

    #[test]
    fn arxiv_can_serve_returns_false_for_doi() {
        let source = ArxivSource::new();
        let reference = Ref::Doi(Doi("10.1234/example".to_string()));
        assert!(!source.can_serve(&profile(), &reference));
    }

    #[tokio::test]
    async fn arxiv_fetch_new_style_id_returns_pdf_bytes() {
        let server = MockServer::start().await;
        let body = b"%PDF-1.7\n%fixture\n".to_vec();
        mount_get(
            &server,
            "/pdf/2401.12345.pdf",
            ResponseTemplate::new(200).set_body_bytes(body.clone()),
        )
        .await;

        let (_td, ctx) = build_test_context(&host_of(&server));
        let source = source_for(&server);

        let reference = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());
        let res = source
            .fetch(&reference, &profile(), &ctx)
            .await
            .expect("fetch ok");

        assert_eq!(res.source, "arxiv");
        assert_eq!(res.license, "arxiv-default");
        let bytes = res.pdf_bytes.expect("pdf bytes set");
        assert!(
            bytes.starts_with(b"%PDF-"),
            "expected PDF magic prefix, got {:?}",
            &bytes[..bytes.len().min(8)]
        );
        assert_eq!(&bytes[..], &body[..]);
    }

    #[tokio::test]
    async fn arxiv_fetch_old_style_id_returns_pdf_bytes() {
        let server = MockServer::start().await;
        let body = b"%PDF-1.4\n%old-style fixture\n".to_vec();
        mount_get(
            &server,
            "/pdf/cond-mat/9501001.pdf",
            ResponseTemplate::new(200).set_body_bytes(body.clone()),
        )
        .await;

        let (_td, ctx) = build_test_context(&host_of(&server));
        let source = source_for(&server);

        let reference = Ref::Arxiv(ArxivId::parse("cond-mat/9501001").expect("old-style id"));
        let res = source
            .fetch(&reference, &profile(), &ctx)
            .await
            .expect("fetch ok");

        let bytes = res.pdf_bytes.expect("pdf bytes set");
        assert!(bytes.starts_with(b"%PDF-"));
        assert_eq!(&bytes[..], &body[..]);
    }

    #[tokio::test]
    async fn arxiv_fetch_with_doi_ref_errors_not_eligible() {
        let server = MockServer::start().await;
        let (_td, ctx) = build_test_context(&host_of(&server));
        let source = source_for(&server);

        let reference = Ref::Doi(Doi("10.1234/example".to_string()));
        let err = source
            .fetch(&reference, &profile(), &ctx)
            .await
            .expect_err("doi ref must not be eligible");
        match err {
            FetchError::NotEligible { source_key } => {
                assert_eq!(source_key, "arxiv");
            }
            other => panic!("expected NotEligible, got {:?}", other),
        }
    }

    #[tokio::test]
    async fn arxiv_fetch_writes_log_row_with_arxiv_default_license() {
        let server = MockServer::start().await;
        let body = b"%PDF-1.7\n%log-row fixture\n".to_vec();
        mount_get(
            &server,
            "/pdf/2401.12345.pdf",
            ResponseTemplate::new(200).set_body_bytes(body.clone()),
        )
        .await;

        let (_td, ctx) = build_test_context(&host_of(&server));
        let log_path = ctx.log.path().to_path_buf();
        let source = source_for(&server);

        let reference = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());
        let _ = source
            .fetch(&reference, &profile(), &ctx)
            .await
            .expect("fetch ok");

        let rows = read_rows(&log_path);
        assert_eq!(rows.len(), 1, "exactly one fetch row expected");
        let row = &rows[0];
        assert_eq!(row.source.as_deref(), Some("arxiv"));
        assert_eq!(row.ref_.as_deref(), Some("2401.12345"));
        assert_eq!(row.license.as_deref(), Some("arxiv-default"));
        assert_eq!(row.size_bytes, Some(body.len() as u64));
        assert!(row.error_code.is_none());
    }

    #[tokio::test]
    async fn arxiv_non_pdf_body_rejected() {
        let server = MockServer::start().await;
        mount_get(
            &server,
            "/pdf/2401.12345.pdf",
            ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec()),
        )
        .await;

        let (_td, ctx) = build_test_context(&host_of(&server));
        let source = source_for(&server);

        let reference = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());
        let err = source
            .fetch(&reference, &profile(), &ctx)
            .await
            .expect_err("non-pdf body must be rejected");
        match err {
            FetchError::Http(HttpError::NotAPdf { got }) => {
                assert_eq!(&got, b"<html");
            }
            other => panic!("expected FetchError::Http(NotAPdf), got {:?}", other),
        }
    }

    #[tokio::test]
    async fn arxiv_404_maps_to_http_error() {
        let server = MockServer::start().await;
        mount_get(&server, "/pdf/2401.99999.pdf", ResponseTemplate::new(404)).await;

        let (_td, ctx) = build_test_context(&host_of(&server));
        let source = source_for(&server);

        let reference = Ref::Arxiv(ArxivId::parse("2401.99999").unwrap());
        let err = source
            .fetch(&reference, &profile(), &ctx)
            .await
            .expect_err("404 must surface");
        match err {
            FetchError::Http(HttpError::HttpStatus { status, .. }) => {
                assert_eq!(status, 404);
            }
            other => panic!("expected FetchError::Http(HttpStatus), got {:?}", other),
        }
    }

    /// A feed with one entry carrying every field except `doi` and
    /// `journal_ref`.
    const SAMPLE_ATOM_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <id>http://arxiv.org/abs/2401.12345v1</id>
    <updated>2024-02-01T00:00:00Z</updated>
    <published>2024-01-15T00:00:00Z</published>
    <title>Example arXiv Paper Title</title>
    <summary>This is an example abstract.</summary>
    <author>
      <name>Jane Doe</name>
    </author>
    <author>
      <name>John Roe</name>
    </author>
    <category term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
    <category term="stat.ML" scheme="http://arxiv.org/schemas/atom"/>
  </entry>
</feed>"#;

    #[test]
    fn parse_atom_feed_extracts_all_fields() {
        let v = parse_atom_feed(SAMPLE_ATOM_FEED.as_bytes()).expect("Atom parses");
        assert_eq!(v["title"], serde_json::json!("Example arXiv Paper Title"));
        assert_eq!(
            v["abstract"],
            serde_json::json!("This is an example abstract.")
        );
        assert_eq!(v["authors"], serde_json::json!(["Jane Doe", "John Roe"]));
        assert_eq!(v["published"], serde_json::json!("2024-01-15T00:00:00Z"));
        assert_eq!(v["updated"], serde_json::json!("2024-02-01T00:00:00Z"));
        assert_eq!(v["categories"], serde_json::json!(["cs.LG", "stat.ML"]));
    }

    #[test]
    fn parse_atom_feed_empty_feed_is_not_found() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom"></feed>"#;
        let err = parse_atom_feed(xml.as_bytes()).expect_err("empty feed must error");
        match err {
            FetchError::NotFound { hint } => {
                assert!(
                    hint.contains("entry"),
                    "expected mention of <entry>; got {hint}"
                );
            }
            other => panic!("expected NotFound, got {other:?}"),
        }
    }

    #[test]
    fn parse_atom_feed_captures_published_doi_and_journal_ref() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xmlns:arxiv="http://arxiv.org/schemas/atom">
  <entry>
    <id>http://arxiv.org/abs/2101.54321v2</id>
    <title>Published Later</title>
    <arxiv:doi>10.1103/PhysRevLett.130.200601</arxiv:doi>
    <arxiv:journal_ref>Phys. Rev. Lett. 130, 200601 (2023)</arxiv:journal_ref>
  </entry>
</feed>"#;
        let v = parse_atom_feed(xml.as_bytes()).expect("parses");
        assert_eq!(
            v["doi"],
            serde_json::json!("10.1103/PhysRevLett.130.200601")
        );
        assert_eq!(
            v["journal_ref"],
            serde_json::json!("Phys. Rev. Lett. 130, 200601 (2023)")
        );
    }

    #[test]
    fn parse_atom_feed_omits_doi_when_absent() {
        let v = parse_atom_feed(SAMPLE_ATOM_FEED.as_bytes()).expect("parses");
        let obj = v.as_object().expect("object");
        assert!(!obj.contains_key("doi"), "doi must be omitted: {obj:?}");
        assert!(
            !obj.contains_key("journal_ref"),
            "journal_ref must be omitted: {obj:?}"
        );
    }

    #[test]
    fn parse_atom_feed_journal_ref_only_without_doi() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xmlns:arxiv="http://arxiv.org/schemas/atom">
  <entry>
    <id>http://arxiv.org/abs/2101.00001v1</id>
    <title>Journal Ref Only</title>
    <arxiv:journal_ref>J. Stat. Mech. (2021) 013203</arxiv:journal_ref>
  </entry>
</feed>"#;
        let v = parse_atom_feed(xml.as_bytes()).expect("parses");
        let obj = v.as_object().expect("object");
        assert!(!obj.contains_key("doi"), "doi must be omitted: {obj:?}");
        assert_eq!(
            obj.get("journal_ref").and_then(Value::as_str),
            Some("J. Stat. Mech. (2021) 013203")
        );
    }

    #[test]
    fn parse_atom_feed_whitespace_doi_is_omitted() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xmlns:arxiv="http://arxiv.org/schemas/atom">
  <entry>
    <id>http://arxiv.org/abs/2101.00002v1</id>
    <title>Blank DOI</title>
    <arxiv:doi> </arxiv:doi>
  </entry>
</feed>"#;
        let v = parse_atom_feed(xml.as_bytes()).expect("parses");
        assert!(
            !v.as_object().expect("object").contains_key("doi"),
            "whitespace-only doi must be omitted: {v:?}"
        );
    }

    #[test]
    fn parse_atom_feed_omits_missing_optional_fields() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <id>http://arxiv.org/abs/2401.00001v1</id>
    <title>Minimal Entry</title>
  </entry>
</feed>"#;
        let v = parse_atom_feed(xml.as_bytes()).expect("parses");
        let obj = v.as_object().expect("object");
        assert_eq!(
            obj.get("title").and_then(Value::as_str),
            Some("Minimal Entry")
        );
        assert!(
            !obj.contains_key("abstract"),
            "abstract should be omitted: {obj:?}"
        );
        assert!(
            !obj.contains_key("authors"),
            "authors should be omitted: {obj:?}"
        );
        assert!(
            !obj.contains_key("categories"),
            "categories should be omitted: {obj:?}"
        );
    }

    #[tokio::test]
    async fn arxiv_fetch_metadata_only_returns_atom_metadata() {
        let server = MockServer::start().await;
        mount_get(
            &server,
            "/api/query",
            ResponseTemplate::new(200).set_body_string(SAMPLE_ATOM_FEED),
        )
        .await;

        let (_td, ctx) = build_test_context(&host_of(&server));
        let source = source_for(&server);
        let id = ArxivId::parse("2401.12345").unwrap();

        let meta = source
            .fetch_metadata_only(&id, &ctx)
            .await
            .expect("metadata_only ok");
        assert_eq!(
            meta["title"],
            serde_json::json!("Example arXiv Paper Title")
        );
        assert_eq!(meta["authors"], serde_json::json!(["Jane Doe", "John Roe"]));
    }

    #[tokio::test]
    async fn arxiv_fetch_populates_metadata_json_when_atom_endpoint_mocked() {
        let server = MockServer::start().await;
        mount_get(
            &server,
            "/api/query",
            ResponseTemplate::new(200).set_body_string(SAMPLE_ATOM_FEED),
        )
        .await;
        mount_get(
            &server,
            "/pdf/2401.12345.pdf",
            ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7\n%fix\n".to_vec()),
        )
        .await;

        let (_td, ctx) = build_test_context(&host_of(&server));
        let source = source_for(&server);
        let reference = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());

        let res = source
            .fetch(&reference, &profile(), &ctx)
            .await
            .expect("fetch ok");
        let meta = res.metadata_json.expect("metadata_json populated");
        assert_eq!(
            meta["title"],
            serde_json::json!("Example arXiv Paper Title")
        );
    }

    #[tokio::test]
    async fn arxiv_fetch_atom_failure_falls_back_to_pdf_only() {
        let server = MockServer::start().await;
        // Only the PDF route is mounted; no `/api/query` mock exists.
        mount_get(
            &server,
            "/pdf/2401.12345.pdf",
            ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7\nx".to_vec()),
        )
        .await;

        let (_td, ctx) = build_test_context(&host_of(&server));
        let source = source_for(&server);
        let reference = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());

        let res = source
            .fetch(&reference, &profile(), &ctx)
            .await
            .expect("fetch ok");
        assert!(res.metadata_json.is_none());
        assert!(res.pdf_bytes.is_some());
    }
}