1use async_trait::async_trait;
39use bytes::Bytes;
40use quick_xml::events::Event;
41use quick_xml::Reader;
42use serde_json::{json, Value};
43use url::Url;
44
45use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
46use crate::source::{FetchContext, FetchError, FetchResult, Source};
47use crate::{ArxivId, CapabilityProfile, Ref};
48
/// Origin for PDF downloads. [`ArxivSource::new`] parses this into the base
/// URL onto which `pdf_url` joins `/pdf/<id>.pdf`.
const PDF_BASE: &str = "https://arxiv.org";
55
/// Origin for metadata lookups. [`ArxivSource::new`] parses this into the base
/// URL onto which `metadata_url` joins `/api/query`.
const META_BASE: &str = "https://export.arxiv.org";
62
/// A [`Source`] that fetches arXiv PDFs and Atom metadata.
///
/// Two base URLs are kept so the PDF request and the metadata request can be
/// sent to different origins.
#[derive(Clone, Debug)]
pub struct ArxivSource {
    /// Base URL onto which the PDF path (`/pdf/<id>.pdf`) is joined.
    base: Url,
    /// Base URL onto which the metadata path (`/api/query`) is joined.
    meta_base: Url,
}
72
73impl ArxivSource {
74 pub fn new() -> Self {
77 #[allow(clippy::expect_used)]
82 let base = Url::parse(PDF_BASE).expect("hard-coded PDF base URL is valid");
83 #[allow(clippy::expect_used)]
84 let meta_base = Url::parse(META_BASE).expect("hard-coded meta base URL is valid");
85 Self { base, meta_base }
86 }
87
88 pub fn with_base(base: Url) -> Self {
97 Self {
98 meta_base: base.clone(),
99 base,
100 }
101 }
102
103 fn pdf_url(&self, id: &ArxivId) -> Result<Url, FetchError> {
115 let path = format!("/pdf/{}.pdf", id.as_str());
116 self.base.join(&path).map_err(|e| FetchError::SourceSchema {
117 hint: format!("arxiv URL construction failed: {e}"),
118 })
119 }
120
121 fn metadata_url(&self, id: &ArxivId) -> Result<Url, FetchError> {
134 let mut url = self
135 .meta_base
136 .join("/api/query")
137 .map_err(|e| FetchError::SourceSchema {
138 hint: format!("arxiv metadata URL construction failed: {e}"),
139 })?;
140 url.query_pairs_mut().append_pair("id_list", id.as_str());
141 Ok(url)
142 }
143
144 pub async fn fetch_metadata_only(
160 &self,
161 id: &ArxivId,
162 ctx: &FetchContext,
163 ) -> Result<Value, FetchError> {
164 let _permit = ctx.rate_limiter.acquire(self.name()).await;
166
167 let url = self.metadata_url(id)?;
168 let (body, _final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
169 let metadata = parse_atom_feed(&body)?;
170
171 let canonical =
175 crate::CanonicalRef::new(crate::SourceType::Arxiv, id.as_str(), self.name(), None)
176 .digest_hex();
177 ctx.log.append(RowInput {
178 event: LogEvent::Fetch,
179 result: LogResult::Ok,
180 capability: Capability::Metadata,
185 ref_: Some(id.as_str()),
186 source: Some(self.name()),
187 error_code: None,
188 size_bytes: Some(body.len() as u64),
189 license: Some("arxiv-default"),
190 store_path: None,
191 canonical_digest: Some(&canonical),
192 })?;
193
194 Ok(metadata)
195 }
196}
197
198impl Default for ArxivSource {
199 fn default() -> Self {
200 Self::new()
201 }
202}
203
204#[async_trait]
205impl Source for ArxivSource {
    /// Returns the fixed key `"arxiv"`; this file passes it to the rate
    /// limiter, the HTTP client and the provenance log.
    fn name(&self) -> &str {
        "arxiv"
    }
209
210 fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
211 matches!(ref_, Ref::Arxiv(_))
212 }
213
214 async fn fetch(
215 &self,
216 ref_: &Ref,
217 _profile: &CapabilityProfile,
218 ctx: &FetchContext,
219 ) -> Result<FetchResult, FetchError> {
220 let id = match ref_ {
224 Ref::Arxiv(a) => a,
225 Ref::Doi(_) => {
226 return Err(FetchError::NotEligible {
227 source_key: "arxiv".into(),
228 });
229 }
230 };
231
232 let _permit = ctx.rate_limiter.acquire(self.name()).await;
235
236 let metadata_json = match self.metadata_url(id) {
248 Ok(meta_url) => match ctx.http.fetch_bytes(self.name(), meta_url).await {
249 Ok((bytes, _final)) => match parse_atom_feed(&bytes) {
250 Ok(v) => Some(v),
251 Err(e) => {
252 tracing::warn!(
253 arxiv_id = %id.as_str(),
254 error = %e,
255 "arxiv Atom feed parse failed; continuing with PDF-only fetch"
256 );
257 None
258 }
259 },
260 Err(e) => {
261 tracing::warn!(
262 arxiv_id = %id.as_str(),
263 error = %e,
264 "arxiv Atom feed fetch failed; continuing with PDF-only fetch"
265 );
266 None
267 }
268 },
269 Err(e) => {
270 tracing::warn!(
271 arxiv_id = %id.as_str(),
272 error = %e,
273 "arxiv metadata URL construction failed; continuing with PDF-only fetch"
274 );
275 None
276 }
277 };
278
279 let url = self.pdf_url(id)?;
281
282 let (body, final_url): (Bytes, Url) = ctx.http.fetch_pdf(self.name(), url).await?;
286
287 let canonical = ref_.promote(self.name(), None).digest_hex();
294 ctx.log.append(RowInput {
295 event: LogEvent::Fetch,
296 result: LogResult::Ok,
297 capability: Capability::Oa,
298 ref_: Some(id.as_str()),
299 source: Some(self.name()),
300 error_code: None,
301 size_bytes: Some(body.len() as u64),
302 license: Some("arxiv-default"),
308 store_path: None,
309 canonical_digest: Some(&canonical),
310 })?;
311
312 Ok(FetchResult {
313 source: self.name().to_string(),
314 license: "arxiv-default".into(),
315 pdf_bytes: Some(body),
316 final_url: Some(final_url),
317 metadata_json,
318 })
319 }
320}
321
322pub(crate) fn parse_atom_feed(xml: &[u8]) -> Result<Value, FetchError> {
358 let mut reader = Reader::from_reader(xml);
359 let config = reader.config_mut();
360 config.trim_text(true);
361
362 let mut in_entry = false;
365 let mut saw_entry = false;
366 let mut depth = 0_i32; let mut title: Option<String> = None;
371 let mut abstract_: Option<String> = None;
372 let mut published: Option<String> = None;
373 let mut updated: Option<String> = None;
374 let mut authors: Vec<String> = Vec::new();
375 let mut categories: Vec<String> = Vec::new();
376
377 #[derive(Clone, Copy)]
380 enum Target {
381 Title,
382 Summary,
383 Published,
384 Updated,
385 AuthorName,
386 }
387 let mut target: Option<Target> = None;
388 let mut in_author = false;
389 let mut buf: Vec<u8> = Vec::new();
390
391 loop {
392 match reader.read_event_into(&mut buf) {
393 Ok(Event::Start(e)) => {
394 let name_bytes = e.name();
395 let local = local_name(name_bytes.as_ref());
396 if !in_entry {
397 if local == b"entry" {
398 in_entry = true;
399 saw_entry = true;
400 depth = 0;
401 }
402 buf.clear();
403 continue;
404 }
405 depth += 1;
406 if depth == 1 {
408 match local {
409 b"title" => target = Some(Target::Title),
410 b"summary" => target = Some(Target::Summary),
411 b"published" => target = Some(Target::Published),
412 b"updated" => target = Some(Target::Updated),
413 b"author" => {
414 in_author = true;
415 authors.push(String::new());
416 }
417 _ => {}
418 }
419 } else if depth == 2 && in_author && local == b"name" {
420 target = Some(Target::AuthorName);
421 }
422 buf.clear();
423 }
424 Ok(Event::Empty(e)) => {
425 let name_bytes = e.name();
426 let local = local_name(name_bytes.as_ref());
427 if in_entry && depth == 0 && local == b"category" {
428 for attr in e.attributes().flatten() {
430 if attr.key.as_ref() == b"term" {
431 if let Ok(v) = attr.normalized_value(quick_xml::XmlVersion::Explicit1_0)
437 {
438 categories.push(v.into_owned());
439 }
440 }
441 }
442 }
443 buf.clear();
444 }
445 Ok(Event::Text(t)) => {
446 if let Some(tg) = target {
447 if let Some(s) = t.decode().ok().and_then(|raw| {
452 quick_xml::escape::unescape(&raw)
453 .ok()
454 .map(|c| c.into_owned())
455 }) {
456 match tg {
457 Target::Title => title.get_or_insert_with(String::new).push_str(&s),
458 Target::Summary => {
459 abstract_.get_or_insert_with(String::new).push_str(&s)
460 }
461 Target::Published => {
462 published.get_or_insert_with(String::new).push_str(&s)
463 }
464 Target::Updated => updated.get_or_insert_with(String::new).push_str(&s),
465 Target::AuthorName => {
466 if let Some(last) = authors.last_mut() {
467 last.push_str(&s);
468 }
469 }
470 }
471 }
472 }
473 buf.clear();
474 }
475 Ok(Event::End(e)) => {
476 if !in_entry {
477 buf.clear();
478 continue;
479 }
480 let name_bytes = e.name();
481 let local = local_name(name_bytes.as_ref());
482 if depth == 0 && local == b"entry" {
483 break;
487 }
488 depth -= 1;
489 if depth == 0 {
490 if local == b"author" {
491 in_author = false;
492 if let Some(last) = authors.last() {
494 if last.is_empty() {
495 authors.pop();
496 }
497 }
498 }
499 target = None;
500 } else if depth == 1 && in_author && local == b"name" {
501 target = None;
502 }
503 buf.clear();
504 }
505 Ok(Event::Eof) => break,
506 Err(e) => {
507 return Err(FetchError::SourceSchema {
508 hint: format!("arxiv Atom XML parse error: {e}"),
509 });
510 }
511 _ => {
513 buf.clear();
514 }
515 }
516 }
517
518 if !saw_entry {
519 return Err(FetchError::SourceSchema {
520 hint: "arxiv Atom feed had no <entry> element (unknown id?)".into(),
521 });
522 }
523
524 let mut obj = serde_json::Map::new();
527 if let Some(t) = title {
528 let trimmed = t.trim().to_string();
529 if !trimmed.is_empty() {
530 obj.insert("title".into(), Value::String(trimmed));
531 }
532 }
533 if let Some(a) = abstract_ {
534 let trimmed = a.trim().to_string();
535 if !trimmed.is_empty() {
536 obj.insert("abstract".into(), Value::String(trimmed));
537 }
538 }
539 if !authors.is_empty() {
540 obj.insert(
541 "authors".into(),
542 Value::Array(authors.into_iter().map(Value::String).collect()),
543 );
544 }
545 if let Some(p) = published {
546 let trimmed = p.trim().to_string();
547 if !trimmed.is_empty() {
548 obj.insert("published".into(), Value::String(trimmed));
549 }
550 }
551 if let Some(u) = updated {
552 let trimmed = u.trim().to_string();
553 if !trimmed.is_empty() {
554 obj.insert("updated".into(), Value::String(trimmed));
555 }
556 }
557 if !categories.is_empty() {
558 obj.insert(
559 "categories".into(),
560 Value::Array(categories.into_iter().map(Value::String).collect()),
561 );
562 }
563 Ok(json!(obj))
564}
565
/// Returns the part of `qname` after its last `:`, or all of `qname` when it
/// contains no colon.
fn local_name(qname: &[u8]) -> &[u8] {
    // `rsplit` always yields at least one piece; the first is the tail after
    // the last colon.
    qname.rsplit(|&b| b == b':').next().unwrap_or(qname)
}
578
579#[cfg(test)]
584#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
585mod tests {
586 use super::*;
587
588 use std::sync::Arc;
589
590 use camino::Utf8PathBuf;
591 use tempfile::TempDir;
592 use wiremock::matchers::{method, path};
593 use wiremock::{Mock, MockServer, ResponseTemplate};
594
595 use crate::http::{HttpClient, HttpError};
596 use crate::provenance::{LogRow, ProvenanceLog};
597 use crate::rate_limiter::RateLimiter;
598 use crate::source::FetchContext;
599 use crate::{ArxivId, CapabilityProfile, Doi, RateLimits, Ref};
600
    /// Fixed session id that `build_test_context` uses for both the
    /// provenance log and the `FetchContext`.
    const TEST_SESSION_ID: &str = "01J0000000000000000000TEST";
602
603 fn build_test_context(wiremock_host: &str) -> (TempDir, FetchContext) {
606 let td = TempDir::new().expect("tempdir");
607 let log_dir =
608 Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
609 let log_path = log_dir.join("test.jsonl");
610
611 let http = Arc::new(HttpClient::new_for_tests_allow_http("arxiv", wiremock_host));
612 let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
613 let session_id = TEST_SESSION_ID.to_string();
614 let log = Arc::new(
615 ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
616 );
617
618 (
619 td,
620 FetchContext {
621 http,
622 rate_limiter,
623 log,
624 session_id,
625 cache_root: None,
626 },
627 )
628 }
629
630 fn read_rows(path: &camino::Utf8Path) -> Vec<LogRow> {
631 let raw = std::fs::read_to_string(path).expect("read log");
632 raw.lines()
633 .filter(|l| !l.is_empty())
634 .map(|l| serde_json::from_str::<LogRow>(l).expect("valid LogRow"))
635 .collect()
636 }
637
    /// Returns the capability profile built by `CapabilityProfile::from_env`.
    fn profile() -> CapabilityProfile {
        CapabilityProfile::from_env().expect("Phase 0 stub profile")
    }
641
642 #[test]
647 fn arxiv_can_serve_returns_true_for_arxiv() {
648 let s = ArxivSource::new();
649 let id = ArxivId::parse("2401.12345").expect("valid id");
650 let r = Ref::Arxiv(id);
651 assert!(s.can_serve(&profile(), &r));
652 }
653
654 #[test]
655 fn production_metadata_url_uses_export_host_pdf_uses_arxiv() {
656 let s = ArxivSource::new();
660 let id = ArxivId::parse("1706.03762").expect("valid id");
661 let meta = s.metadata_url(&id).expect("meta url");
662 assert_eq!(meta.host_str(), Some("export.arxiv.org"));
663 assert_eq!(meta.path(), "/api/query");
664 let pdf = s.pdf_url(&id).expect("pdf url");
665 assert_eq!(pdf.host_str(), Some("arxiv.org"));
666 }
667
668 #[test]
669 fn with_base_shares_one_origin_for_both_legs() {
670 let s = ArxivSource::with_base("http://127.0.0.1:9999".parse().expect("url"));
673 let id = ArxivId::parse("2401.12345").expect("valid id");
674 assert_eq!(
675 s.metadata_url(&id).expect("meta").host_str(),
676 s.pdf_url(&id).expect("pdf").host_str()
677 );
678 }
679
680 #[test]
681 fn arxiv_can_serve_returns_false_for_doi() {
682 let s = ArxivSource::new();
683 let r = Ref::Doi(Doi("10.1234/example".to_string()));
684 assert!(!s.can_serve(&profile(), &r));
685 }
686
687 #[tokio::test]
692 async fn arxiv_fetch_new_style_id_returns_pdf_bytes() {
693 let server = MockServer::start().await;
694 let body = b"%PDF-1.7\n%fixture\n".to_vec();
695 Mock::given(method("GET"))
696 .and(path("/pdf/2401.12345.pdf"))
697 .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
698 .mount(&server)
699 .await;
700
701 let host = server
702 .uri()
703 .parse::<Url>()
704 .unwrap()
705 .host_str()
706 .unwrap()
707 .to_string();
708 let (_td, ctx) = build_test_context(&host);
709 let s = ArxivSource::with_base(server.uri().parse().unwrap());
710
711 let id = ArxivId::parse("2401.12345").unwrap();
712 let r = Ref::Arxiv(id);
713 let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
714
715 assert_eq!(res.source, "arxiv");
716 assert_eq!(res.license, "arxiv-default");
717 let bytes = res.pdf_bytes.expect("pdf bytes set");
718 assert!(
719 bytes.starts_with(b"%PDF-"),
720 "expected PDF magic prefix, got {:?}",
721 &bytes[..bytes.len().min(8)]
722 );
723 assert_eq!(&bytes[..], &body[..]);
724 }
725
726 #[tokio::test]
727 async fn arxiv_fetch_old_style_id_returns_pdf_bytes() {
728 let server = MockServer::start().await;
732 let body = b"%PDF-1.4\n%old-style fixture\n".to_vec();
733 Mock::given(method("GET"))
734 .and(path("/pdf/cond-mat/9501001.pdf"))
735 .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
736 .mount(&server)
737 .await;
738
739 let host = server
740 .uri()
741 .parse::<Url>()
742 .unwrap()
743 .host_str()
744 .unwrap()
745 .to_string();
746 let (_td, ctx) = build_test_context(&host);
747 let s = ArxivSource::with_base(server.uri().parse().unwrap());
748
749 let id = ArxivId::parse("cond-mat/9501001").expect("old-style id");
750 let r = Ref::Arxiv(id);
751 let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
752
753 let bytes = res.pdf_bytes.expect("pdf bytes set");
754 assert!(bytes.starts_with(b"%PDF-"));
755 assert_eq!(&bytes[..], &body[..]);
756 }
757
758 #[tokio::test]
763 async fn arxiv_fetch_with_doi_ref_errors_not_eligible() {
764 let server = MockServer::start().await;
765 let host = server
766 .uri()
767 .parse::<Url>()
768 .unwrap()
769 .host_str()
770 .unwrap()
771 .to_string();
772 let (_td, ctx) = build_test_context(&host);
773 let s = ArxivSource::with_base(server.uri().parse().unwrap());
774
775 let r = Ref::Doi(Doi("10.1234/example".to_string()));
776 let err = s
777 .fetch(&r, &profile(), &ctx)
778 .await
779 .expect_err("doi ref must not be eligible");
780 match err {
781 FetchError::NotEligible { source_key } => {
782 assert_eq!(source_key, "arxiv");
783 }
784 other => panic!("expected NotEligible, got {:?}", other),
785 }
786 }
787
788 #[tokio::test]
789 async fn arxiv_fetch_writes_log_row_with_arxiv_default_license() {
790 let server = MockServer::start().await;
791 let body = b"%PDF-1.7\n%log-row fixture\n".to_vec();
792 Mock::given(method("GET"))
793 .and(path("/pdf/2401.12345.pdf"))
794 .respond_with(ResponseTemplate::new(200).set_body_bytes(body.clone()))
795 .mount(&server)
796 .await;
797 let host = server
798 .uri()
799 .parse::<Url>()
800 .unwrap()
801 .host_str()
802 .unwrap()
803 .to_string();
804 let (_td, ctx) = build_test_context(&host);
805 let log_path = ctx.log.path().to_path_buf();
807 let s = ArxivSource::with_base(server.uri().parse().unwrap());
808
809 let id = ArxivId::parse("2401.12345").unwrap();
810 let r = Ref::Arxiv(id);
811 let _ = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
812
813 let rows = read_rows(&log_path);
814 assert_eq!(rows.len(), 1, "exactly one fetch row expected");
815 let row = &rows[0];
816 assert_eq!(row.source.as_deref(), Some("arxiv"));
817 assert_eq!(row.ref_.as_deref(), Some("2401.12345"));
818 assert_eq!(row.license.as_deref(), Some("arxiv-default"));
819 assert_eq!(row.size_bytes, Some(body.len() as u64));
820 assert!(row.error_code.is_none());
821 }
822
823 #[tokio::test]
824 async fn arxiv_non_pdf_body_rejected() {
825 let server = MockServer::start().await;
829 Mock::given(method("GET"))
830 .and(path("/pdf/2401.12345.pdf"))
831 .respond_with(
832 ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec()),
833 )
834 .mount(&server)
835 .await;
836 let host = server
837 .uri()
838 .parse::<Url>()
839 .unwrap()
840 .host_str()
841 .unwrap()
842 .to_string();
843 let (_td, ctx) = build_test_context(&host);
844 let s = ArxivSource::with_base(server.uri().parse().unwrap());
845
846 let id = ArxivId::parse("2401.12345").unwrap();
847 let r = Ref::Arxiv(id);
848 let err = s
849 .fetch(&r, &profile(), &ctx)
850 .await
851 .expect_err("non-pdf body must be rejected");
852 match err {
853 FetchError::Http(HttpError::NotAPdf { got }) => {
854 assert_eq!(&got, b"<html");
855 }
856 other => panic!("expected FetchError::Http(NotAPdf), got {:?}", other),
857 }
858 }
859
860 #[tokio::test]
861 async fn arxiv_404_maps_to_http_error() {
862 let server = MockServer::start().await;
863 Mock::given(method("GET"))
864 .and(path("/pdf/2401.99999.pdf"))
865 .respond_with(ResponseTemplate::new(404))
866 .mount(&server)
867 .await;
868 let host = server
869 .uri()
870 .parse::<Url>()
871 .unwrap()
872 .host_str()
873 .unwrap()
874 .to_string();
875 let (_td, ctx) = build_test_context(&host);
876 let s = ArxivSource::with_base(server.uri().parse().unwrap());
877
878 let id = ArxivId::parse("2401.99999").unwrap();
879 let r = Ref::Arxiv(id);
880 let err = s
881 .fetch(&r, &profile(), &ctx)
882 .await
883 .expect_err("404 must surface");
884 match err {
885 FetchError::Http(HttpError::HttpStatus { status, .. }) => {
886 assert_eq!(status, 404);
887 }
888 other => panic!("expected FetchError::Http(HttpStatus), got {:?}", other),
889 }
890 }
891
    /// Atom feed fixture with one entry: a title, a summary, published and
    /// updated timestamps, two authors and two categories.
    const SAMPLE_ATOM_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
 <entry>
 <id>http://arxiv.org/abs/2401.12345v1</id>
 <updated>2024-02-01T00:00:00Z</updated>
 <published>2024-01-15T00:00:00Z</published>
 <title>Example arXiv Paper Title</title>
 <summary>This is an example abstract.</summary>
 <author>
 <name>Jane Doe</name>
 </author>
 <author>
 <name>John Roe</name>
 </author>
 <category term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
 <category term="stat.ML" scheme="http://arxiv.org/schemas/atom"/>
 </entry>
</feed>"#;
916
917 #[test]
918 fn parse_atom_feed_extracts_all_fields() {
919 let v = parse_atom_feed(SAMPLE_ATOM_FEED.as_bytes()).expect("Atom parses");
920 assert_eq!(v["title"], serde_json::json!("Example arXiv Paper Title"));
921 assert_eq!(
922 v["abstract"],
923 serde_json::json!("This is an example abstract.")
924 );
925 assert_eq!(v["authors"], serde_json::json!(["Jane Doe", "John Roe"]));
926 assert_eq!(v["published"], serde_json::json!("2024-01-15T00:00:00Z"));
927 assert_eq!(v["updated"], serde_json::json!("2024-02-01T00:00:00Z"));
928 assert_eq!(v["categories"], serde_json::json!(["cs.LG", "stat.ML"]));
929 }
930
931 #[test]
932 fn parse_atom_feed_empty_feed_errors_source_schema() {
933 let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
934<feed xmlns="http://www.w3.org/2005/Atom"></feed>"#;
935 let err = parse_atom_feed(xml.as_bytes()).expect_err("empty feed must error");
936 match err {
937 FetchError::SourceSchema { hint } => {
938 assert!(
939 hint.contains("entry"),
940 "expected mention of <entry>; got {hint}"
941 );
942 }
943 other => panic!("expected SourceSchema, got {other:?}"),
944 }
945 }
946
947 #[test]
948 fn parse_atom_feed_omits_missing_optional_fields() {
949 let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
953<feed xmlns="http://www.w3.org/2005/Atom">
954 <entry>
955 <id>http://arxiv.org/abs/2401.00001v1</id>
956 <title>Minimal Entry</title>
957 </entry>
958</feed>"#;
959 let v = parse_atom_feed(xml.as_bytes()).expect("parses");
960 let obj = v.as_object().expect("object");
961 assert_eq!(
962 obj.get("title").and_then(Value::as_str),
963 Some("Minimal Entry")
964 );
965 assert!(
966 !obj.contains_key("abstract"),
967 "abstract should be omitted: {obj:?}"
968 );
969 assert!(
970 !obj.contains_key("authors"),
971 "authors should be omitted: {obj:?}"
972 );
973 assert!(
974 !obj.contains_key("categories"),
975 "categories should be omitted: {obj:?}"
976 );
977 }
978
979 #[tokio::test]
984 async fn arxiv_fetch_metadata_only_returns_atom_metadata() {
985 let server = MockServer::start().await;
986 Mock::given(method("GET"))
987 .and(path("/api/query"))
988 .respond_with(ResponseTemplate::new(200).set_body_string(SAMPLE_ATOM_FEED))
989 .mount(&server)
990 .await;
991 let host = server
992 .uri()
993 .parse::<Url>()
994 .unwrap()
995 .host_str()
996 .unwrap()
997 .to_string();
998 let (_td, ctx) = build_test_context(&host);
999 let s = ArxivSource::with_base(server.uri().parse().unwrap());
1000 let id = ArxivId::parse("2401.12345").unwrap();
1001
1002 let meta = s
1003 .fetch_metadata_only(&id, &ctx)
1004 .await
1005 .expect("metadata_only ok");
1006 assert_eq!(
1007 meta["title"],
1008 serde_json::json!("Example arXiv Paper Title")
1009 );
1010 assert_eq!(meta["authors"], serde_json::json!(["Jane Doe", "John Roe"]));
1011 }
1012
1013 #[tokio::test]
1014 async fn arxiv_fetch_populates_metadata_json_when_atom_endpoint_mocked() {
1015 let server = MockServer::start().await;
1018 Mock::given(method("GET"))
1019 .and(path("/api/query"))
1020 .respond_with(ResponseTemplate::new(200).set_body_string(SAMPLE_ATOM_FEED))
1021 .mount(&server)
1022 .await;
1023 Mock::given(method("GET"))
1024 .and(path("/pdf/2401.12345.pdf"))
1025 .respond_with(ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7\n%fix\n".to_vec()))
1026 .mount(&server)
1027 .await;
1028 let host = server
1029 .uri()
1030 .parse::<Url>()
1031 .unwrap()
1032 .host_str()
1033 .unwrap()
1034 .to_string();
1035 let (_td, ctx) = build_test_context(&host);
1036 let s = ArxivSource::with_base(server.uri().parse().unwrap());
1037 let id = ArxivId::parse("2401.12345").unwrap();
1038 let r = Ref::Arxiv(id);
1039
1040 let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
1041 let meta = res.metadata_json.expect("metadata_json populated");
1042 assert_eq!(
1043 meta["title"],
1044 serde_json::json!("Example arXiv Paper Title")
1045 );
1046 }
1047
1048 #[tokio::test]
1049 async fn arxiv_fetch_atom_failure_falls_back_to_pdf_only() {
1050 let server = MockServer::start().await;
1054 Mock::given(method("GET"))
1055 .and(path("/pdf/2401.12345.pdf"))
1056 .respond_with(ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7\nx".to_vec()))
1057 .mount(&server)
1058 .await;
1059 let host = server
1060 .uri()
1061 .parse::<Url>()
1062 .unwrap()
1063 .host_str()
1064 .unwrap()
1065 .to_string();
1066 let (_td, ctx) = build_test_context(&host);
1067 let s = ArxivSource::with_base(server.uri().parse().unwrap());
1068 let id = ArxivId::parse("2401.12345").unwrap();
1069 let r = Ref::Arxiv(id);
1070
1071 let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
1072 assert!(res.metadata_json.is_none());
1073 assert!(res.pdf_bytes.is_some());
1074 }
1075}