1use async_trait::async_trait;
39use bytes::Bytes;
40use quick_xml::events::Event;
41use quick_xml::Reader;
42use serde_json::{json, Value};
43use url::Url;
44
45use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
46use crate::source::{FetchContext, FetchError, FetchResult, Source};
47use crate::{ArxivId, CapabilityProfile, Ref};
48
/// Base URL that [`ArxivSource::new`] parses and stores; both the `/pdf/...`
/// path and the `/api/query` path are joined onto it.
const PDF_BASE: &str = "https://arxiv.org";
55
/// A [`Source`] implementation that downloads PDFs and Atom metadata for
/// arXiv identifiers from a configurable base URL.
#[derive(Clone, Debug)]
pub struct ArxivSource {
    /// Root URL that the PDF and metadata request paths are joined onto.
    base: Url,
}
63
64impl ArxivSource {
65 pub fn new() -> Self {
67 #[allow(clippy::expect_used)]
72 let base = Url::parse(PDF_BASE).expect("hard-coded base URL is valid");
73 Self { base }
74 }
75
76 pub fn with_base(base: Url) -> Self {
83 Self { base }
84 }
85
86 fn pdf_url(&self, id: &ArxivId) -> Result<Url, FetchError> {
98 let path = format!("/pdf/{}.pdf", id.as_str());
99 self.base.join(&path).map_err(|e| FetchError::SourceSchema {
100 hint: format!("arxiv URL construction failed: {e}"),
101 })
102 }
103
104 fn metadata_url(&self, id: &ArxivId) -> Result<Url, FetchError> {
117 let mut url = self
118 .base
119 .join("/api/query")
120 .map_err(|e| FetchError::SourceSchema {
121 hint: format!("arxiv metadata URL construction failed: {e}"),
122 })?;
123 url.query_pairs_mut().append_pair("id_list", id.as_str());
124 Ok(url)
125 }
126
127 pub async fn fetch_metadata_only(
143 &self,
144 id: &ArxivId,
145 ctx: &FetchContext,
146 ) -> Result<Value, FetchError> {
147 let _permit = ctx.rate_limiter.acquire(self.name()).await;
149
150 let url = self.metadata_url(id)?;
151 let (body, _final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
152 let metadata = parse_atom_feed(&body)?;
153
154 let canonical =
158 crate::CanonicalRef::new(crate::SourceType::Arxiv, id.as_str(), self.name(), None)
159 .digest_hex();
160 ctx.log.append(RowInput {
161 event: LogEvent::Fetch,
162 result: LogResult::Ok,
163 capability: Capability::Metadata,
168 ref_: Some(id.as_str()),
169 source: Some(self.name()),
170 error_code: None,
171 size_bytes: Some(body.len() as u64),
172 license: Some("arxiv-default"),
173 store_path: None,
174 canonical_digest: Some(&canonical),
175 })?;
176
177 Ok(metadata)
178 }
179}
180
181impl Default for ArxivSource {
182 fn default() -> Self {
183 Self::new()
184 }
185}
186
187#[async_trait]
188impl Source for ArxivSource {
189 fn name(&self) -> &str {
190 "arxiv"
191 }
192
193 fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
194 matches!(ref_, Ref::Arxiv(_))
195 }
196
197 async fn fetch(
198 &self,
199 ref_: &Ref,
200 _profile: &CapabilityProfile,
201 ctx: &FetchContext,
202 ) -> Result<FetchResult, FetchError> {
203 let id = match ref_ {
207 Ref::Arxiv(a) => a,
208 Ref::Doi(_) => {
209 return Err(FetchError::NotEligible {
210 source_key: "arxiv".into(),
211 });
212 }
213 };
214
215 let _permit = ctx.rate_limiter.acquire(self.name()).await;
218
219 let metadata_json = match self.metadata_url(id) {
231 Ok(meta_url) => match ctx.http.fetch_bytes(self.name(), meta_url).await {
232 Ok((bytes, _final)) => match parse_atom_feed(&bytes) {
233 Ok(v) => Some(v),
234 Err(e) => {
235 tracing::warn!(
236 arxiv_id = %id.as_str(),
237 error = %e,
238 "arxiv Atom feed parse failed; continuing with PDF-only fetch"
239 );
240 None
241 }
242 },
243 Err(e) => {
244 tracing::warn!(
245 arxiv_id = %id.as_str(),
246 error = %e,
247 "arxiv Atom feed fetch failed; continuing with PDF-only fetch"
248 );
249 None
250 }
251 },
252 Err(e) => {
253 tracing::warn!(
254 arxiv_id = %id.as_str(),
255 error = %e,
256 "arxiv metadata URL construction failed; continuing with PDF-only fetch"
257 );
258 None
259 }
260 };
261
262 let url = self.pdf_url(id)?;
264
265 let (body, final_url): (Bytes, Url) = ctx.http.fetch_pdf(self.name(), url).await?;
269
270 let canonical = ref_.promote(self.name(), None).digest_hex();
277 ctx.log.append(RowInput {
278 event: LogEvent::Fetch,
279 result: LogResult::Ok,
280 capability: Capability::Oa,
281 ref_: Some(id.as_str()),
282 source: Some(self.name()),
283 error_code: None,
284 size_bytes: Some(body.len() as u64),
285 license: Some("arxiv-default"),
291 store_path: None,
292 canonical_digest: Some(&canonical),
293 })?;
294
295 Ok(FetchResult {
296 source: self.name().to_string(),
297 license: "arxiv-default".into(),
298 pdf_bytes: Some(body),
299 final_url: Some(final_url),
300 metadata_json,
301 })
302 }
303}
304
305pub(crate) fn parse_atom_feed(xml: &[u8]) -> Result<Value, FetchError> {
341 let mut reader = Reader::from_reader(xml);
342 let config = reader.config_mut();
343 config.trim_text(true);
344
345 let mut in_entry = false;
348 let mut saw_entry = false;
349 let mut depth = 0_i32; let mut title: Option<String> = None;
354 let mut abstract_: Option<String> = None;
355 let mut published: Option<String> = None;
356 let mut updated: Option<String> = None;
357 let mut authors: Vec<String> = Vec::new();
358 let mut categories: Vec<String> = Vec::new();
359
360 #[derive(Clone, Copy)]
363 enum Target {
364 Title,
365 Summary,
366 Published,
367 Updated,
368 AuthorName,
369 }
370 let mut target: Option<Target> = None;
371 let mut in_author = false;
372 let mut buf: Vec<u8> = Vec::new();
373
374 loop {
375 match reader.read_event_into(&mut buf) {
376 Ok(Event::Start(e)) => {
377 let name_bytes = e.name();
378 let local = local_name(name_bytes.as_ref());
379 if !in_entry {
380 if local == b"entry" {
381 in_entry = true;
382 saw_entry = true;
383 depth = 0;
384 }
385 buf.clear();
386 continue;
387 }
388 depth += 1;
389 if depth == 1 {
391 match local {
392 b"title" => target = Some(Target::Title),
393 b"summary" => target = Some(Target::Summary),
394 b"published" => target = Some(Target::Published),
395 b"updated" => target = Some(Target::Updated),
396 b"author" => {
397 in_author = true;
398 authors.push(String::new());
399 }
400 _ => {}
401 }
402 } else if depth == 2 && in_author && local == b"name" {
403 target = Some(Target::AuthorName);
404 }
405 buf.clear();
406 }
407 Ok(Event::Empty(e)) => {
408 let name_bytes = e.name();
409 let local = local_name(name_bytes.as_ref());
410 if in_entry && depth == 0 && local == b"category" {
411 for attr in e.attributes().flatten() {
413 if attr.key.as_ref() == b"term" {
414 if let Ok(v) = attr.unescape_value() {
415 categories.push(v.into_owned());
416 }
417 }
418 }
419 }
420 buf.clear();
421 }
422 Ok(Event::Text(t)) => {
423 if let Some(tg) = target {
424 if let Ok(s) = t.unescape() {
425 let s = s.into_owned();
426 match tg {
427 Target::Title => title.get_or_insert_with(String::new).push_str(&s),
428 Target::Summary => {
429 abstract_.get_or_insert_with(String::new).push_str(&s)
430 }
431 Target::Published => {
432 published.get_or_insert_with(String::new).push_str(&s)
433 }
434 Target::Updated => updated.get_or_insert_with(String::new).push_str(&s),
435 Target::AuthorName => {
436 if let Some(last) = authors.last_mut() {
437 last.push_str(&s);
438 }
439 }
440 }
441 }
442 }
443 buf.clear();
444 }
445 Ok(Event::End(e)) => {
446 if !in_entry {
447 buf.clear();
448 continue;
449 }
450 let name_bytes = e.name();
451 let local = local_name(name_bytes.as_ref());
452 if depth == 0 && local == b"entry" {
453 break;
457 }
458 depth -= 1;
459 if depth == 0 {
460 if local == b"author" {
461 in_author = false;
462 if let Some(last) = authors.last() {
464 if last.is_empty() {
465 authors.pop();
466 }
467 }
468 }
469 target = None;
470 } else if depth == 1 && in_author && local == b"name" {
471 target = None;
472 }
473 buf.clear();
474 }
475 Ok(Event::Eof) => break,
476 Err(e) => {
477 return Err(FetchError::SourceSchema {
478 hint: format!("arxiv Atom XML parse error: {e}"),
479 });
480 }
481 _ => {
483 buf.clear();
484 }
485 }
486 }
487
488 if !saw_entry {
489 return Err(FetchError::SourceSchema {
490 hint: "arxiv Atom feed had no <entry> element (unknown id?)".into(),
491 });
492 }
493
494 let mut obj = serde_json::Map::new();
497 if let Some(t) = title {
498 let trimmed = t.trim().to_string();
499 if !trimmed.is_empty() {
500 obj.insert("title".into(), Value::String(trimmed));
501 }
502 }
503 if let Some(a) = abstract_ {
504 let trimmed = a.trim().to_string();
505 if !trimmed.is_empty() {
506 obj.insert("abstract".into(), Value::String(trimmed));
507 }
508 }
509 if !authors.is_empty() {
510 obj.insert(
511 "authors".into(),
512 Value::Array(authors.into_iter().map(Value::String).collect()),
513 );
514 }
515 if let Some(p) = published {
516 let trimmed = p.trim().to_string();
517 if !trimmed.is_empty() {
518 obj.insert("published".into(), Value::String(trimmed));
519 }
520 }
521 if let Some(u) = updated {
522 let trimmed = u.trim().to_string();
523 if !trimmed.is_empty() {
524 obj.insert("updated".into(), Value::String(trimmed));
525 }
526 }
527 if !categories.is_empty() {
528 obj.insert(
529 "categories".into(),
530 Value::Array(categories.into_iter().map(Value::String).collect()),
531 );
532 }
533 Ok(json!(obj))
534}
535
/// Returns the part of a qualified XML name after its last `:`.
///
/// A name without any `:` is returned unchanged; a name ending in `:` yields
/// an empty slice.
fn local_name(qname: &[u8]) -> &[u8] {
    // `rsplit` always yields at least one piece, and the first piece it yields
    // is the segment following the final separator.
    qname
        .rsplit(|&byte| byte == b':')
        .next()
        .unwrap_or(qname)
}
548
/// Tests for the arXiv source: reference eligibility, PDF fetching against a
/// mock HTTP server, provenance logging, and Atom feed parsing.
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    use std::sync::Arc;

    use camino::Utf8PathBuf;
    use tempfile::TempDir;
    use wiremock::matchers::{method, path};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    use crate::http::{HttpClient, HttpError};
    use crate::provenance::{LogRow, ProvenanceLog};
    use crate::rate_limiter::RateLimiter;
    use crate::source::FetchContext;
    use crate::{ArxivId, CapabilityProfile, Doi, RateLimits, Ref};

    const TEST_SESSION_ID: &str = "01J0000000000000000000TEST";

    /// Builds a `FetchContext` whose provenance log lives in a fresh temp dir.
    ///
    /// The returned `TempDir` must be kept alive for as long as the context is
    /// used; dropping it removes the directory holding the log file.
    fn build_test_context(wiremock_host: &str) -> (TempDir, FetchContext) {
        let td = TempDir::new().expect("tempdir");
        let log_dir =
            Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
        let log_path = log_dir.join("test.jsonl");

        let http = Arc::new(HttpClient::new_for_tests_allow_http("arxiv", wiremock_host));
        let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
        let session_id = TEST_SESSION_ID.to_string();
        let log = Arc::new(
            ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
        );

        (
            td,
            FetchContext {
                http,
                rate_limiter,
                log,
                session_id,
            },
        )
    }

    /// Returns the host component of the mock server's URI.
    fn server_host(server: &MockServer) -> String {
        let uri: Url = server.uri().parse().unwrap();
        uri.host_str().unwrap().to_string()
    }

    /// Builds a test context and an `ArxivSource` that both target `server`.
    fn source_for(server: &MockServer) -> (TempDir, FetchContext, ArxivSource) {
        let (td, ctx) = build_test_context(&server_host(server));
        let source = ArxivSource::with_base(server.uri().parse().unwrap());
        (td, ctx, source)
    }

    /// Mounts a mock answering `GET <route>` on `server` with `response`.
    async fn mount_get(server: &MockServer, route: &str, response: ResponseTemplate) {
        Mock::given(method("GET"))
            .and(path(route))
            .respond_with(response)
            .mount(server)
            .await;
    }

    /// Reads the JSONL log at `path`, skipping empty lines.
    fn read_rows(path: &camino::Utf8Path) -> Vec<LogRow> {
        let raw = std::fs::read_to_string(path).expect("read log");
        raw.lines()
            .filter(|l| !l.is_empty())
            .map(|l| serde_json::from_str::<LogRow>(l).expect("valid LogRow"))
            .collect()
    }

    fn profile() -> CapabilityProfile {
        CapabilityProfile::from_env().expect("Phase 0 stub profile")
    }

    #[test]
    fn arxiv_can_serve_returns_true_for_arxiv() {
        let s = ArxivSource::new();
        let id = ArxivId::parse("2401.12345").expect("valid id");
        let r = Ref::Arxiv(id);
        assert!(s.can_serve(&profile(), &r));
    }

    #[test]
    fn arxiv_can_serve_returns_false_for_doi() {
        let s = ArxivSource::new();
        let r = Ref::Doi(Doi("10.1234/example".to_string()));
        assert!(!s.can_serve(&profile(), &r));
    }

    #[tokio::test]
    async fn arxiv_fetch_new_style_id_returns_pdf_bytes() {
        let server = MockServer::start().await;
        let body = b"%PDF-1.7\n%fixture\n".to_vec();
        mount_get(
            &server,
            "/pdf/2401.12345.pdf",
            ResponseTemplate::new(200).set_body_bytes(body.clone()),
        )
        .await;

        let (_td, ctx, s) = source_for(&server);

        let id = ArxivId::parse("2401.12345").unwrap();
        let r = Ref::Arxiv(id);
        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");

        assert_eq!(res.source, "arxiv");
        assert_eq!(res.license, "arxiv-default");
        let bytes = res.pdf_bytes.expect("pdf bytes set");
        assert!(
            bytes.starts_with(b"%PDF-"),
            "expected PDF magic prefix, got {:?}",
            &bytes[..bytes.len().min(8)]
        );
        assert_eq!(&bytes[..], &body[..]);
    }

    #[tokio::test]
    async fn arxiv_fetch_old_style_id_returns_pdf_bytes() {
        let server = MockServer::start().await;
        let body = b"%PDF-1.4\n%old-style fixture\n".to_vec();
        mount_get(
            &server,
            "/pdf/cond-mat/9501001.pdf",
            ResponseTemplate::new(200).set_body_bytes(body.clone()),
        )
        .await;

        let (_td, ctx, s) = source_for(&server);

        let id = ArxivId::parse("cond-mat/9501001").expect("old-style id");
        let r = Ref::Arxiv(id);
        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");

        let bytes = res.pdf_bytes.expect("pdf bytes set");
        assert!(bytes.starts_with(b"%PDF-"));
        assert_eq!(&bytes[..], &body[..]);
    }

    #[tokio::test]
    async fn arxiv_fetch_with_doi_ref_errors_not_eligible() {
        let server = MockServer::start().await;
        let (_td, ctx, s) = source_for(&server);

        let r = Ref::Doi(Doi("10.1234/example".to_string()));
        let err = s
            .fetch(&r, &profile(), &ctx)
            .await
            .expect_err("doi ref must not be eligible");
        match err {
            FetchError::NotEligible { source_key } => {
                assert_eq!(source_key, "arxiv");
            }
            other => panic!("expected NotEligible, got {:?}", other),
        }
    }

    #[tokio::test]
    async fn arxiv_fetch_writes_log_row_with_arxiv_default_license() {
        let server = MockServer::start().await;
        let body = b"%PDF-1.7\n%log-row fixture\n".to_vec();
        mount_get(
            &server,
            "/pdf/2401.12345.pdf",
            ResponseTemplate::new(200).set_body_bytes(body.clone()),
        )
        .await;

        let (_td, ctx, s) = source_for(&server);
        let log_path = ctx.log.path().to_path_buf();

        let id = ArxivId::parse("2401.12345").unwrap();
        let r = Ref::Arxiv(id);
        let _ = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");

        let rows = read_rows(&log_path);
        assert_eq!(rows.len(), 1, "exactly one fetch row expected");
        let row = &rows[0];
        assert_eq!(row.source.as_deref(), Some("arxiv"));
        assert_eq!(row.ref_.as_deref(), Some("2401.12345"));
        assert_eq!(row.license.as_deref(), Some("arxiv-default"));
        assert_eq!(row.size_bytes, Some(body.len() as u64));
        assert!(row.error_code.is_none());
    }

    #[tokio::test]
    async fn arxiv_non_pdf_body_rejected() {
        let server = MockServer::start().await;
        mount_get(
            &server,
            "/pdf/2401.12345.pdf",
            ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec()),
        )
        .await;

        let (_td, ctx, s) = source_for(&server);

        let id = ArxivId::parse("2401.12345").unwrap();
        let r = Ref::Arxiv(id);
        let err = s
            .fetch(&r, &profile(), &ctx)
            .await
            .expect_err("non-pdf body must be rejected");
        match err {
            FetchError::Http(HttpError::NotAPdf { got }) => {
                assert_eq!(&got, b"<html");
            }
            other => panic!("expected FetchError::Http(NotAPdf), got {:?}", other),
        }
    }

    #[tokio::test]
    async fn arxiv_404_maps_to_http_error() {
        let server = MockServer::start().await;
        mount_get(&server, "/pdf/2401.99999.pdf", ResponseTemplate::new(404)).await;

        let (_td, ctx, s) = source_for(&server);

        let id = ArxivId::parse("2401.99999").unwrap();
        let r = Ref::Arxiv(id);
        let err = s
            .fetch(&r, &profile(), &ctx)
            .await
            .expect_err("404 must surface");
        match err {
            FetchError::Http(HttpError::HttpStatus { status, .. }) => {
                assert_eq!(status, 404);
            }
            other => panic!("expected FetchError::Http(HttpStatus), got {:?}", other),
        }
    }

    const SAMPLE_ATOM_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <id>http://arxiv.org/abs/2401.12345v1</id>
    <updated>2024-02-01T00:00:00Z</updated>
    <published>2024-01-15T00:00:00Z</published>
    <title>Example arXiv Paper Title</title>
    <summary>This is an example abstract.</summary>
    <author>
      <name>Jane Doe</name>
    </author>
    <author>
      <name>John Roe</name>
    </author>
    <category term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
    <category term="stat.ML" scheme="http://arxiv.org/schemas/atom"/>
  </entry>
</feed>"#;

    #[test]
    fn parse_atom_feed_extracts_all_fields() {
        let v = parse_atom_feed(SAMPLE_ATOM_FEED.as_bytes()).expect("Atom parses");
        assert_eq!(v["title"], serde_json::json!("Example arXiv Paper Title"));
        assert_eq!(
            v["abstract"],
            serde_json::json!("This is an example abstract.")
        );
        assert_eq!(v["authors"], serde_json::json!(["Jane Doe", "John Roe"]));
        assert_eq!(v["published"], serde_json::json!("2024-01-15T00:00:00Z"));
        assert_eq!(v["updated"], serde_json::json!("2024-02-01T00:00:00Z"));
        assert_eq!(v["categories"], serde_json::json!(["cs.LG", "stat.ML"]));
    }

    #[test]
    fn parse_atom_feed_empty_feed_errors_source_schema() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom"></feed>"#;
        let err = parse_atom_feed(xml.as_bytes()).expect_err("empty feed must error");
        match err {
            FetchError::SourceSchema { hint } => {
                assert!(
                    hint.contains("entry"),
                    "expected mention of <entry>; got {hint}"
                );
            }
            other => panic!("expected SourceSchema, got {other:?}"),
        }
    }

    #[test]
    fn parse_atom_feed_omits_missing_optional_fields() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <id>http://arxiv.org/abs/2401.00001v1</id>
    <title>Minimal Entry</title>
  </entry>
</feed>"#;
        let v = parse_atom_feed(xml.as_bytes()).expect("parses");
        let obj = v.as_object().expect("object");
        assert_eq!(
            obj.get("title").and_then(Value::as_str),
            Some("Minimal Entry")
        );
        assert!(
            !obj.contains_key("abstract"),
            "abstract should be omitted: {obj:?}"
        );
        assert!(
            !obj.contains_key("authors"),
            "authors should be omitted: {obj:?}"
        );
        assert!(
            !obj.contains_key("categories"),
            "categories should be omitted: {obj:?}"
        );
    }

    #[tokio::test]
    async fn arxiv_fetch_metadata_only_returns_atom_metadata() {
        let server = MockServer::start().await;
        mount_get(
            &server,
            "/api/query",
            ResponseTemplate::new(200).set_body_string(SAMPLE_ATOM_FEED),
        )
        .await;

        let (_td, ctx, s) = source_for(&server);
        let id = ArxivId::parse("2401.12345").unwrap();

        let meta = s
            .fetch_metadata_only(&id, &ctx)
            .await
            .expect("metadata_only ok");
        assert_eq!(
            meta["title"],
            serde_json::json!("Example arXiv Paper Title")
        );
        assert_eq!(meta["authors"], serde_json::json!(["Jane Doe", "John Roe"]));
    }

    #[tokio::test]
    async fn arxiv_fetch_populates_metadata_json_when_atom_endpoint_mocked() {
        let server = MockServer::start().await;
        mount_get(
            &server,
            "/api/query",
            ResponseTemplate::new(200).set_body_string(SAMPLE_ATOM_FEED),
        )
        .await;
        mount_get(
            &server,
            "/pdf/2401.12345.pdf",
            ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7\n%fix\n".to_vec()),
        )
        .await;

        let (_td, ctx, s) = source_for(&server);
        let id = ArxivId::parse("2401.12345").unwrap();
        let r = Ref::Arxiv(id);

        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
        let meta = res.metadata_json.expect("metadata_json populated");
        assert_eq!(
            meta["title"],
            serde_json::json!("Example arXiv Paper Title")
        );
    }

    #[tokio::test]
    async fn arxiv_fetch_atom_failure_falls_back_to_pdf_only() {
        let server = MockServer::start().await;
        // Only the PDF route is mounted, so the metadata request cannot succeed.
        mount_get(
            &server,
            "/pdf/2401.12345.pdf",
            ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7\nx".to_vec()),
        )
        .await;

        let (_td, ctx, s) = source_for(&server);
        let id = ArxivId::parse("2401.12345").unwrap();
        let r = Ref::Arxiv(id);

        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
        assert!(res.metadata_json.is_none());
        assert!(res.pdf_bytes.is_some());
    }
}