1use async_trait::async_trait;
39use bytes::Bytes;
40use quick_xml::events::Event;
41use quick_xml::Reader;
42use serde_json::{json, Value};
43use url::Url;
44
45use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
46use crate::source::{FetchContext, FetchError, FetchResult, Source};
47use crate::{ArxivId, CapabilityProfile, Ref};
48
/// Default base URL parsed by [`ArxivSource::new`].
///
/// Despite the name, this base is used for more than PDFs: both the
/// `/pdf/<id>.pdf` path and the `/api/query` metadata path are joined onto it.
const PDF_BASE: &str = "https://arxiv.org";
55
/// [`Source`] implementation that serves arXiv references.
///
/// Holds only the base URL that request paths are resolved against, so the
/// same code can be pointed at a different host via [`ArxivSource::with_base`].
#[derive(Clone, Debug)]
pub struct ArxivSource {
    /// Base URL that the `/pdf/...` and `/api/query` paths are joined onto.
    base: Url,
}
63
64impl ArxivSource {
65 pub fn new() -> Self {
67 #[allow(clippy::expect_used)]
72 let base = Url::parse(PDF_BASE).expect("hard-coded base URL is valid");
73 Self { base }
74 }
75
76 pub fn with_base(base: Url) -> Self {
83 Self { base }
84 }
85
86 fn pdf_url(&self, id: &ArxivId) -> Result<Url, FetchError> {
98 let path = format!("/pdf/{}.pdf", id.as_str());
99 self.base.join(&path).map_err(|e| FetchError::SourceSchema {
100 hint: format!("arxiv URL construction failed: {e}"),
101 })
102 }
103
104 fn metadata_url(&self, id: &ArxivId) -> Result<Url, FetchError> {
117 let mut url = self
118 .base
119 .join("/api/query")
120 .map_err(|e| FetchError::SourceSchema {
121 hint: format!("arxiv metadata URL construction failed: {e}"),
122 })?;
123 url.query_pairs_mut().append_pair("id_list", id.as_str());
124 Ok(url)
125 }
126
127 pub async fn fetch_metadata_only(
143 &self,
144 id: &ArxivId,
145 ctx: &FetchContext,
146 ) -> Result<Value, FetchError> {
147 let _permit = ctx.rate_limiter.acquire(self.name()).await;
149
150 let url = self.metadata_url(id)?;
151 let (body, _final_url) = ctx.http.fetch_bytes(self.name(), url).await?;
152 let metadata = parse_atom_feed(&body)?;
153
154 let canonical =
158 crate::CanonicalRef::new(crate::SourceType::Arxiv, id.as_str(), self.name(), None)
159 .digest_hex();
160 ctx.log.append(RowInput {
161 event: LogEvent::Fetch,
162 result: LogResult::Ok,
163 capability: Capability::Metadata,
168 ref_: Some(id.as_str()),
169 source: Some(self.name()),
170 error_code: None,
171 size_bytes: Some(body.len() as u64),
172 license: Some("arxiv-default"),
173 store_path: None,
174 canonical_digest: Some(&canonical),
175 })?;
176
177 Ok(metadata)
178 }
179}
180
181impl Default for ArxivSource {
182 fn default() -> Self {
183 Self::new()
184 }
185}
186
187#[async_trait]
188impl Source for ArxivSource {
189 fn name(&self) -> &str {
190 "arxiv"
191 }
192
193 fn can_serve(&self, _profile: &CapabilityProfile, ref_: &Ref) -> bool {
194 matches!(ref_, Ref::Arxiv(_))
195 }
196
197 async fn fetch(
198 &self,
199 ref_: &Ref,
200 _profile: &CapabilityProfile,
201 ctx: &FetchContext,
202 ) -> Result<FetchResult, FetchError> {
203 let id = match ref_ {
207 Ref::Arxiv(a) => a,
208 Ref::Doi(_) => {
209 return Err(FetchError::NotEligible {
210 source_key: "arxiv".into(),
211 });
212 }
213 };
214
215 let _permit = ctx.rate_limiter.acquire(self.name()).await;
218
219 let metadata_json = match self.metadata_url(id) {
231 Ok(meta_url) => match ctx.http.fetch_bytes(self.name(), meta_url).await {
232 Ok((bytes, _final)) => match parse_atom_feed(&bytes) {
233 Ok(v) => Some(v),
234 Err(e) => {
235 tracing::warn!(
236 arxiv_id = %id.as_str(),
237 error = %e,
238 "arxiv Atom feed parse failed; continuing with PDF-only fetch"
239 );
240 None
241 }
242 },
243 Err(e) => {
244 tracing::warn!(
245 arxiv_id = %id.as_str(),
246 error = %e,
247 "arxiv Atom feed fetch failed; continuing with PDF-only fetch"
248 );
249 None
250 }
251 },
252 Err(e) => {
253 tracing::warn!(
254 arxiv_id = %id.as_str(),
255 error = %e,
256 "arxiv metadata URL construction failed; continuing with PDF-only fetch"
257 );
258 None
259 }
260 };
261
262 let url = self.pdf_url(id)?;
264
265 let (body, final_url): (Bytes, Url) = ctx.http.fetch_pdf(self.name(), url).await?;
269
270 let canonical = ref_.promote(self.name(), None).digest_hex();
277 ctx.log.append(RowInput {
278 event: LogEvent::Fetch,
279 result: LogResult::Ok,
280 capability: Capability::Oa,
281 ref_: Some(id.as_str()),
282 source: Some(self.name()),
283 error_code: None,
284 size_bytes: Some(body.len() as u64),
285 license: Some("arxiv-default"),
291 store_path: None,
292 canonical_digest: Some(&canonical),
293 })?;
294
295 Ok(FetchResult {
296 source: self.name().to_string(),
297 license: "arxiv-default".into(),
298 pdf_bytes: Some(body),
299 final_url: Some(final_url),
300 metadata_json,
301 })
302 }
303}
304
305pub(crate) fn parse_atom_feed(xml: &[u8]) -> Result<Value, FetchError> {
341 let mut reader = Reader::from_reader(xml);
342 let config = reader.config_mut();
343 config.trim_text(true);
344
345 let mut in_entry = false;
348 let mut saw_entry = false;
349 let mut depth = 0_i32; let mut title: Option<String> = None;
354 let mut abstract_: Option<String> = None;
355 let mut published: Option<String> = None;
356 let mut updated: Option<String> = None;
357 let mut authors: Vec<String> = Vec::new();
358 let mut categories: Vec<String> = Vec::new();
359
360 #[derive(Clone, Copy)]
363 enum Target {
364 Title,
365 Summary,
366 Published,
367 Updated,
368 AuthorName,
369 }
370 let mut target: Option<Target> = None;
371 let mut in_author = false;
372 let mut buf: Vec<u8> = Vec::new();
373
374 loop {
375 match reader.read_event_into(&mut buf) {
376 Ok(Event::Start(e)) => {
377 let name_bytes = e.name();
378 let local = local_name(name_bytes.as_ref());
379 if !in_entry {
380 if local == b"entry" {
381 in_entry = true;
382 saw_entry = true;
383 depth = 0;
384 }
385 buf.clear();
386 continue;
387 }
388 depth += 1;
389 if depth == 1 {
391 match local {
392 b"title" => target = Some(Target::Title),
393 b"summary" => target = Some(Target::Summary),
394 b"published" => target = Some(Target::Published),
395 b"updated" => target = Some(Target::Updated),
396 b"author" => {
397 in_author = true;
398 authors.push(String::new());
399 }
400 _ => {}
401 }
402 } else if depth == 2 && in_author && local == b"name" {
403 target = Some(Target::AuthorName);
404 }
405 buf.clear();
406 }
407 Ok(Event::Empty(e)) => {
408 let name_bytes = e.name();
409 let local = local_name(name_bytes.as_ref());
410 if in_entry && depth == 0 && local == b"category" {
411 for attr in e.attributes().flatten() {
413 if attr.key.as_ref() == b"term" {
414 if let Ok(v) = attr.normalized_value(quick_xml::XmlVersion::Explicit1_0)
420 {
421 categories.push(v.into_owned());
422 }
423 }
424 }
425 }
426 buf.clear();
427 }
428 Ok(Event::Text(t)) => {
429 if let Some(tg) = target {
430 if let Some(s) = t.decode().ok().and_then(|raw| {
435 quick_xml::escape::unescape(&raw)
436 .ok()
437 .map(|c| c.into_owned())
438 }) {
439 match tg {
440 Target::Title => title.get_or_insert_with(String::new).push_str(&s),
441 Target::Summary => {
442 abstract_.get_or_insert_with(String::new).push_str(&s)
443 }
444 Target::Published => {
445 published.get_or_insert_with(String::new).push_str(&s)
446 }
447 Target::Updated => updated.get_or_insert_with(String::new).push_str(&s),
448 Target::AuthorName => {
449 if let Some(last) = authors.last_mut() {
450 last.push_str(&s);
451 }
452 }
453 }
454 }
455 }
456 buf.clear();
457 }
458 Ok(Event::End(e)) => {
459 if !in_entry {
460 buf.clear();
461 continue;
462 }
463 let name_bytes = e.name();
464 let local = local_name(name_bytes.as_ref());
465 if depth == 0 && local == b"entry" {
466 break;
470 }
471 depth -= 1;
472 if depth == 0 {
473 if local == b"author" {
474 in_author = false;
475 if let Some(last) = authors.last() {
477 if last.is_empty() {
478 authors.pop();
479 }
480 }
481 }
482 target = None;
483 } else if depth == 1 && in_author && local == b"name" {
484 target = None;
485 }
486 buf.clear();
487 }
488 Ok(Event::Eof) => break,
489 Err(e) => {
490 return Err(FetchError::SourceSchema {
491 hint: format!("arxiv Atom XML parse error: {e}"),
492 });
493 }
494 _ => {
496 buf.clear();
497 }
498 }
499 }
500
501 if !saw_entry {
502 return Err(FetchError::SourceSchema {
503 hint: "arxiv Atom feed had no <entry> element (unknown id?)".into(),
504 });
505 }
506
507 let mut obj = serde_json::Map::new();
510 if let Some(t) = title {
511 let trimmed = t.trim().to_string();
512 if !trimmed.is_empty() {
513 obj.insert("title".into(), Value::String(trimmed));
514 }
515 }
516 if let Some(a) = abstract_ {
517 let trimmed = a.trim().to_string();
518 if !trimmed.is_empty() {
519 obj.insert("abstract".into(), Value::String(trimmed));
520 }
521 }
522 if !authors.is_empty() {
523 obj.insert(
524 "authors".into(),
525 Value::Array(authors.into_iter().map(Value::String).collect()),
526 );
527 }
528 if let Some(p) = published {
529 let trimmed = p.trim().to_string();
530 if !trimmed.is_empty() {
531 obj.insert("published".into(), Value::String(trimmed));
532 }
533 }
534 if let Some(u) = updated {
535 let trimmed = u.trim().to_string();
536 if !trimmed.is_empty() {
537 obj.insert("updated".into(), Value::String(trimmed));
538 }
539 }
540 if !categories.is_empty() {
541 obj.insert(
542 "categories".into(),
543 Value::Array(categories.into_iter().map(Value::String).collect()),
544 );
545 }
546 Ok(json!(obj))
547}
548
/// Returns the part of a qualified XML name after its last `:`.
///
/// A name without a colon is returned unchanged; a name ending in `:` yields an
/// empty slice.
fn local_name(qname: &[u8]) -> &[u8] {
    // `rsplit` always yields at least one segment, and the first one it yields
    // is whatever follows the final colon (or the whole name if there is none).
    qname.rsplit(|&byte| byte == b':').next().unwrap_or(qname)
}
561
#[cfg(test)]
// Tests may unwrap/expect/panic freely: a failure there is the test result.
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    use std::sync::Arc;

    use camino::Utf8PathBuf;
    use tempfile::TempDir;
    use wiremock::matchers::{method, path};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    use crate::http::{HttpClient, HttpError};
    use crate::provenance::{LogRow, ProvenanceLog};
    use crate::rate_limiter::RateLimiter;
    use crate::source::FetchContext;
    use crate::{ArxivId, CapabilityProfile, Doi, RateLimits, Ref};

    const TEST_SESSION_ID: &str = "01J0000000000000000000TEST";

    /// Builds a `FetchContext` whose HTTP client is allowed to talk plain HTTP
    /// to `wiremock_host`, logging into a fresh temp directory. The `TempDir`
    /// is returned so the caller keeps the directory alive.
    fn build_test_context(wiremock_host: &str) -> (TempDir, FetchContext) {
        let td = TempDir::new().expect("tempdir");
        let log_path = Utf8PathBuf::try_from(td.path().to_path_buf())
            .expect("temp dir path must be UTF-8")
            .join("test.jsonl");
        let session_id = TEST_SESSION_ID.to_string();

        let ctx = FetchContext {
            http: Arc::new(HttpClient::new_for_tests_allow_http("arxiv", wiremock_host)),
            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
            log: Arc::new(
                ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
            ),
            session_id,
        };
        (td, ctx)
    }

    /// Context plus an `ArxivSource` both pointed at the given mock server.
    fn harness(server: &MockServer) -> (TempDir, FetchContext, ArxivSource) {
        let host = server
            .uri()
            .parse::<Url>()
            .unwrap()
            .host_str()
            .unwrap()
            .to_string();
        let (td, ctx) = build_test_context(&host);
        let source = ArxivSource::with_base(server.uri().parse().unwrap());
        (td, ctx, source)
    }

    /// Mounts a `GET <route>` mock answering with `response`.
    async fn mount_get(server: &MockServer, route: &str, response: ResponseTemplate) {
        Mock::given(method("GET"))
            .and(path(route))
            .respond_with(response)
            .mount(server)
            .await;
    }

    /// Reads every non-empty line of the JSONL log as a `LogRow`.
    fn read_rows(path: &camino::Utf8Path) -> Vec<LogRow> {
        std::fs::read_to_string(path)
            .expect("read log")
            .lines()
            .filter(|line| !line.is_empty())
            .map(|line| serde_json::from_str::<LogRow>(line).expect("valid LogRow"))
            .collect()
    }

    fn profile() -> CapabilityProfile {
        CapabilityProfile::from_env().expect("Phase 0 stub profile")
    }

    #[test]
    fn arxiv_can_serve_returns_true_for_arxiv() {
        let s = ArxivSource::new();
        let r = Ref::Arxiv(ArxivId::parse("2401.12345").expect("valid id"));
        assert!(s.can_serve(&profile(), &r));
    }

    #[test]
    fn arxiv_can_serve_returns_false_for_doi() {
        let s = ArxivSource::new();
        let r = Ref::Doi(Doi("10.1234/example".to_string()));
        assert!(!s.can_serve(&profile(), &r));
    }

    #[tokio::test]
    async fn arxiv_fetch_new_style_id_returns_pdf_bytes() {
        let server = MockServer::start().await;
        let body = b"%PDF-1.7\n%fixture\n".to_vec();
        mount_get(
            &server,
            "/pdf/2401.12345.pdf",
            ResponseTemplate::new(200).set_body_bytes(body.clone()),
        )
        .await;
        let (_td, ctx, s) = harness(&server);

        let r = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());
        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");

        assert_eq!(res.source, "arxiv");
        assert_eq!(res.license, "arxiv-default");
        let bytes = res.pdf_bytes.expect("pdf bytes set");
        assert!(
            bytes.starts_with(b"%PDF-"),
            "expected PDF magic prefix, got {:?}",
            &bytes[..bytes.len().min(8)]
        );
        assert_eq!(&bytes[..], &body[..]);
    }

    #[tokio::test]
    async fn arxiv_fetch_old_style_id_returns_pdf_bytes() {
        let server = MockServer::start().await;
        let body = b"%PDF-1.4\n%old-style fixture\n".to_vec();
        mount_get(
            &server,
            "/pdf/cond-mat/9501001.pdf",
            ResponseTemplate::new(200).set_body_bytes(body.clone()),
        )
        .await;
        let (_td, ctx, s) = harness(&server);

        let r = Ref::Arxiv(ArxivId::parse("cond-mat/9501001").expect("old-style id"));
        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");

        let bytes = res.pdf_bytes.expect("pdf bytes set");
        assert!(bytes.starts_with(b"%PDF-"));
        assert_eq!(&bytes[..], &body[..]);
    }

    #[tokio::test]
    async fn arxiv_fetch_with_doi_ref_errors_not_eligible() {
        let server = MockServer::start().await;
        let (_td, ctx, s) = harness(&server);

        let r = Ref::Doi(Doi("10.1234/example".to_string()));
        let err = s
            .fetch(&r, &profile(), &ctx)
            .await
            .expect_err("doi ref must not be eligible");
        match err {
            FetchError::NotEligible { source_key } => {
                assert_eq!(source_key, "arxiv");
            }
            other => panic!("expected NotEligible, got {:?}", other),
        }
    }

    #[tokio::test]
    async fn arxiv_fetch_writes_log_row_with_arxiv_default_license() {
        let server = MockServer::start().await;
        let body = b"%PDF-1.7\n%log-row fixture\n".to_vec();
        mount_get(
            &server,
            "/pdf/2401.12345.pdf",
            ResponseTemplate::new(200).set_body_bytes(body.clone()),
        )
        .await;
        let (_td, ctx, s) = harness(&server);
        let log_path = ctx.log.path().to_path_buf();

        let r = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());
        let _ = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");

        let rows = read_rows(&log_path);
        assert_eq!(rows.len(), 1, "exactly one fetch row expected");
        let row = &rows[0];
        assert_eq!(row.source.as_deref(), Some("arxiv"));
        assert_eq!(row.ref_.as_deref(), Some("2401.12345"));
        assert_eq!(row.license.as_deref(), Some("arxiv-default"));
        assert_eq!(row.size_bytes, Some(body.len() as u64));
        assert!(row.error_code.is_none());
    }

    #[tokio::test]
    async fn arxiv_non_pdf_body_rejected() {
        let server = MockServer::start().await;
        mount_get(
            &server,
            "/pdf/2401.12345.pdf",
            ResponseTemplate::new(200).set_body_bytes(b"<html>not a pdf</html>".to_vec()),
        )
        .await;
        let (_td, ctx, s) = harness(&server);

        let r = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());
        let err = s
            .fetch(&r, &profile(), &ctx)
            .await
            .expect_err("non-pdf body must be rejected");
        match err {
            FetchError::Http(HttpError::NotAPdf { got }) => {
                assert_eq!(&got, b"<html");
            }
            other => panic!("expected FetchError::Http(NotAPdf), got {:?}", other),
        }
    }

    #[tokio::test]
    async fn arxiv_404_maps_to_http_error() {
        let server = MockServer::start().await;
        mount_get(&server, "/pdf/2401.99999.pdf", ResponseTemplate::new(404)).await;
        let (_td, ctx, s) = harness(&server);

        let r = Ref::Arxiv(ArxivId::parse("2401.99999").unwrap());
        let err = s
            .fetch(&r, &profile(), &ctx)
            .await
            .expect_err("404 must surface");
        match err {
            FetchError::Http(HttpError::HttpStatus { status, .. }) => {
                assert_eq!(status, 404);
            }
            other => panic!("expected FetchError::Http(HttpStatus), got {:?}", other),
        }
    }

    /// Minimal single-entry feed in the shape the parser expects.
    const SAMPLE_ATOM_FEED: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <id>http://arxiv.org/abs/2401.12345v1</id>
    <updated>2024-02-01T00:00:00Z</updated>
    <published>2024-01-15T00:00:00Z</published>
    <title>Example arXiv Paper Title</title>
    <summary>This is an example abstract.</summary>
    <author>
      <name>Jane Doe</name>
    </author>
    <author>
      <name>John Roe</name>
    </author>
    <category term="cs.LG" scheme="http://arxiv.org/schemas/atom"/>
    <category term="stat.ML" scheme="http://arxiv.org/schemas/atom"/>
  </entry>
</feed>"#;

    #[test]
    fn parse_atom_feed_extracts_all_fields() {
        let v = parse_atom_feed(SAMPLE_ATOM_FEED.as_bytes()).expect("Atom parses");
        assert_eq!(v["title"], serde_json::json!("Example arXiv Paper Title"));
        assert_eq!(
            v["abstract"],
            serde_json::json!("This is an example abstract.")
        );
        assert_eq!(v["authors"], serde_json::json!(["Jane Doe", "John Roe"]));
        assert_eq!(v["published"], serde_json::json!("2024-01-15T00:00:00Z"));
        assert_eq!(v["updated"], serde_json::json!("2024-02-01T00:00:00Z"));
        assert_eq!(v["categories"], serde_json::json!(["cs.LG", "stat.ML"]));
    }

    #[test]
    fn parse_atom_feed_empty_feed_errors_source_schema() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom"></feed>"#;
        let err = parse_atom_feed(xml.as_bytes()).expect_err("empty feed must error");
        match err {
            FetchError::SourceSchema { hint } => {
                assert!(
                    hint.contains("entry"),
                    "expected mention of <entry>; got {hint}"
                );
            }
            other => panic!("expected SourceSchema, got {other:?}"),
        }
    }

    #[test]
    fn parse_atom_feed_omits_missing_optional_fields() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <id>http://arxiv.org/abs/2401.00001v1</id>
    <title>Minimal Entry</title>
  </entry>
</feed>"#;
        let v = parse_atom_feed(xml.as_bytes()).expect("parses");
        let obj = v.as_object().expect("object");
        assert_eq!(
            obj.get("title").and_then(Value::as_str),
            Some("Minimal Entry")
        );
        for key in ["abstract", "authors", "categories"].iter().copied() {
            assert!(!obj.contains_key(key), "{key} should be omitted: {obj:?}");
        }
    }

    #[tokio::test]
    async fn arxiv_fetch_metadata_only_returns_atom_metadata() {
        let server = MockServer::start().await;
        mount_get(
            &server,
            "/api/query",
            ResponseTemplate::new(200).set_body_string(SAMPLE_ATOM_FEED),
        )
        .await;
        let (_td, ctx, s) = harness(&server);
        let id = ArxivId::parse("2401.12345").unwrap();

        let meta = s
            .fetch_metadata_only(&id, &ctx)
            .await
            .expect("metadata_only ok");
        assert_eq!(
            meta["title"],
            serde_json::json!("Example arXiv Paper Title")
        );
        assert_eq!(meta["authors"], serde_json::json!(["Jane Doe", "John Roe"]));
    }

    #[tokio::test]
    async fn arxiv_fetch_populates_metadata_json_when_atom_endpoint_mocked() {
        let server = MockServer::start().await;
        mount_get(
            &server,
            "/api/query",
            ResponseTemplate::new(200).set_body_string(SAMPLE_ATOM_FEED),
        )
        .await;
        mount_get(
            &server,
            "/pdf/2401.12345.pdf",
            ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7\n%fix\n".to_vec()),
        )
        .await;
        let (_td, ctx, s) = harness(&server);
        let r = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());

        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
        let meta = res.metadata_json.expect("metadata_json populated");
        assert_eq!(
            meta["title"],
            serde_json::json!("Example arXiv Paper Title")
        );
    }

    #[tokio::test]
    async fn arxiv_fetch_atom_failure_falls_back_to_pdf_only() {
        let server = MockServer::start().await;
        // No /api/query mock is mounted, so the metadata request fails.
        mount_get(
            &server,
            "/pdf/2401.12345.pdf",
            ResponseTemplate::new(200).set_body_bytes(b"%PDF-1.7\nx".to_vec()),
        )
        .await;
        let (_td, ctx, s) = harness(&server);
        let r = Ref::Arxiv(ArxivId::parse("2401.12345").unwrap());

        let res = s.fetch(&r, &profile(), &ctx).await.expect("fetch ok");
        assert!(res.metadata_json.is_none());
        assert!(res.pdf_bytes.is_some());
    }
}