1use std::collections::HashMap;
28use std::time::Duration;
29
30use chrono::{NaiveDate, Utc};
31use reqwest::{Client, StatusCode};
32use serde::Deserialize;
33use tokio::time::sleep;
34
35use crate::api_clients::SimpleEmbedder;
36use crate::ruvector_native::{Domain, SemanticVector};
37use crate::{FrameworkError, Result};
38
/// Pause, in milliseconds, slept before every paginated request.
const BIORXIV_RATE_LIMIT_MS: u64 = 1_000;

/// Upper bound on the number of retries `fetch_with_retry` performs for one URL.
const MAX_RETRIES: u32 = 3;

/// Base back-off in milliseconds; the actual delay is this value multiplied by
/// the current retry attempt number.
const RETRY_DELAY_MS: u64 = 2_000;

/// Embedding dimension used by the `new()` constructors.
const DEFAULT_EMBEDDING_DIM: usize = 384;

// NOTE(review): not referenced by any code visible in this file; presumably the
// number of records the API serves per page. Confirm before relying on it.
const DEFAULT_PAGE_SIZE: usize = 100;
45
46#[derive(Debug, Deserialize)]
52struct BiorxivApiResponse {
53 #[serde(default)]
55 count: Option<i64>,
56
57 #[serde(default)]
59 cursor: Option<i64>,
60
61 #[serde(default)]
63 collection: Vec<PreprintRecord>,
64}
65
66#[derive(Debug, Deserialize)]
68struct PreprintRecord {
69 doi: String,
71
72 title: String,
74
75 authors: String,
77
78 #[serde(default)]
80 author_corresponding: Option<String>,
81
82 #[serde(default)]
84 author_corresponding_institution: Option<String>,
85
86 date: String,
88
89 category: String,
91
92 #[serde(rename = "abstract")]
94 abstract_text: String,
95
96 #[serde(default)]
98 published: Option<String>,
99
100 #[serde(default)]
102 server: Option<String>,
103
104 #[serde(default)]
106 version: Option<String>,
107
108 #[serde(rename = "type", default)]
110 preprint_type: Option<String>,
111}
112
113pub struct BiorxivClient {
139 client: Client,
140 embedder: SimpleEmbedder,
141 base_url: String,
142}
143
144impl BiorxivClient {
145 pub fn new() -> Self {
152 Self::with_embedding_dim(DEFAULT_EMBEDDING_DIM)
153 }
154
155 pub fn with_embedding_dim(embedding_dim: usize) -> Self {
160 Self {
161 client: Client::builder()
162 .user_agent("RuVector-Discovery/1.0")
163 .timeout(Duration::from_secs(30))
164 .build()
165 .expect("Failed to create HTTP client"),
166 embedder: SimpleEmbedder::new(embedding_dim),
167 base_url: "https://api.biorxiv.org".to_string(),
168 }
169 }
170
171 pub async fn search_recent(&self, days: u64, limit: usize) -> Result<Vec<SemanticVector>> {
183 let end_date = Utc::now().date_naive();
184 let start_date = end_date - chrono::Duration::days(days as i64);
185
186 self.search_by_date_range(start_date, end_date, Some(limit)).await
187 }
188
189 pub async fn search_by_date_range(
205 &self,
206 start_date: NaiveDate,
207 end_date: NaiveDate,
208 limit: Option<usize>,
209 ) -> Result<Vec<SemanticVector>> {
210 let interval = format!("{}/{}", start_date, end_date);
211 self.fetch_with_pagination("biorxiv", &interval, limit).await
212 }
213
214 pub async fn search_by_category(
242 &self,
243 category: &str,
244 limit: usize,
245 ) -> Result<Vec<SemanticVector>> {
246 let end_date = Utc::now().date_naive();
248 let start_date = end_date - chrono::Duration::days(365);
249
250 let all_papers = self.search_by_date_range(start_date, end_date, Some(limit * 2)).await?;
251
252 Ok(all_papers
254 .into_iter()
255 .filter(|v| {
256 v.metadata
257 .get("category")
258 .map(|cat| cat.to_lowercase().contains(&category.to_lowercase()))
259 .unwrap_or(false)
260 })
261 .take(limit)
262 .collect())
263 }
264
265 async fn fetch_with_pagination(
267 &self,
268 server: &str,
269 interval: &str,
270 limit: Option<usize>,
271 ) -> Result<Vec<SemanticVector>> {
272 let mut all_vectors = Vec::new();
273 let mut cursor = 0;
274 let limit = limit.unwrap_or(usize::MAX);
275
276 loop {
277 if all_vectors.len() >= limit {
278 break;
279 }
280
281 let url = format!("{}/details/{}/{}/{}", self.base_url, server, interval, cursor);
282
283 sleep(Duration::from_millis(BIORXIV_RATE_LIMIT_MS)).await;
285
286 let response = self.fetch_with_retry(&url).await?;
287 let api_response: BiorxivApiResponse = response.json().await?;
288
289 if api_response.collection.is_empty() {
290 break;
291 }
292
293 for record in api_response.collection {
295 if all_vectors.len() >= limit {
296 break;
297 }
298
299 if let Some(vector) = self.record_to_vector(record, server) {
300 all_vectors.push(vector);
301 }
302 }
303
304 if let Some(new_cursor) = api_response.cursor {
306 if new_cursor as usize <= cursor {
307 break;
309 }
310 cursor = new_cursor as usize;
311 } else {
312 break;
313 }
314
315 if cursor > 10000 {
317 tracing::warn!("Pagination cursor exceeded 10000, stopping");
318 break;
319 }
320 }
321
322 Ok(all_vectors)
323 }
324
325 fn record_to_vector(&self, record: PreprintRecord, server: &str) -> Option<SemanticVector> {
327 let title = record.title.trim().replace('\n', " ");
329 let abstract_text = record.abstract_text.trim().replace('\n', " ");
330
331 let timestamp = NaiveDate::parse_from_str(&record.date, "%Y-%m-%d")
333 .ok()
334 .and_then(|d| d.and_hms_opt(0, 0, 0))
335 .map(|dt| dt.and_utc())
336 .unwrap_or_else(Utc::now);
337
338 let combined_text = format!("{} {}", title, abstract_text);
340 let embedding = self.embedder.embed_text(&combined_text);
341
342 let published_status = record.published.unwrap_or_else(|| "preprint".to_string());
344
345 let mut metadata = HashMap::new();
347 metadata.insert("doi".to_string(), record.doi.clone());
348 metadata.insert("title".to_string(), title);
349 metadata.insert("abstract".to_string(), abstract_text);
350 metadata.insert("authors".to_string(), record.authors);
351 metadata.insert("category".to_string(), record.category);
352 metadata.insert("server".to_string(), server.to_string());
353 metadata.insert("published_status".to_string(), published_status);
354
355 if let Some(corr) = record.author_corresponding {
356 metadata.insert("corresponding_author".to_string(), corr);
357 }
358 if let Some(inst) = record.author_corresponding_institution {
359 metadata.insert("institution".to_string(), inst);
360 }
361 if let Some(version) = record.version {
362 metadata.insert("version".to_string(), version);
363 }
364 if let Some(ptype) = record.preprint_type {
365 metadata.insert("type".to_string(), ptype);
366 }
367
368 metadata.insert("source".to_string(), "biorxiv".to_string());
369
370 Some(SemanticVector {
372 id: format!("doi:{}", record.doi),
373 embedding,
374 domain: Domain::Research,
375 timestamp,
376 metadata,
377 })
378 }
379
380 async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
382 let mut retries = 0;
383 loop {
384 match self.client.get(url).send().await {
385 Ok(response) => {
386 if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
387 retries += 1;
388 tracing::warn!("Rate limited, retrying in {}ms", RETRY_DELAY_MS * retries as u64);
389 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
390 continue;
391 }
392 if !response.status().is_success() {
393 return Err(FrameworkError::Network(
394 reqwest::Error::from(response.error_for_status().unwrap_err()),
395 ));
396 }
397 return Ok(response);
398 }
399 Err(_) if retries < MAX_RETRIES => {
400 retries += 1;
401 tracing::warn!("Request failed, retrying ({}/{})", retries, MAX_RETRIES);
402 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
403 }
404 Err(e) => return Err(FrameworkError::Network(e)),
405 }
406 }
407 }
408}
409
410impl Default for BiorxivClient {
411 fn default() -> Self {
412 Self::new()
413 }
414}
415
416pub struct MedrxivClient {
438 client: Client,
439 embedder: SimpleEmbedder,
440 base_url: String,
441}
442
443impl MedrxivClient {
444 pub fn new() -> Self {
451 Self::with_embedding_dim(DEFAULT_EMBEDDING_DIM)
452 }
453
454 pub fn with_embedding_dim(embedding_dim: usize) -> Self {
459 Self {
460 client: Client::builder()
461 .user_agent("RuVector-Discovery/1.0")
462 .timeout(Duration::from_secs(30))
463 .build()
464 .expect("Failed to create HTTP client"),
465 embedder: SimpleEmbedder::new(embedding_dim),
466 base_url: "https://api.biorxiv.org".to_string(),
467 }
468 }
469
470 pub async fn search_recent(&self, days: u64, limit: usize) -> Result<Vec<SemanticVector>> {
482 let end_date = Utc::now().date_naive();
483 let start_date = end_date - chrono::Duration::days(days as i64);
484
485 self.search_by_date_range(start_date, end_date, Some(limit)).await
486 }
487
488 pub async fn search_by_date_range(
504 &self,
505 start_date: NaiveDate,
506 end_date: NaiveDate,
507 limit: Option<usize>,
508 ) -> Result<Vec<SemanticVector>> {
509 let interval = format!("{}/{}", start_date, end_date);
510 self.fetch_with_pagination("medrxiv", &interval, limit).await
511 }
512
513 pub async fn search_covid(&self, limit: usize) -> Result<Vec<SemanticVector>> {
523 let end_date = Utc::now().date_naive();
525 let start_date = NaiveDate::from_ymd_opt(2020, 1, 1).expect("Valid date");
526
527 let all_papers = self.search_by_date_range(start_date, end_date, Some(limit * 2)).await?;
528
529 Ok(all_papers
531 .into_iter()
532 .filter(|v| {
533 let title = v.metadata.get("title").map(|s| s.to_lowercase()).unwrap_or_default();
534 let abstract_text = v.metadata.get("abstract").map(|s| s.to_lowercase()).unwrap_or_default();
535 let category = v.metadata.get("category").map(|s| s.to_lowercase()).unwrap_or_default();
536
537 let keywords = ["covid", "sars-cov-2", "coronavirus", "pandemic"];
538 keywords.iter().any(|kw| {
539 title.contains(kw) || abstract_text.contains(kw) || category.contains(kw)
540 })
541 })
542 .take(limit)
543 .collect())
544 }
545
546 pub async fn search_clinical(&self, limit: usize) -> Result<Vec<SemanticVector>> {
556 let end_date = Utc::now().date_naive();
558 let start_date = end_date - chrono::Duration::days(365);
559
560 let all_papers = self.search_by_date_range(start_date, end_date, Some(limit * 2)).await?;
561
562 Ok(all_papers
564 .into_iter()
565 .filter(|v| {
566 let title = v.metadata.get("title").map(|s| s.to_lowercase()).unwrap_or_default();
567 let abstract_text = v.metadata.get("abstract").map(|s| s.to_lowercase()).unwrap_or_default();
568 let category = v.metadata.get("category").map(|s| s.to_lowercase()).unwrap_or_default();
569
570 let keywords = ["clinical", "trial", "patient", "treatment", "therapy", "diagnosis"];
571 keywords.iter().any(|kw| {
572 title.contains(kw) || abstract_text.contains(kw) || category.contains(kw)
573 })
574 })
575 .take(limit)
576 .collect())
577 }
578
579 async fn fetch_with_pagination(
581 &self,
582 server: &str,
583 interval: &str,
584 limit: Option<usize>,
585 ) -> Result<Vec<SemanticVector>> {
586 let mut all_vectors = Vec::new();
587 let mut cursor = 0;
588 let limit = limit.unwrap_or(usize::MAX);
589
590 loop {
591 if all_vectors.len() >= limit {
592 break;
593 }
594
595 let url = format!("{}/details/{}/{}/{}", self.base_url, server, interval, cursor);
596
597 sleep(Duration::from_millis(BIORXIV_RATE_LIMIT_MS)).await;
599
600 let response = self.fetch_with_retry(&url).await?;
601 let api_response: BiorxivApiResponse = response.json().await?;
602
603 if api_response.collection.is_empty() {
604 break;
605 }
606
607 for record in api_response.collection {
609 if all_vectors.len() >= limit {
610 break;
611 }
612
613 if let Some(vector) = self.record_to_vector(record, server) {
614 all_vectors.push(vector);
615 }
616 }
617
618 if let Some(new_cursor) = api_response.cursor {
620 if new_cursor as usize <= cursor {
621 break;
623 }
624 cursor = new_cursor as usize;
625 } else {
626 break;
627 }
628
629 if cursor > 10000 {
631 tracing::warn!("Pagination cursor exceeded 10000, stopping");
632 break;
633 }
634 }
635
636 Ok(all_vectors)
637 }
638
639 fn record_to_vector(&self, record: PreprintRecord, server: &str) -> Option<SemanticVector> {
641 let title = record.title.trim().replace('\n', " ");
643 let abstract_text = record.abstract_text.trim().replace('\n', " ");
644
645 let timestamp = NaiveDate::parse_from_str(&record.date, "%Y-%m-%d")
647 .ok()
648 .and_then(|d| d.and_hms_opt(0, 0, 0))
649 .map(|dt| dt.and_utc())
650 .unwrap_or_else(Utc::now);
651
652 let combined_text = format!("{} {}", title, abstract_text);
654 let embedding = self.embedder.embed_text(&combined_text);
655
656 let published_status = record.published.unwrap_or_else(|| "preprint".to_string());
658
659 let mut metadata = HashMap::new();
661 metadata.insert("doi".to_string(), record.doi.clone());
662 metadata.insert("title".to_string(), title);
663 metadata.insert("abstract".to_string(), abstract_text);
664 metadata.insert("authors".to_string(), record.authors);
665 metadata.insert("category".to_string(), record.category);
666 metadata.insert("server".to_string(), server.to_string());
667 metadata.insert("published_status".to_string(), published_status);
668
669 if let Some(corr) = record.author_corresponding {
670 metadata.insert("corresponding_author".to_string(), corr);
671 }
672 if let Some(inst) = record.author_corresponding_institution {
673 metadata.insert("institution".to_string(), inst);
674 }
675 if let Some(version) = record.version {
676 metadata.insert("version".to_string(), version);
677 }
678 if let Some(ptype) = record.preprint_type {
679 metadata.insert("type".to_string(), ptype);
680 }
681
682 metadata.insert("source".to_string(), "medrxiv".to_string());
683
684 Some(SemanticVector {
686 id: format!("doi:{}", record.doi),
687 embedding,
688 domain: Domain::Medical,
689 timestamp,
690 metadata,
691 })
692 }
693
694 async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
696 let mut retries = 0;
697 loop {
698 match self.client.get(url).send().await {
699 Ok(response) => {
700 if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
701 retries += 1;
702 tracing::warn!("Rate limited, retrying in {}ms", RETRY_DELAY_MS * retries as u64);
703 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
704 continue;
705 }
706 if !response.status().is_success() {
707 return Err(FrameworkError::Network(
708 reqwest::Error::from(response.error_for_status().unwrap_err()),
709 ));
710 }
711 return Ok(response);
712 }
713 Err(_) if retries < MAX_RETRIES => {
714 retries += 1;
715 tracing::warn!("Request failed, retrying ({}/{})", retries, MAX_RETRIES);
716 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
717 }
718 Err(e) => return Err(FrameworkError::Network(e)),
719 }
720 }
721 }
722}
723
724impl Default for MedrxivClient {
725 fn default() -> Self {
726 Self::new()
727 }
728}
729
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a record with only the mandatory fields filled in; individual
    /// tests override what they need via struct-update syntax.
    fn minimal_record() -> PreprintRecord {
        PreprintRecord {
            doi: "10.1101/test".to_string(),
            title: "Test".to_string(),
            authors: "Author".to_string(),
            author_corresponding: None,
            author_corresponding_institution: None,
            date: "2024-01-15".to_string(),
            category: "Test".to_string(),
            abstract_text: "Abstract".to_string(),
            published: None,
            server: None,
            version: None,
            preprint_type: None,
        }
    }

    #[test]
    fn test_biorxiv_client_creation() {
        let client = BiorxivClient::new();
        assert_eq!(client.base_url, "https://api.biorxiv.org");
    }

    #[test]
    fn test_medrxiv_client_creation() {
        // medRxiv is served from the same host; only the path segment differs.
        let client = MedrxivClient::new();
        assert_eq!(client.base_url, "https://api.biorxiv.org");
    }

    #[test]
    fn test_custom_embedding_dim() {
        let client = BiorxivClient::with_embedding_dim(512);
        let embedding = client.embedder.embed_text("test");
        assert_eq!(embedding.len(), 512);
    }

    #[test]
    fn test_record_to_vector_biorxiv() {
        let client = BiorxivClient::new();

        let record = PreprintRecord {
            doi: "10.1101/2024.01.01.123456".to_string(),
            title: "Deep Learning for Neuroscience".to_string(),
            authors: "John Doe; Jane Smith".to_string(),
            author_corresponding: Some("John Doe".to_string()),
            author_corresponding_institution: Some("MIT".to_string()),
            date: "2024-01-15".to_string(),
            category: "Neuroscience".to_string(),
            abstract_text: "We propose a novel approach for analyzing neural data...".to_string(),
            server: Some("biorxiv".to_string()),
            version: Some("1".to_string()),
            preprint_type: Some("new results".to_string()),
            ..minimal_record()
        };

        let v = client
            .record_to_vector(record, "biorxiv")
            .expect("record_to_vector should produce a vector");

        assert_eq!(v.id, "doi:10.1101/2024.01.01.123456");
        assert_eq!(v.domain, Domain::Research);
        assert_eq!(v.metadata.get("doi").unwrap(), "10.1101/2024.01.01.123456");
        assert_eq!(v.metadata.get("title").unwrap(), "Deep Learning for Neuroscience");
        assert_eq!(v.metadata.get("authors").unwrap(), "John Doe; Jane Smith");
        assert_eq!(v.metadata.get("category").unwrap(), "Neuroscience");
        assert_eq!(v.metadata.get("server").unwrap(), "biorxiv");
        // No `published` value on the record means the status defaults to "preprint".
        assert_eq!(v.metadata.get("published_status").unwrap(), "preprint");
    }

    #[test]
    fn test_record_to_vector_medrxiv() {
        let client = MedrxivClient::new();

        let record = PreprintRecord {
            doi: "10.1101/2024.01.01.654321".to_string(),
            title: "COVID-19 Vaccine Efficacy Study".to_string(),
            authors: "Alice Johnson; Bob Williams".to_string(),
            author_corresponding: Some("Alice Johnson".to_string()),
            author_corresponding_institution: Some("Harvard Medical School".to_string()),
            date: "2024-03-20".to_string(),
            category: "Infectious Diseases".to_string(),
            abstract_text: "This study evaluates the efficacy of mRNA vaccines...".to_string(),
            published: Some("Nature Medicine".to_string()),
            server: Some("medrxiv".to_string()),
            version: Some("2".to_string()),
            preprint_type: Some("new results".to_string()),
        };

        let v = client
            .record_to_vector(record, "medrxiv")
            .expect("record_to_vector should produce a vector");

        assert_eq!(v.id, "doi:10.1101/2024.01.01.654321");
        assert_eq!(v.domain, Domain::Medical);
        assert_eq!(v.metadata.get("doi").unwrap(), "10.1101/2024.01.01.654321");
        assert_eq!(v.metadata.get("title").unwrap(), "COVID-19 Vaccine Efficacy Study");
        assert_eq!(v.metadata.get("category").unwrap(), "Infectious Diseases");
        assert_eq!(v.metadata.get("server").unwrap(), "medrxiv");
        assert_eq!(v.metadata.get("published_status").unwrap(), "Nature Medicine");
    }

    #[test]
    fn test_date_parsing() {
        let client = BiorxivClient::new();

        let vector = client
            .record_to_vector(minimal_record(), "biorxiv")
            .unwrap();

        // The record date "2024-01-15" must map to midnight UTC of that day.
        let expected_date = NaiveDate::from_ymd_opt(2024, 1, 15)
            .unwrap()
            .and_hms_opt(0, 0, 0)
            .unwrap()
            .and_utc();

        assert_eq!(vector.timestamp, expected_date);
    }

    // The tests below call the live API and are therefore ignored by default.

    #[tokio::test]
    #[ignore]
    async fn test_search_recent_integration() {
        let client = BiorxivClient::new();
        let results = client.search_recent(7, 5).await;
        assert!(results.is_ok());

        let vectors = results.unwrap();
        assert!(vectors.len() <= 5);

        if let Some(first) = vectors.first() {
            assert!(first.id.starts_with("doi:"));
            assert_eq!(first.domain, Domain::Research);
            assert!(first.metadata.contains_key("title"));
            assert!(first.metadata.contains_key("abstract"));
        }
    }

    #[tokio::test]
    #[ignore]
    async fn test_medrxiv_search_recent_integration() {
        let client = MedrxivClient::new();
        let results = client.search_recent(7, 5).await;
        assert!(results.is_ok());

        let vectors = results.unwrap();
        assert!(vectors.len() <= 5);

        if let Some(first) = vectors.first() {
            assert!(first.id.starts_with("doi:"));
            assert_eq!(first.domain, Domain::Medical);
            assert!(first.metadata.contains_key("title"));
            assert!(first.metadata.contains_key("server"));
        }
    }

    #[tokio::test]
    #[ignore]
    async fn test_search_covid_integration() {
        // Must mirror the predicate used by `search_covid`: any of these
        // keywords in the title, abstract or category. Checking a narrower set
        // would reject results the filter legitimately lets through.
        const COVID_KEYWORDS: [&str; 4] = ["covid", "sars-cov-2", "coronavirus", "pandemic"];
        const SEARCHED_FIELDS: [&str; 3] = ["title", "abstract", "category"];

        let client = MedrxivClient::new();
        let results = client.search_covid(10).await;
        assert!(results.is_ok());

        let vectors = results.unwrap();
        assert!(vectors.len() <= 10);

        for v in &vectors {
            let texts: Vec<String> = SEARCHED_FIELDS
                .iter()
                .filter_map(|field| v.metadata.get(*field))
                .map(|text| text.to_lowercase())
                .collect();

            let has_covid_keyword = COVID_KEYWORDS
                .iter()
                .any(|kw| texts.iter().any(|text| text.contains(*kw)));

            assert!(has_covid_keyword, "Expected COVID-related keywords in results");
        }
    }

    #[tokio::test]
    #[ignore]
    async fn test_search_by_category_integration() {
        let client = BiorxivClient::new();
        let results = client.search_by_category("neuroscience", 5).await;
        assert!(results.is_ok());

        let vectors = results.unwrap();
        assert!(vectors.len() <= 5);

        for v in &vectors {
            let category = v.metadata.get("category").unwrap().to_lowercase();
            assert!(category.contains("neuroscience"));
        }
    }
}