ruvector_data_framework/
biorxiv_client.rs

1//! bioRxiv and medRxiv Preprint API Integration
2//!
3//! This module provides async clients for fetching preprints from bioRxiv.org and medRxiv.org,
4//! converting responses to SemanticVector format for RuVector discovery.
5//!
6//! # bioRxiv/medRxiv API Details
7//! - Base URL: https://api.biorxiv.org/details/[server]/[interval]/[cursor]
8//! - Free access, no authentication required
9//! - Returns JSON with preprint metadata
10//! - Rate limit: ~1 request per second (enforced by client)
11//!
12//! # Example
13//! ```rust,ignore
14//! use ruvector_data_framework::biorxiv_client::{BiorxivClient, MedrxivClient};
15//!
16//! // Life sciences preprints
17//! let biorxiv = BiorxivClient::new();
18//! let recent = biorxiv.search_recent(7, 50).await?;
19//! let category_papers = biorxiv.search_by_category("neuroscience", 100).await?;
20//!
21//! // Medical preprints
22//! let medrxiv = MedrxivClient::new();
23//! let covid_papers = medrxiv.search_covid(100).await?;
24//! let clinical = medrxiv.search_clinical(50).await?;
25//! ```
26
27use std::collections::HashMap;
28use std::time::Duration;
29
30use chrono::{DateTime, NaiveDate, Utc};
31use reqwest::{Client, StatusCode};
32use serde::Deserialize;
33use tokio::time::sleep;
34
35use crate::api_clients::SimpleEmbedder;
36use crate::ruvector_native::{Domain, SemanticVector};
37use crate::{FrameworkError, Result};
38
/// Pause inserted before every page request, in milliseconds.
const BIORXIV_RATE_LIMIT_MS: u64 = 1000; // 1 second between requests (conservative)
/// Maximum number of retries per request, on top of the initial attempt.
const MAX_RETRIES: u32 = 3;
/// Base retry delay in milliseconds; it is multiplied by the retry number,
/// so successive retries wait 2s, 4s, 6s.
const RETRY_DELAY_MS: u64 = 2000;
/// Embedding dimension used by the `new()` constructors.
const DEFAULT_EMBEDDING_DIM: usize = 384;
/// NOTE(review): not referenced in the visible code — presumably the number
/// of records the server returns per page; confirm or remove.
const DEFAULT_PAGE_SIZE: usize = 100;
45
46// ============================================================================
47// bioRxiv/medRxiv API Response Structures
48// ============================================================================
49
50/// API response from bioRxiv/medRxiv
51#[derive(Debug, Deserialize)]
52struct BiorxivApiResponse {
53    /// Total number of results
54    #[serde(default)]
55    count: Option<i64>,
56
57    /// Cursor for pagination (total number of records seen)
58    #[serde(default)]
59    cursor: Option<i64>,
60
61    /// Array of preprint records
62    #[serde(default)]
63    collection: Vec<PreprintRecord>,
64}
65
66/// Individual preprint record
67#[derive(Debug, Deserialize)]
68struct PreprintRecord {
69    /// DOI identifier
70    doi: String,
71
72    /// Paper title
73    title: String,
74
75    /// Authors (semicolon-separated)
76    authors: String,
77
78    /// Author corresponding information
79    #[serde(default)]
80    author_corresponding: Option<String>,
81
82    /// Author corresponding institution
83    #[serde(default)]
84    author_corresponding_institution: Option<String>,
85
86    /// Preprint publication date (YYYY-MM-DD)
87    date: String,
88
89    /// Subject category
90    category: String,
91
92    /// Abstract text
93    #[serde(rename = "abstract")]
94    abstract_text: String,
95
96    /// Journal publication status (if accepted)
97    #[serde(default)]
98    published: Option<String>,
99
100    /// Server (biorxiv or medrxiv)
101    #[serde(default)]
102    server: Option<String>,
103
104    /// Version number
105    #[serde(default)]
106    version: Option<String>,
107
108    /// Type (e.g., "new results")
109    #[serde(rename = "type", default)]
110    preprint_type: Option<String>,
111}
112
113// ============================================================================
114// bioRxiv Client (Life Sciences Preprints)
115// ============================================================================
116
/// Client for the bioRxiv.org preprint API (life sciences).
///
/// Fetches preprints by recency or date range, optionally filters them by
/// subject category, and converts every record into a `SemanticVector`
/// tagged with `Domain::Research`.
///
/// # Categories
/// - neuroscience
/// - genomics
/// - bioinformatics
/// - cancer-biology
/// - immunology
/// - microbiology
/// - molecular-biology
/// - cell-biology
/// - biochemistry
/// - evolutionary-biology
/// - and many more...
///
/// # Rate Limiting
/// A one-second pause precedes every page request. Requests that fail at the
/// transport level or are answered with HTTP 429 are retried up to three
/// times with a growing delay.
pub struct BiorxivClient {
    /// HTTP client, built with a fixed user agent and a 30-second timeout.
    client: Client,
    /// Embedder applied to each record's title and abstract.
    embedder: SimpleEmbedder,
    /// API root URL, without a trailing slash.
    base_url: String,
}
143
144impl BiorxivClient {
145    /// Create a new bioRxiv API client
146    ///
147    /// # Example
148    /// ```rust,ignore
149    /// let client = BiorxivClient::new();
150    /// ```
151    pub fn new() -> Self {
152        Self::with_embedding_dim(DEFAULT_EMBEDDING_DIM)
153    }
154
155    /// Create a new bioRxiv API client with custom embedding dimension
156    ///
157    /// # Arguments
158    /// * `embedding_dim` - Dimension for text embeddings (default: 384)
159    pub fn with_embedding_dim(embedding_dim: usize) -> Self {
160        Self {
161            client: Client::builder()
162                .user_agent("RuVector-Discovery/1.0")
163                .timeout(Duration::from_secs(30))
164                .build()
165                .expect("Failed to create HTTP client"),
166            embedder: SimpleEmbedder::new(embedding_dim),
167            base_url: "https://api.biorxiv.org".to_string(),
168        }
169    }
170
171    /// Get recent preprints from the last N days
172    ///
173    /// # Arguments
174    /// * `days` - Number of days to look back (e.g., 7 for last week)
175    /// * `limit` - Maximum number of results to return
176    ///
177    /// # Example
178    /// ```rust,ignore
179    /// // Get preprints from the last 7 days
180    /// let recent = client.search_recent(7, 100).await?;
181    /// ```
182    pub async fn search_recent(&self, days: u64, limit: usize) -> Result<Vec<SemanticVector>> {
183        let end_date = Utc::now().date_naive();
184        let start_date = end_date - chrono::Duration::days(days as i64);
185
186        self.search_by_date_range(start_date, end_date, Some(limit)).await
187    }
188
189    /// Search preprints by date range
190    ///
191    /// # Arguments
192    /// * `start_date` - Start date (inclusive)
193    /// * `end_date` - End date (inclusive)
194    /// * `limit` - Optional maximum number of results
195    ///
196    /// # Example
197    /// ```rust,ignore
198    /// use chrono::NaiveDate;
199    ///
200    /// let start = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
201    /// let end = NaiveDate::from_ymd_opt(2024, 12, 31).unwrap();
202    /// let papers = client.search_by_date_range(start, end, Some(200)).await?;
203    /// ```
204    pub async fn search_by_date_range(
205        &self,
206        start_date: NaiveDate,
207        end_date: NaiveDate,
208        limit: Option<usize>,
209    ) -> Result<Vec<SemanticVector>> {
210        let interval = format!("{}/{}", start_date, end_date);
211        self.fetch_with_pagination("biorxiv", &interval, limit).await
212    }
213
214    /// Search preprints by subject category
215    ///
216    /// # Arguments
217    /// * `category` - Subject category (e.g., "neuroscience", "genomics")
218    /// * `limit` - Maximum number of results to return
219    ///
220    /// # Categories
221    /// - neuroscience
222    /// - genomics
223    /// - bioinformatics
224    /// - cancer-biology
225    /// - immunology
226    /// - microbiology
227    /// - molecular-biology
228    /// - cell-biology
229    /// - biochemistry
230    /// - evolutionary-biology
231    /// - ecology
232    /// - genetics
233    /// - developmental-biology
234    /// - synthetic-biology
235    /// - systems-biology
236    ///
237    /// # Example
238    /// ```rust,ignore
239    /// let neuroscience_papers = client.search_by_category("neuroscience", 100).await?;
240    /// ```
241    pub async fn search_by_category(
242        &self,
243        category: &str,
244        limit: usize,
245    ) -> Result<Vec<SemanticVector>> {
246        // Get recent papers (last 365 days) and filter by category
247        let end_date = Utc::now().date_naive();
248        let start_date = end_date - chrono::Duration::days(365);
249
250        let all_papers = self.search_by_date_range(start_date, end_date, Some(limit * 2)).await?;
251
252        // Filter by category
253        Ok(all_papers
254            .into_iter()
255            .filter(|v| {
256                v.metadata
257                    .get("category")
258                    .map(|cat| cat.to_lowercase().contains(&category.to_lowercase()))
259                    .unwrap_or(false)
260            })
261            .take(limit)
262            .collect())
263    }
264
265    /// Fetch preprints with pagination support
266    async fn fetch_with_pagination(
267        &self,
268        server: &str,
269        interval: &str,
270        limit: Option<usize>,
271    ) -> Result<Vec<SemanticVector>> {
272        let mut all_vectors = Vec::new();
273        let mut cursor = 0;
274        let limit = limit.unwrap_or(usize::MAX);
275
276        loop {
277            if all_vectors.len() >= limit {
278                break;
279            }
280
281            let url = format!("{}/details/{}/{}/{}", self.base_url, server, interval, cursor);
282
283            // Rate limiting
284            sleep(Duration::from_millis(BIORXIV_RATE_LIMIT_MS)).await;
285
286            let response = self.fetch_with_retry(&url).await?;
287            let api_response: BiorxivApiResponse = response.json().await?;
288
289            if api_response.collection.is_empty() {
290                break;
291            }
292
293            // Convert records to vectors
294            for record in api_response.collection {
295                if all_vectors.len() >= limit {
296                    break;
297                }
298
299                if let Some(vector) = self.record_to_vector(record, server) {
300                    all_vectors.push(vector);
301                }
302            }
303
304            // Update cursor for next page
305            if let Some(new_cursor) = api_response.cursor {
306                if new_cursor as usize <= cursor {
307                    // No more pages
308                    break;
309                }
310                cursor = new_cursor as usize;
311            } else {
312                break;
313            }
314
315            // Safety check: don't paginate indefinitely
316            if cursor > 10000 {
317                tracing::warn!("Pagination cursor exceeded 10000, stopping");
318                break;
319            }
320        }
321
322        Ok(all_vectors)
323    }
324
325    /// Convert preprint record to SemanticVector
326    fn record_to_vector(&self, record: PreprintRecord, server: &str) -> Option<SemanticVector> {
327        // Clean up title and abstract
328        let title = record.title.trim().replace('\n', " ");
329        let abstract_text = record.abstract_text.trim().replace('\n', " ");
330
331        // Parse publication date
332        let timestamp = NaiveDate::parse_from_str(&record.date, "%Y-%m-%d")
333            .ok()
334            .and_then(|d| d.and_hms_opt(0, 0, 0))
335            .map(|dt| dt.and_utc())
336            .unwrap_or_else(Utc::now);
337
338        // Generate embedding from title + abstract
339        let combined_text = format!("{} {}", title, abstract_text);
340        let embedding = self.embedder.embed_text(&combined_text);
341
342        // Determine publication status
343        let published_status = record.published.unwrap_or_else(|| "preprint".to_string());
344
345        // Build metadata
346        let mut metadata = HashMap::new();
347        metadata.insert("doi".to_string(), record.doi.clone());
348        metadata.insert("title".to_string(), title);
349        metadata.insert("abstract".to_string(), abstract_text);
350        metadata.insert("authors".to_string(), record.authors);
351        metadata.insert("category".to_string(), record.category);
352        metadata.insert("server".to_string(), server.to_string());
353        metadata.insert("published_status".to_string(), published_status);
354
355        if let Some(corr) = record.author_corresponding {
356            metadata.insert("corresponding_author".to_string(), corr);
357        }
358        if let Some(inst) = record.author_corresponding_institution {
359            metadata.insert("institution".to_string(), inst);
360        }
361        if let Some(version) = record.version {
362            metadata.insert("version".to_string(), version);
363        }
364        if let Some(ptype) = record.preprint_type {
365            metadata.insert("type".to_string(), ptype);
366        }
367
368        metadata.insert("source".to_string(), "biorxiv".to_string());
369
370        // bioRxiv papers are research domain
371        Some(SemanticVector {
372            id: format!("doi:{}", record.doi),
373            embedding,
374            domain: Domain::Research,
375            timestamp,
376            metadata,
377        })
378    }
379
380    /// Fetch with retry logic
381    async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
382        let mut retries = 0;
383        loop {
384            match self.client.get(url).send().await {
385                Ok(response) => {
386                    if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
387                        retries += 1;
388                        tracing::warn!("Rate limited, retrying in {}ms", RETRY_DELAY_MS * retries as u64);
389                        sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
390                        continue;
391                    }
392                    if !response.status().is_success() {
393                        return Err(FrameworkError::Network(
394                            reqwest::Error::from(response.error_for_status().unwrap_err()),
395                        ));
396                    }
397                    return Ok(response);
398                }
399                Err(_) if retries < MAX_RETRIES => {
400                    retries += 1;
401                    tracing::warn!("Request failed, retrying ({}/{})", retries, MAX_RETRIES);
402                    sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
403                }
404                Err(e) => return Err(FrameworkError::Network(e)),
405            }
406        }
407    }
408}
409
410impl Default for BiorxivClient {
411    fn default() -> Self {
412        Self::new()
413    }
414}
415
416// ============================================================================
417// medRxiv Client (Medical Preprints)
418// ============================================================================
419
/// Client for the medRxiv.org preprint API (medical and health sciences).
///
/// Fetches preprints by recency or date range, offers keyword-filtered
/// searches (COVID-19, clinical research), and converts every record into a
/// `SemanticVector` tagged with `Domain::Medical`.
///
/// # Categories
/// - Cardiovascular Medicine
/// - Infectious Diseases
/// - Oncology
/// - Public Health
/// - Epidemiology
/// - Psychiatry
/// - and many more...
///
/// # Rate Limiting
/// A one-second pause precedes every page request. Requests that fail at the
/// transport level or are answered with HTTP 429 are retried up to three
/// times with a growing delay.
pub struct MedrxivClient {
    /// HTTP client, built with a fixed user agent and a 30-second timeout.
    client: Client,
    /// Embedder applied to each record's title and abstract.
    embedder: SimpleEmbedder,
    /// API root URL, without a trailing slash (medRxiv is served from the
    /// bioRxiv API host).
    base_url: String,
}
442
443impl MedrxivClient {
444    /// Create a new medRxiv API client
445    ///
446    /// # Example
447    /// ```rust,ignore
448    /// let client = MedrxivClient::new();
449    /// ```
450    pub fn new() -> Self {
451        Self::with_embedding_dim(DEFAULT_EMBEDDING_DIM)
452    }
453
454    /// Create a new medRxiv API client with custom embedding dimension
455    ///
456    /// # Arguments
457    /// * `embedding_dim` - Dimension for text embeddings (default: 384)
458    pub fn with_embedding_dim(embedding_dim: usize) -> Self {
459        Self {
460            client: Client::builder()
461                .user_agent("RuVector-Discovery/1.0")
462                .timeout(Duration::from_secs(30))
463                .build()
464                .expect("Failed to create HTTP client"),
465            embedder: SimpleEmbedder::new(embedding_dim),
466            base_url: "https://api.biorxiv.org".to_string(),
467        }
468    }
469
470    /// Get recent preprints from the last N days
471    ///
472    /// # Arguments
473    /// * `days` - Number of days to look back (e.g., 7 for last week)
474    /// * `limit` - Maximum number of results to return
475    ///
476    /// # Example
477    /// ```rust,ignore
478    /// // Get medical preprints from the last 7 days
479    /// let recent = client.search_recent(7, 100).await?;
480    /// ```
481    pub async fn search_recent(&self, days: u64, limit: usize) -> Result<Vec<SemanticVector>> {
482        let end_date = Utc::now().date_naive();
483        let start_date = end_date - chrono::Duration::days(days as i64);
484
485        self.search_by_date_range(start_date, end_date, Some(limit)).await
486    }
487
488    /// Search preprints by date range
489    ///
490    /// # Arguments
491    /// * `start_date` - Start date (inclusive)
492    /// * `end_date` - End date (inclusive)
493    /// * `limit` - Optional maximum number of results
494    ///
495    /// # Example
496    /// ```rust,ignore
497    /// use chrono::NaiveDate;
498    ///
499    /// let start = NaiveDate::from_ymd_opt(2024, 1, 1).unwrap();
500    /// let end = NaiveDate::from_ymd_opt(2024, 12, 31).unwrap();
501    /// let papers = client.search_by_date_range(start, end, Some(200)).await?;
502    /// ```
503    pub async fn search_by_date_range(
504        &self,
505        start_date: NaiveDate,
506        end_date: NaiveDate,
507        limit: Option<usize>,
508    ) -> Result<Vec<SemanticVector>> {
509        let interval = format!("{}/{}", start_date, end_date);
510        self.fetch_with_pagination("medrxiv", &interval, limit).await
511    }
512
513    /// Search COVID-19 related preprints
514    ///
515    /// # Arguments
516    /// * `limit` - Maximum number of results to return
517    ///
518    /// # Example
519    /// ```rust,ignore
520    /// let covid_papers = client.search_covid(100).await?;
521    /// ```
522    pub async fn search_covid(&self, limit: usize) -> Result<Vec<SemanticVector>> {
523        // Search for COVID-19 related papers from 2020 onwards
524        let end_date = Utc::now().date_naive();
525        let start_date = NaiveDate::from_ymd_opt(2020, 1, 1).expect("Valid date");
526
527        let all_papers = self.search_by_date_range(start_date, end_date, Some(limit * 2)).await?;
528
529        // Filter by COVID-19 related keywords
530        Ok(all_papers
531            .into_iter()
532            .filter(|v| {
533                let title = v.metadata.get("title").map(|s| s.to_lowercase()).unwrap_or_default();
534                let abstract_text = v.metadata.get("abstract").map(|s| s.to_lowercase()).unwrap_or_default();
535                let category = v.metadata.get("category").map(|s| s.to_lowercase()).unwrap_or_default();
536
537                let keywords = ["covid", "sars-cov-2", "coronavirus", "pandemic"];
538                keywords.iter().any(|kw| {
539                    title.contains(kw) || abstract_text.contains(kw) || category.contains(kw)
540                })
541            })
542            .take(limit)
543            .collect())
544    }
545
546    /// Search clinical research preprints
547    ///
548    /// # Arguments
549    /// * `limit` - Maximum number of results to return
550    ///
551    /// # Example
552    /// ```rust,ignore
553    /// let clinical_papers = client.search_clinical(50).await?;
554    /// ```
555    pub async fn search_clinical(&self, limit: usize) -> Result<Vec<SemanticVector>> {
556        // Get recent papers and filter for clinical research
557        let end_date = Utc::now().date_naive();
558        let start_date = end_date - chrono::Duration::days(365);
559
560        let all_papers = self.search_by_date_range(start_date, end_date, Some(limit * 2)).await?;
561
562        // Filter by clinical keywords
563        Ok(all_papers
564            .into_iter()
565            .filter(|v| {
566                let title = v.metadata.get("title").map(|s| s.to_lowercase()).unwrap_or_default();
567                let abstract_text = v.metadata.get("abstract").map(|s| s.to_lowercase()).unwrap_or_default();
568                let category = v.metadata.get("category").map(|s| s.to_lowercase()).unwrap_or_default();
569
570                let keywords = ["clinical", "trial", "patient", "treatment", "therapy", "diagnosis"];
571                keywords.iter().any(|kw| {
572                    title.contains(kw) || abstract_text.contains(kw) || category.contains(kw)
573                })
574            })
575            .take(limit)
576            .collect())
577    }
578
579    /// Fetch preprints with pagination support
580    async fn fetch_with_pagination(
581        &self,
582        server: &str,
583        interval: &str,
584        limit: Option<usize>,
585    ) -> Result<Vec<SemanticVector>> {
586        let mut all_vectors = Vec::new();
587        let mut cursor = 0;
588        let limit = limit.unwrap_or(usize::MAX);
589
590        loop {
591            if all_vectors.len() >= limit {
592                break;
593            }
594
595            let url = format!("{}/details/{}/{}/{}", self.base_url, server, interval, cursor);
596
597            // Rate limiting
598            sleep(Duration::from_millis(BIORXIV_RATE_LIMIT_MS)).await;
599
600            let response = self.fetch_with_retry(&url).await?;
601            let api_response: BiorxivApiResponse = response.json().await?;
602
603            if api_response.collection.is_empty() {
604                break;
605            }
606
607            // Convert records to vectors
608            for record in api_response.collection {
609                if all_vectors.len() >= limit {
610                    break;
611                }
612
613                if let Some(vector) = self.record_to_vector(record, server) {
614                    all_vectors.push(vector);
615                }
616            }
617
618            // Update cursor for next page
619            if let Some(new_cursor) = api_response.cursor {
620                if new_cursor as usize <= cursor {
621                    // No more pages
622                    break;
623                }
624                cursor = new_cursor as usize;
625            } else {
626                break;
627            }
628
629            // Safety check: don't paginate indefinitely
630            if cursor > 10000 {
631                tracing::warn!("Pagination cursor exceeded 10000, stopping");
632                break;
633            }
634        }
635
636        Ok(all_vectors)
637    }
638
639    /// Convert preprint record to SemanticVector
640    fn record_to_vector(&self, record: PreprintRecord, server: &str) -> Option<SemanticVector> {
641        // Clean up title and abstract
642        let title = record.title.trim().replace('\n', " ");
643        let abstract_text = record.abstract_text.trim().replace('\n', " ");
644
645        // Parse publication date
646        let timestamp = NaiveDate::parse_from_str(&record.date, "%Y-%m-%d")
647            .ok()
648            .and_then(|d| d.and_hms_opt(0, 0, 0))
649            .map(|dt| dt.and_utc())
650            .unwrap_or_else(Utc::now);
651
652        // Generate embedding from title + abstract
653        let combined_text = format!("{} {}", title, abstract_text);
654        let embedding = self.embedder.embed_text(&combined_text);
655
656        // Determine publication status
657        let published_status = record.published.unwrap_or_else(|| "preprint".to_string());
658
659        // Build metadata
660        let mut metadata = HashMap::new();
661        metadata.insert("doi".to_string(), record.doi.clone());
662        metadata.insert("title".to_string(), title);
663        metadata.insert("abstract".to_string(), abstract_text);
664        metadata.insert("authors".to_string(), record.authors);
665        metadata.insert("category".to_string(), record.category);
666        metadata.insert("server".to_string(), server.to_string());
667        metadata.insert("published_status".to_string(), published_status);
668
669        if let Some(corr) = record.author_corresponding {
670            metadata.insert("corresponding_author".to_string(), corr);
671        }
672        if let Some(inst) = record.author_corresponding_institution {
673            metadata.insert("institution".to_string(), inst);
674        }
675        if let Some(version) = record.version {
676            metadata.insert("version".to_string(), version);
677        }
678        if let Some(ptype) = record.preprint_type {
679            metadata.insert("type".to_string(), ptype);
680        }
681
682        metadata.insert("source".to_string(), "medrxiv".to_string());
683
684        // medRxiv papers are medical domain
685        Some(SemanticVector {
686            id: format!("doi:{}", record.doi),
687            embedding,
688            domain: Domain::Medical,
689            timestamp,
690            metadata,
691        })
692    }
693
694    /// Fetch with retry logic
695    async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
696        let mut retries = 0;
697        loop {
698            match self.client.get(url).send().await {
699                Ok(response) => {
700                    if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
701                        retries += 1;
702                        tracing::warn!("Rate limited, retrying in {}ms", RETRY_DELAY_MS * retries as u64);
703                        sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
704                        continue;
705                    }
706                    if !response.status().is_success() {
707                        return Err(FrameworkError::Network(
708                            reqwest::Error::from(response.error_for_status().unwrap_err()),
709                        ));
710                    }
711                    return Ok(response);
712                }
713                Err(_) if retries < MAX_RETRIES => {
714                    retries += 1;
715                    tracing::warn!("Request failed, retrying ({}/{})", retries, MAX_RETRIES);
716                    sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
717                }
718                Err(e) => return Err(FrameworkError::Network(e)),
719            }
720        }
721    }
722}
723
724impl Default for MedrxivClient {
725    fn default() -> Self {
726        Self::new()
727    }
728}
729
730// ============================================================================
731// Tests
732// ============================================================================
733
734#[cfg(test)]
735mod tests {
736    use super::*;
737
738    #[test]
739    fn test_biorxiv_client_creation() {
740        let client = BiorxivClient::new();
741        assert_eq!(client.base_url, "https://api.biorxiv.org");
742    }
743
744    #[test]
745    fn test_medrxiv_client_creation() {
746        let client = MedrxivClient::new();
747        assert_eq!(client.base_url, "https://api.biorxiv.org");
748    }
749
750    #[test]
751    fn test_custom_embedding_dim() {
752        let client = BiorxivClient::with_embedding_dim(512);
753        let embedding = client.embedder.embed_text("test");
754        assert_eq!(embedding.len(), 512);
755    }
756
757    #[test]
758    fn test_record_to_vector_biorxiv() {
759        let client = BiorxivClient::new();
760
761        let record = PreprintRecord {
762            doi: "10.1101/2024.01.01.123456".to_string(),
763            title: "Deep Learning for Neuroscience".to_string(),
764            authors: "John Doe; Jane Smith".to_string(),
765            author_corresponding: Some("John Doe".to_string()),
766            author_corresponding_institution: Some("MIT".to_string()),
767            date: "2024-01-15".to_string(),
768            category: "Neuroscience".to_string(),
769            abstract_text: "We propose a novel approach for analyzing neural data...".to_string(),
770            published: None,
771            server: Some("biorxiv".to_string()),
772            version: Some("1".to_string()),
773            preprint_type: Some("new results".to_string()),
774        };
775
776        let vector = client.record_to_vector(record, "biorxiv");
777        assert!(vector.is_some());
778
779        let v = vector.unwrap();
780        assert_eq!(v.id, "doi:10.1101/2024.01.01.123456");
781        assert_eq!(v.domain, Domain::Research);
782        assert_eq!(v.metadata.get("doi").unwrap(), "10.1101/2024.01.01.123456");
783        assert_eq!(v.metadata.get("title").unwrap(), "Deep Learning for Neuroscience");
784        assert_eq!(v.metadata.get("authors").unwrap(), "John Doe; Jane Smith");
785        assert_eq!(v.metadata.get("category").unwrap(), "Neuroscience");
786        assert_eq!(v.metadata.get("server").unwrap(), "biorxiv");
787        assert_eq!(v.metadata.get("published_status").unwrap(), "preprint");
788    }
789
790    #[test]
791    fn test_record_to_vector_medrxiv() {
792        let client = MedrxivClient::new();
793
794        let record = PreprintRecord {
795            doi: "10.1101/2024.01.01.654321".to_string(),
796            title: "COVID-19 Vaccine Efficacy Study".to_string(),
797            authors: "Alice Johnson; Bob Williams".to_string(),
798            author_corresponding: Some("Alice Johnson".to_string()),
799            author_corresponding_institution: Some("Harvard Medical School".to_string()),
800            date: "2024-03-20".to_string(),
801            category: "Infectious Diseases".to_string(),
802            abstract_text: "This study evaluates the efficacy of mRNA vaccines...".to_string(),
803            published: Some("Nature Medicine".to_string()),
804            server: Some("medrxiv".to_string()),
805            version: Some("2".to_string()),
806            preprint_type: Some("new results".to_string()),
807        };
808
809        let vector = client.record_to_vector(record, "medrxiv");
810        assert!(vector.is_some());
811
812        let v = vector.unwrap();
813        assert_eq!(v.id, "doi:10.1101/2024.01.01.654321");
814        assert_eq!(v.domain, Domain::Medical);
815        assert_eq!(v.metadata.get("doi").unwrap(), "10.1101/2024.01.01.654321");
816        assert_eq!(v.metadata.get("title").unwrap(), "COVID-19 Vaccine Efficacy Study");
817        assert_eq!(v.metadata.get("category").unwrap(), "Infectious Diseases");
818        assert_eq!(v.metadata.get("server").unwrap(), "medrxiv");
819        assert_eq!(v.metadata.get("published_status").unwrap(), "Nature Medicine");
820    }
821
822    #[test]
823    fn test_date_parsing() {
824        let client = BiorxivClient::new();
825
826        let record = PreprintRecord {
827            doi: "10.1101/test".to_string(),
828            title: "Test".to_string(),
829            authors: "Author".to_string(),
830            author_corresponding: None,
831            author_corresponding_institution: None,
832            date: "2024-01-15".to_string(),
833            category: "Test".to_string(),
834            abstract_text: "Abstract".to_string(),
835            published: None,
836            server: None,
837            version: None,
838            preprint_type: None,
839        };
840
841        let vector = client.record_to_vector(record, "biorxiv").unwrap();
842
843        // Check that date was parsed correctly
844        let expected_date = NaiveDate::from_ymd_opt(2024, 1, 15)
845            .unwrap()
846            .and_hms_opt(0, 0, 0)
847            .unwrap()
848            .and_utc();
849
850        assert_eq!(vector.timestamp, expected_date);
851    }
852
853    #[tokio::test]
854    #[ignore] // Ignore by default to avoid hitting bioRxiv API in tests
855    async fn test_search_recent_integration() {
856        let client = BiorxivClient::new();
857        let results = client.search_recent(7, 5).await;
858        assert!(results.is_ok());
859
860        let vectors = results.unwrap();
861        assert!(vectors.len() <= 5);
862
863        if !vectors.is_empty() {
864            let first = &vectors[0];
865            assert!(first.id.starts_with("doi:"));
866            assert_eq!(first.domain, Domain::Research);
867            assert!(first.metadata.contains_key("title"));
868            assert!(first.metadata.contains_key("abstract"));
869        }
870    }
871
872    #[tokio::test]
873    #[ignore] // Ignore by default to avoid hitting medRxiv API in tests
874    async fn test_medrxiv_search_recent_integration() {
875        let client = MedrxivClient::new();
876        let results = client.search_recent(7, 5).await;
877        assert!(results.is_ok());
878
879        let vectors = results.unwrap();
880        assert!(vectors.len() <= 5);
881
882        if !vectors.is_empty() {
883            let first = &vectors[0];
884            assert!(first.id.starts_with("doi:"));
885            assert_eq!(first.domain, Domain::Medical);
886            assert!(first.metadata.contains_key("title"));
887            assert!(first.metadata.contains_key("server"));
888        }
889    }
890
891    #[tokio::test]
892    #[ignore] // Ignore by default to avoid hitting API
893    async fn test_search_covid_integration() {
894        let client = MedrxivClient::new();
895        let results = client.search_covid(10).await;
896        assert!(results.is_ok());
897
898        let vectors = results.unwrap();
899
900        // Verify that results contain COVID-related keywords
901        for v in &vectors {
902            let title = v.metadata.get("title").unwrap().to_lowercase();
903            let abstract_text = v.metadata.get("abstract").unwrap().to_lowercase();
904
905            let has_covid_keyword = title.contains("covid")
906                || title.contains("sars-cov-2")
907                || abstract_text.contains("covid")
908                || abstract_text.contains("sars-cov-2");
909
910            assert!(has_covid_keyword, "Expected COVID-related keywords in results");
911        }
912    }
913
914    #[tokio::test]
915    #[ignore] // Ignore by default to avoid hitting API
916    async fn test_search_by_category_integration() {
917        let client = BiorxivClient::new();
918        let results = client.search_by_category("neuroscience", 5).await;
919        assert!(results.is_ok());
920
921        let vectors = results.unwrap();
922        assert!(vectors.len() <= 5);
923
924        // Verify category filtering
925        for v in &vectors {
926            let category = v.metadata.get("category").unwrap().to_lowercase();
927            assert!(category.contains("neuroscience"));
928        }
929    }
930}