ruvector_data_framework/
genomics_clients.rs

1//! Genomics and DNA data API integrations for NCBI, UniProt, Ensembl, and GWAS Catalog
2//!
3//! This module provides async clients for fetching genomics data including genes, proteins,
4//! variants, and genome-wide association studies, converting responses to SemanticVector
5//! format for RuVector discovery.
6
7use std::collections::HashMap;
8use std::sync::Arc;
9use std::time::Duration;
10
11use chrono::{NaiveDate, Utc};
12use reqwest::{Client, StatusCode};
13use serde::Deserialize;
14use tokio::time::sleep;
15
16use crate::api_clients::SimpleEmbedder;
17use crate::ruvector_native::{Domain, SemanticVector};
18use crate::{FrameworkError, Result};
19
// Rate limiting and retry configuration shared by the clients in this module.
/// Delay (ms) applied before each NCBI request when no API key is configured.
const NCBI_RATE_LIMIT_MS: u64 = 334; // ~3 requests/second without API key
/// Delay (ms) applied before each NCBI request when an API key is configured.
const NCBI_WITH_KEY_RATE_LIMIT_MS: u64 = 100; // 10 requests/second with key
/// Delay (ms) applied before each UniProt request.
const UNIPROT_RATE_LIMIT_MS: u64 = 100; // Conservative rate limit
/// Delay (ms) applied before each Ensembl request.
const ENSEMBL_RATE_LIMIT_MS: u64 = 67; // 15 requests/second
/// Delay (ms) for GWAS Catalog requests (presumably consumed by the GWAS client; confirm at its definition).
const GWAS_RATE_LIMIT_MS: u64 = 100; // Conservative rate limit
/// Maximum number of retries each client's `fetch_with_retry` performs before giving up.
const MAX_RETRIES: u32 = 3;
/// Base backoff delay (ms); the n-th retry waits `RETRY_DELAY_MS * 2^n`.
const RETRY_DELAY_MS: u64 = 1000;
28
29// ============================================================================
30// NCBI Entrez Client (Genes, Proteins, Nucleotides, SNPs)
31// ============================================================================
32
/// NCBI ESearch response envelope.
///
/// Only the `esearchresult` object is modelled; other top-level keys are ignored.
#[derive(Debug, Deserialize)]
struct NcbiSearchResponse {
    /// Search payload holding the matching IDs.
    esearchresult: NcbiSearchResult,
}
38
/// Body of an ESearch response; both fields fall back to their defaults when absent.
#[derive(Debug, Deserialize)]
struct NcbiSearchResult {
    /// IDs matching the query; this is what `search_database` returns.
    #[serde(default)]
    idlist: Vec<String>,
    /// Hit count as sent by the server, kept as a string; not read by the code in this view.
    #[serde(default)]
    count: String,
}
46
47/// NCBI Gene summary response
48#[derive(Debug, Deserialize)]
49struct NcbiGeneSummaryResponse {
50    result: HashMap<String, NcbiGeneSummary>,
51}
52
/// One gene record from an ESummary response; every field defaults when missing.
#[derive(Debug, Deserialize)]
struct NcbiGeneSummary {
    /// Gene UID; used to build the `GENE:<uid>` vector id.
    #[serde(default)]
    uid: String,
    /// Gene name; stored under the `symbol` metadata key.
    #[serde(default)]
    name: String,
    /// Short description; used as a fallback when `summary` is empty.
    #[serde(default)]
    description: String,
    /// Longer summary text; preferred over `description` when non-empty.
    #[serde(default)]
    summary: String,
    /// Organism names for the gene.
    #[serde(default)]
    organism: NcbiOrganism,
    /// Chromosome value as sent by the server.
    #[serde(default)]
    chromosome: String,
    /// Map location; stored under the `location` metadata key.
    #[serde(default)]
    maplocation: String,
}
70
/// Organism names nested inside a gene summary; `Default` yields two empty strings.
#[derive(Debug, Deserialize, Default)]
struct NcbiOrganism {
    /// Scientific name; included in the embedded text and the `organism` metadata key.
    #[serde(default)]
    scientificname: String,
    /// Common name; stored under the `common_name` metadata key.
    #[serde(default)]
    commonname: String,
}
78
79/// NCBI SNP docsum response
80#[derive(Debug, Deserialize)]
81struct NcbiSnpResponse {
82    result: HashMap<String, NcbiSnpSummary>,
83}
84
85#[derive(Debug, Deserialize)]
86struct NcbiSnpSummary {
87    #[serde(default)]
88    uid: String,
89    #[serde(default)]
90    snp_id: String,
91    #[serde(default)]
92    genes: Vec<NcbiGene>,
93    #[serde(default)]
94    chr: String,
95    #[serde(default)]
96    chrpos: String,
97    #[serde(default)]
98    fxn_class: String,
99}
100
/// Gene reference nested inside an SNP summary.
#[derive(Debug, Deserialize)]
struct NcbiGene {
    /// Gene name; joined into the SNP's `genes` metadata value.
    #[serde(default)]
    name: String,
}
106
/// Client for NCBI Entrez APIs (genes, proteins, nucleotides, SNPs)
pub struct NcbiClient {
    /// HTTP client used for every request.
    client: Client,
    /// Base URL of the E-utilities endpoints (no trailing slash).
    base_url: String,
    /// Optional API key appended to request URLs as `&api_key=...`.
    api_key: Option<String>,
    /// Pause applied before each request; chosen in `new` based on whether a key is set.
    rate_limit_delay: Duration,
    /// Embedder that turns record text into vectors.
    embedder: Arc<SimpleEmbedder>,
}
115
116impl NcbiClient {
117    /// Create a new NCBI Entrez client
118    ///
119    /// # Arguments
120    /// * `api_key` - Optional NCBI API key (get from https://www.ncbi.nlm.nih.gov/account/)
121    ///   Without a key: 3 requests/second
122    ///   With a key: 10 requests/second
123    pub fn new(api_key: Option<String>) -> Result<Self> {
124        let client = Client::builder()
125            .timeout(Duration::from_secs(30))
126            .user_agent("RuVector/1.0 (genomics discovery)")
127            .build()
128            .map_err(FrameworkError::Network)?;
129
130        let rate_limit_delay = if api_key.is_some() {
131            Duration::from_millis(NCBI_WITH_KEY_RATE_LIMIT_MS)
132        } else {
133            Duration::from_millis(NCBI_RATE_LIMIT_MS)
134        };
135
136        Ok(Self {
137            client,
138            base_url: "https://eutils.ncbi.nlm.nih.gov/entrez/eutils".to_string(),
139            api_key,
140            rate_limit_delay,
141            embedder: Arc::new(SimpleEmbedder::new(384)),
142        })
143    }
144
145    /// Search gene database
146    ///
147    /// # Arguments
148    /// * `query` - Search query (e.g., "BRCA1", "alzheimer's disease")
149    /// * `organism` - Optional organism filter (e.g., "human", "mouse")
150    pub async fn search_genes(
151        &self,
152        query: &str,
153        organism: Option<&str>,
154    ) -> Result<Vec<SemanticVector>> {
155        let mut search_query = query.to_string();
156        if let Some(org) = organism {
157            search_query.push_str(&format!(" AND {}[Organism]", org));
158        }
159
160        let gene_ids = self.search_database("gene", &search_query, 100).await?;
161        if gene_ids.is_empty() {
162            return Ok(Vec::new());
163        }
164
165        self.fetch_gene_summaries(&gene_ids).await
166    }
167
168    /// Get gene details by gene ID
169    pub async fn get_gene(&self, gene_id: &str) -> Result<Option<SemanticVector>> {
170        let vectors = self.fetch_gene_summaries(&[gene_id.to_string()]).await?;
171        Ok(vectors.into_iter().next())
172    }
173
174    /// Search protein database
175    pub async fn search_proteins(&self, query: &str) -> Result<Vec<SemanticVector>> {
176        let protein_ids = self.search_database("protein", query, 100).await?;
177        if protein_ids.is_empty() {
178            return Ok(Vec::new());
179        }
180
181        self.fetch_protein_summaries(&protein_ids).await
182    }
183
184    /// Search nucleotide sequences
185    pub async fn search_nucleotide(&self, query: &str) -> Result<Vec<SemanticVector>> {
186        let seq_ids = self.search_database("nucleotide", query, 100).await?;
187        if seq_ids.is_empty() {
188            return Ok(Vec::new());
189        }
190
191        self.fetch_nucleotide_summaries(&seq_ids).await
192    }
193
194    /// Get SNP/variant information by rsID
195    ///
196    /// # Arguments
197    /// * `rsid` - SNP reference ID (e.g., "rs429358" for APOE4)
198    pub async fn get_snp(&self, rsid: &str) -> Result<Option<SemanticVector>> {
199        let clean_rsid = rsid.trim_start_matches("rs");
200        let snp_ids = self.search_database("snp", clean_rsid, 1).await?;
201
202        if snp_ids.is_empty() {
203            return Ok(None);
204        }
205
206        let vectors = self.fetch_snp_summaries(&snp_ids).await?;
207        Ok(vectors.into_iter().next())
208    }
209
210    /// Search any NCBI database
211    async fn search_database(
212        &self,
213        db: &str,
214        query: &str,
215        max_results: usize,
216    ) -> Result<Vec<String>> {
217        let mut url = format!(
218            "{}/esearch.fcgi?db={}&term={}&retmode=json&retmax={}",
219            self.base_url,
220            db,
221            urlencoding::encode(query),
222            max_results
223        );
224
225        if let Some(key) = &self.api_key {
226            url.push_str(&format!("&api_key={}", key));
227        }
228
229        sleep(self.rate_limit_delay).await;
230        let response = self.fetch_with_retry(&url).await?;
231        let search_response: NcbiSearchResponse = response.json().await?;
232
233        Ok(search_response.esearchresult.idlist)
234    }
235
236    /// Fetch gene summaries
237    async fn fetch_gene_summaries(&self, gene_ids: &[String]) -> Result<Vec<SemanticVector>> {
238        if gene_ids.is_empty() {
239            return Ok(Vec::new());
240        }
241
242        let mut all_vectors = Vec::new();
243
244        for chunk in gene_ids.chunks(200) {
245            let id_list = chunk.join(",");
246            let mut url = format!(
247                "{}/esummary.fcgi?db=gene&id={}&retmode=json",
248                self.base_url, id_list
249            );
250
251            if let Some(key) = &self.api_key {
252                url.push_str(&format!("&api_key={}", key));
253            }
254
255            sleep(self.rate_limit_delay).await;
256            let response = self.fetch_with_retry(&url).await?;
257            let summary_response: NcbiGeneSummaryResponse = response.json().await?;
258
259            for (id, summary) in summary_response.result {
260                if id == "uids" {
261                    continue; // Skip metadata entry
262                }
263
264                let description = if !summary.summary.is_empty() {
265                    summary.summary.clone()
266                } else {
267                    summary.description.clone()
268                };
269
270                let text = format!(
271                    "{} {} {}",
272                    summary.name, description, summary.organism.scientificname
273                );
274                let embedding = self.embedder.embed_text(&text);
275
276                let mut metadata = HashMap::new();
277                metadata.insert("gene_id".to_string(), summary.uid.clone());
278                metadata.insert("symbol".to_string(), summary.name.clone());
279                metadata.insert("description".to_string(), description);
280                metadata.insert("organism".to_string(), summary.organism.scientificname);
281                metadata.insert("common_name".to_string(), summary.organism.commonname);
282                metadata.insert("chromosome".to_string(), summary.chromosome);
283                metadata.insert("location".to_string(), summary.maplocation);
284                metadata.insert("source".to_string(), "ncbi_gene".to_string());
285
286                all_vectors.push(SemanticVector {
287                    id: format!("GENE:{}", summary.uid),
288                    embedding,
289                    domain: Domain::Genomics,
290                    timestamp: Utc::now(),
291                    metadata,
292                });
293            }
294        }
295
296        Ok(all_vectors)
297    }
298
299    /// Fetch protein summaries (simplified)
300    async fn fetch_protein_summaries(&self, protein_ids: &[String]) -> Result<Vec<SemanticVector>> {
301        // For proteins, we use a simplified approach with just IDs
302        // In production, you'd parse full protein records
303        let mut vectors = Vec::new();
304
305        for id in protein_ids {
306            let text = format!("Protein {}", id);
307            let embedding = self.embedder.embed_text(&text);
308
309            let mut metadata = HashMap::new();
310            metadata.insert("protein_id".to_string(), id.clone());
311            metadata.insert("source".to_string(), "ncbi_protein".to_string());
312
313            vectors.push(SemanticVector {
314                id: format!("PROTEIN:{}", id),
315                embedding,
316                domain: Domain::Genomics,
317                timestamp: Utc::now(),
318                metadata,
319            });
320        }
321
322        Ok(vectors)
323    }
324
325    /// Fetch nucleotide summaries (simplified)
326    async fn fetch_nucleotide_summaries(&self, seq_ids: &[String]) -> Result<Vec<SemanticVector>> {
327        let mut vectors = Vec::new();
328
329        for id in seq_ids {
330            let text = format!("Nucleotide sequence {}", id);
331            let embedding = self.embedder.embed_text(&text);
332
333            let mut metadata = HashMap::new();
334            metadata.insert("sequence_id".to_string(), id.clone());
335            metadata.insert("source".to_string(), "ncbi_nucleotide".to_string());
336
337            vectors.push(SemanticVector {
338                id: format!("NUCLEOTIDE:{}", id),
339                embedding,
340                domain: Domain::Genomics,
341                timestamp: Utc::now(),
342                metadata,
343            });
344        }
345
346        Ok(vectors)
347    }
348
349    /// Fetch SNP summaries
350    async fn fetch_snp_summaries(&self, snp_ids: &[String]) -> Result<Vec<SemanticVector>> {
351        if snp_ids.is_empty() {
352            return Ok(Vec::new());
353        }
354
355        let mut all_vectors = Vec::new();
356
357        for chunk in snp_ids.chunks(200) {
358            let id_list = chunk.join(",");
359            let mut url = format!(
360                "{}/esummary.fcgi?db=snp&id={}&retmode=json",
361                self.base_url, id_list
362            );
363
364            if let Some(key) = &self.api_key {
365                url.push_str(&format!("&api_key={}", key));
366            }
367
368            sleep(self.rate_limit_delay).await;
369            let response = self.fetch_with_retry(&url).await?;
370            let snp_response: NcbiSnpResponse = response.json().await?;
371
372            for (id, summary) in snp_response.result {
373                if id == "uids" {
374                    continue;
375                }
376
377                let gene_names: Vec<String> = summary.genes.iter()
378                    .map(|g| g.name.clone())
379                    .collect();
380
381                let text = format!(
382                    "SNP rs{} chromosome {} position {} function {} genes {}",
383                    summary.snp_id,
384                    summary.chr,
385                    summary.chrpos,
386                    summary.fxn_class,
387                    gene_names.join(",")
388                );
389                let embedding = self.embedder.embed_text(&text);
390
391                let mut metadata = HashMap::new();
392                metadata.insert("rsid".to_string(), format!("rs{}", summary.snp_id));
393                metadata.insert("chromosome".to_string(), summary.chr);
394                metadata.insert("position".to_string(), summary.chrpos);
395                metadata.insert("function".to_string(), summary.fxn_class);
396                metadata.insert("genes".to_string(), gene_names.join(", "));
397                metadata.insert("source".to_string(), "ncbi_snp".to_string());
398
399                all_vectors.push(SemanticVector {
400                    id: format!("SNP:rs{}", summary.snp_id),
401                    embedding,
402                    domain: Domain::Genomics,
403                    timestamp: Utc::now(),
404                    metadata,
405                });
406            }
407        }
408
409        Ok(all_vectors)
410    }
411
412    /// Fetch with retry logic
413    async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
414        let mut retries = 0;
415        loop {
416            match self.client.get(url).send().await {
417                Ok(response) => {
418                    if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
419                        retries += 1;
420                        sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
421                        continue;
422                    }
423                    return Ok(response);
424                }
425                Err(_) if retries < MAX_RETRIES => {
426                    retries += 1;
427                    sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
428                }
429                Err(e) => return Err(FrameworkError::Network(e)),
430            }
431        }
432    }
433}
434
435// ============================================================================
436// UniProt Client (Protein Database)
437// ============================================================================
438
/// UniProt search response envelope.
#[derive(Debug, Deserialize)]
struct UniProtSearchResponse {
    /// Matching entries; required (no serde default), so a payload without it fails to parse.
    results: Vec<UniProtEntry>,
}
444
/// A single UniProt entry; only the fields consumed by `entry_to_vector` are modelled.
#[derive(Debug, Deserialize)]
struct UniProtEntry {
    /// Primary accession (required); used to build the `UNIPROT:<accession>` vector id.
    #[serde(rename = "primaryAccession")]
    primary_accession: String,
    /// Source organism, if present.
    #[serde(default)]
    organism: Option<UniProtOrganism>,
    /// Protein naming block, if present.
    #[serde(rename = "proteinDescription", default)]
    protein_description: Option<UniProtDescription>,
    /// Genes linked to the entry.
    #[serde(default)]
    genes: Vec<UniProtGene>,
    /// Comment blocks; those whose type is "FUNCTION" supply the function text.
    #[serde(default)]
    comments: Vec<UniProtComment>,
}
458
/// Organism block of a UniProt entry.
#[derive(Debug, Deserialize)]
struct UniProtOrganism {
    /// Scientific name; empty when the key is absent.
    #[serde(rename = "scientificName", default)]
    scientific_name: String,
}
464
/// `proteinDescription` block of a UniProt entry.
#[derive(Debug, Deserialize)]
struct UniProtDescription {
    /// Recommended name, if present.
    #[serde(rename = "recommendedName", default)]
    recommended_name: Option<UniProtName>,
}
470
/// A protein name record inside the description block.
#[derive(Debug, Deserialize)]
struct UniProtName {
    /// Full name wrapper, if present.
    #[serde(rename = "fullName", default)]
    full_name: Option<UniProtValue>,
}
476
/// Wrapper object carrying a single `value` string; reused for names, gene names and comment texts.
#[derive(Debug, Deserialize)]
struct UniProtValue {
    /// The wrapped text; empty when the key is absent.
    #[serde(default)]
    value: String,
}
482
/// Gene record attached to a UniProt entry.
#[derive(Debug, Deserialize)]
struct UniProtGene {
    /// Gene name, if present; entries without one are skipped when names are collected.
    #[serde(rename = "geneName", default)]
    gene_name: Option<UniProtValue>,
}
488
/// Comment block of a UniProt entry.
#[derive(Debug, Deserialize)]
struct UniProtComment {
    /// Comment category; `entry_to_vector` keeps only the value "FUNCTION".
    #[serde(rename = "commentType", default)]
    comment_type: String,
    /// Text fragments belonging to this comment.
    #[serde(default)]
    texts: Vec<UniProtValue>,
}
496
/// Client for UniProt protein database
pub struct UniProtClient {
    /// HTTP client used for every request.
    client: Client,
    /// Base URL of the UniProtKB REST endpoints (no trailing slash).
    base_url: String,
    /// Pause applied before each request.
    rate_limit_delay: Duration,
    /// Embedder that turns entry text into vectors.
    embedder: Arc<SimpleEmbedder>,
}
504
505impl UniProtClient {
506    /// Create a new UniProt client
507    pub fn new() -> Result<Self> {
508        let client = Client::builder()
509            .timeout(Duration::from_secs(30))
510            .user_agent("RuVector/1.0 (genomics discovery)")
511            .build()
512            .map_err(FrameworkError::Network)?;
513
514        Ok(Self {
515            client,
516            base_url: "https://rest.uniprot.org/uniprotkb".to_string(),
517            rate_limit_delay: Duration::from_millis(UNIPROT_RATE_LIMIT_MS),
518            embedder: Arc::new(SimpleEmbedder::new(384)),
519        })
520    }
521
522    /// Search proteins
523    ///
524    /// # Arguments
525    /// * `query` - Search query (e.g., "kinase", "p53")
526    /// * `limit` - Maximum results (default 100)
527    pub async fn search_proteins(&self, query: &str, limit: usize) -> Result<Vec<SemanticVector>> {
528        let url = format!(
529            "{}/search?query={}&format=json&size={}",
530            self.base_url,
531            urlencoding::encode(query),
532            limit.min(500)
533        );
534
535        sleep(self.rate_limit_delay).await;
536        let response = self.fetch_with_retry(&url).await?;
537        let search_response: UniProtSearchResponse = response.json().await?;
538
539        let mut vectors = Vec::new();
540        for entry in search_response.results {
541            vectors.push(self.entry_to_vector(entry)?);
542        }
543
544        Ok(vectors)
545    }
546
547    /// Get protein by accession ID
548    pub async fn get_protein(&self, accession: &str) -> Result<Option<SemanticVector>> {
549        let url = format!("{}/{}.json", self.base_url, accession);
550
551        sleep(self.rate_limit_delay).await;
552        let response = self.fetch_with_retry(&url).await?;
553
554        if response.status() == StatusCode::NOT_FOUND {
555            return Ok(None);
556        }
557
558        let entry: UniProtEntry = response.json().await?;
559        Ok(Some(self.entry_to_vector(entry)?))
560    }
561
562    /// Search proteins by organism
563    pub async fn search_by_organism(&self, organism: &str) -> Result<Vec<SemanticVector>> {
564        let query = format!("organism:{}", organism);
565        self.search_proteins(&query, 100).await
566    }
567
568    /// Search proteins by GO term/function
569    pub async fn search_by_function(&self, function: &str) -> Result<Vec<SemanticVector>> {
570        let query = format!("cc:{}", function); // cc = cellular component, also use mf/bp
571        self.search_proteins(&query, 100).await
572    }
573
574    /// Convert UniProt entry to SemanticVector
575    fn entry_to_vector(&self, entry: UniProtEntry) -> Result<SemanticVector> {
576        let protein_name = entry
577            .protein_description
578            .as_ref()
579            .and_then(|pd| pd.recommended_name.as_ref())
580            .and_then(|rn| rn.full_name.as_ref())
581            .map(|fn_| fn_.value.clone())
582            .unwrap_or_else(|| "Unnamed protein".to_string());
583
584        let organism = entry
585            .organism
586            .as_ref()
587            .map(|o| o.scientific_name.clone())
588            .unwrap_or_default();
589
590        let gene_names: Vec<String> = entry
591            .genes
592            .iter()
593            .filter_map(|g| g.gene_name.as_ref().map(|gn| gn.value.clone()))
594            .collect();
595
596        // Extract function comments
597        let function_text = entry
598            .comments
599            .iter()
600            .filter(|c| c.comment_type == "FUNCTION")
601            .flat_map(|c| c.texts.iter().map(|t| t.value.clone()))
602            .collect::<Vec<_>>()
603            .join(" ");
604
605        let text = format!(
606            "{} {} {} {}",
607            protein_name, organism, gene_names.join(","), function_text
608        );
609        let embedding = self.embedder.embed_text(&text);
610
611        let mut metadata = HashMap::new();
612        metadata.insert("accession".to_string(), entry.primary_accession.clone());
613        metadata.insert("protein_name".to_string(), protein_name);
614        metadata.insert("organism".to_string(), organism);
615        metadata.insert("genes".to_string(), gene_names.join(", "));
616        metadata.insert("function".to_string(), function_text);
617        metadata.insert("source".to_string(), "uniprot".to_string());
618
619        Ok(SemanticVector {
620            id: format!("UNIPROT:{}", entry.primary_accession),
621            embedding,
622            domain: Domain::Genomics,
623            timestamp: Utc::now(),
624            metadata,
625        })
626    }
627
628    /// Fetch with retry logic
629    async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
630        let mut retries = 0;
631        loop {
632            match self.client.get(url).send().await {
633                Ok(response) => {
634                    if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
635                        retries += 1;
636                        sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
637                        continue;
638                    }
639                    return Ok(response);
640                }
641                Err(_) if retries < MAX_RETRIES => {
642                    retries += 1;
643                    sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
644                }
645                Err(e) => return Err(FrameworkError::Network(e)),
646            }
647        }
648    }
649}
650
/// `Default` delegates to [`UniProtClient::new`].
///
/// # Panics
/// Panics with "Failed to create UniProt client" if `new` returns an error.
impl Default for UniProtClient {
    fn default() -> Self {
        Self::new().expect("Failed to create UniProt client")
    }
}
656
657// ============================================================================
658// Ensembl REST Client (Gene Info, Variants, Homologs)
659// ============================================================================
660
/// Ensembl gene lookup response; all fields except `id` default when absent.
#[derive(Debug, Deserialize)]
struct EnsemblGene {
    /// Ensembl identifier (required); used to build the `ENSEMBL:<id>` vector id.
    id: String,
    /// Display name; stored under the `symbol` metadata key.
    #[serde(default)]
    display_name: String,
    /// Free-text description.
    #[serde(default)]
    description: String,
    /// Species name as sent by the server.
    #[serde(default)]
    species: String,
    /// Biotype label as sent by the server.
    #[serde(default)]
    biotype: String,
    /// Sequence region name; stored under the `chromosome` metadata key.
    #[serde(default)]
    seq_region_name: String,
    /// Start coordinate.
    #[serde(default)]
    start: i64,
    /// End coordinate.
    #[serde(default)]
    end: i64,
}
680
/// One element of the Ensembl overlap (variation) response; every field defaults when absent.
#[derive(Debug, Deserialize)]
struct EnsemblVariant {
    /// Variant identifier; used to build the `VARIANT:<id>` vector id.
    #[serde(default)]
    id: String,
    /// Sequence region name; stored under the `chromosome` metadata key.
    #[serde(default)]
    seq_region_name: String,
    /// Start coordinate; stored under the `position` metadata key.
    #[serde(default)]
    start: i64,
    /// Consequence label; stored under the `consequence` metadata key.
    #[serde(default)]
    most_severe_consequence: String,
}
693
/// Ensembl homology response envelope.
#[derive(Debug, Deserialize)]
struct EnsemblHomologyResponse {
    /// Homology groups; empty when the key is absent.
    #[serde(default)]
    data: Vec<EnsemblHomology>,
}
700
/// One element of the homology response's `data` array.
#[derive(Debug, Deserialize)]
struct EnsemblHomology {
    /// Homology entries in this group; empty when the key is absent.
    #[serde(default)]
    homologies: Vec<EnsemblHomologyEntry>,
}
706
/// A single homology relationship.
#[derive(Debug, Deserialize)]
struct EnsemblHomologyEntry {
    /// The homologous target; defaults to an all-empty target when absent.
    #[serde(default)]
    target: EnsemblTarget,
    /// Value of the JSON `type` key (renamed because `type` is a Rust keyword).
    #[serde(rename = "type", default)]
    homology_type: String,
}
714
/// Target side of a homology relationship; `Default` yields three empty strings.
#[derive(Debug, Deserialize, Default)]
struct EnsemblTarget {
    /// Target identifier; stored under the `homolog_id` metadata key.
    #[serde(default)]
    id: String,
    /// Target species as sent by the server.
    #[serde(default)]
    species: String,
    /// Target protein identifier as sent by the server.
    #[serde(default)]
    protein_id: String,
}
724
/// Client for Ensembl REST API
pub struct EnsemblClient {
    /// HTTP client used for every request.
    client: Client,
    /// Base URL of the Ensembl REST service (no trailing slash).
    base_url: String,
    /// Pause applied before each request.
    rate_limit_delay: Duration,
    /// Embedder that turns record text into vectors.
    embedder: Arc<SimpleEmbedder>,
}
732
733impl EnsemblClient {
734    /// Create a new Ensembl client
735    pub fn new() -> Result<Self> {
736        let client = Client::builder()
737            .timeout(Duration::from_secs(30))
738            .user_agent("RuVector/1.0 (genomics discovery)")
739            .build()
740            .map_err(FrameworkError::Network)?;
741
742        Ok(Self {
743            client,
744            base_url: "https://rest.ensembl.org".to_string(),
745            rate_limit_delay: Duration::from_millis(ENSEMBL_RATE_LIMIT_MS),
746            embedder: Arc::new(SimpleEmbedder::new(384)),
747        })
748    }
749
750    /// Get gene information
751    ///
752    /// # Arguments
753    /// * `gene_id` - Ensembl gene ID (e.g., "ENSG00000157764" for BRAF)
754    pub async fn get_gene_info(&self, gene_id: &str) -> Result<Option<SemanticVector>> {
755        let url = format!("{}/lookup/id/{}?content-type=application/json", self.base_url, gene_id);
756
757        sleep(self.rate_limit_delay).await;
758        let response = self.fetch_with_retry(&url).await?;
759
760        if response.status() == StatusCode::NOT_FOUND {
761            return Ok(None);
762        }
763
764        let gene: EnsemblGene = response.json().await?;
765
766        let text = format!(
767            "{} {} {} {}",
768            gene.display_name, gene.description, gene.species, gene.biotype
769        );
770        let embedding = self.embedder.embed_text(&text);
771
772        let mut metadata = HashMap::new();
773        metadata.insert("ensembl_id".to_string(), gene.id.clone());
774        metadata.insert("symbol".to_string(), gene.display_name);
775        metadata.insert("description".to_string(), gene.description);
776        metadata.insert("species".to_string(), gene.species);
777        metadata.insert("biotype".to_string(), gene.biotype);
778        metadata.insert("chromosome".to_string(), gene.seq_region_name);
779        metadata.insert("start".to_string(), gene.start.to_string());
780        metadata.insert("end".to_string(), gene.end.to_string());
781        metadata.insert("source".to_string(), "ensembl".to_string());
782
783        Ok(Some(SemanticVector {
784            id: format!("ENSEMBL:{}", gene.id),
785            embedding,
786            domain: Domain::Genomics,
787            timestamp: Utc::now(),
788            metadata,
789        }))
790    }
791
792    /// Get genetic variants for a gene
793    pub async fn get_variants(&self, gene_id: &str) -> Result<Vec<SemanticVector>> {
794        let url = format!(
795            "{}/overlap/id/{}?feature=variation;content-type=application/json",
796            self.base_url, gene_id
797        );
798
799        sleep(self.rate_limit_delay).await;
800        let response = self.fetch_with_retry(&url).await?;
801
802        if response.status() == StatusCode::NOT_FOUND {
803            return Ok(Vec::new());
804        }
805
806        let variants: Vec<EnsemblVariant> = response.json().await?;
807
808        let mut vectors = Vec::new();
809        for variant in variants.into_iter().take(100) {
810            let text = format!(
811                "Variant {} chromosome {} position {} consequence {}",
812                variant.id, variant.seq_region_name, variant.start, variant.most_severe_consequence
813            );
814            let embedding = self.embedder.embed_text(&text);
815
816            let mut metadata = HashMap::new();
817            metadata.insert("variant_id".to_string(), variant.id.clone());
818            metadata.insert("chromosome".to_string(), variant.seq_region_name);
819            metadata.insert("position".to_string(), variant.start.to_string());
820            metadata.insert("consequence".to_string(), variant.most_severe_consequence);
821            metadata.insert("gene_id".to_string(), gene_id.to_string());
822            metadata.insert("source".to_string(), "ensembl_variant".to_string());
823
824            vectors.push(SemanticVector {
825                id: format!("VARIANT:{}", variant.id),
826                embedding,
827                domain: Domain::Genomics,
828                timestamp: Utc::now(),
829                metadata,
830            });
831        }
832
833        Ok(vectors)
834    }
835
836    /// Get homologous genes across species
837    pub async fn get_homologs(&self, gene_id: &str) -> Result<Vec<SemanticVector>> {
838        let url = format!(
839            "{}/homology/id/{}?content-type=application/json;format=condensed",
840            self.base_url, gene_id
841        );
842
843        sleep(self.rate_limit_delay).await;
844        let response = self.fetch_with_retry(&url).await?;
845
846        if response.status() == StatusCode::NOT_FOUND {
847            return Ok(Vec::new());
848        }
849
850        let homology_response: EnsemblHomologyResponse = response.json().await?;
851
852        let mut vectors = Vec::new();
853        for data in homology_response.data {
854            for homology in data.homologies {
855                let text = format!(
856                    "Homolog {} in {} type {}",
857                    homology.target.id, homology.target.species, homology.homology_type
858                );
859                let embedding = self.embedder.embed_text(&text);
860
861                let mut metadata = HashMap::new();
862                metadata.insert("homolog_id".to_string(), homology.target.id.clone());
863                metadata.insert("species".to_string(), homology.target.species);
864                metadata.insert("protein_id".to_string(), homology.target.protein_id);
865                metadata.insert("homology_type".to_string(), homology.homology_type);
866                metadata.insert("source_gene".to_string(), gene_id.to_string());
867                metadata.insert("source".to_string(), "ensembl_homology".to_string());
868
869                vectors.push(SemanticVector {
870                    id: format!("HOMOLOG:{}:{}", gene_id, homology.target.id),
871                    embedding,
872                    domain: Domain::Genomics,
873                    timestamp: Utc::now(),
874                    metadata,
875                });
876            }
877        }
878
879        Ok(vectors)
880    }
881
882    /// Fetch with retry logic
883    async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
884        let mut retries = 0;
885        loop {
886            match self.client.get(url).send().await {
887                Ok(response) => {
888                    if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
889                        retries += 1;
890                        sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
891                        continue;
892                    }
893                    return Ok(response);
894                }
895                Err(_) if retries < MAX_RETRIES => {
896                    retries += 1;
897                    sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
898                }
899                Err(e) => return Err(FrameworkError::Network(e)),
900            }
901        }
902    }
903}
904
905impl Default for EnsemblClient {
906    fn default() -> Self {
907        Self::new().expect("Failed to create Ensembl client")
908    }
909}
910
911// ============================================================================
912// GWAS Catalog Client (EBI)
913// ============================================================================
914
/// GWAS association response
///
/// Top-level envelope deserialized from an association search reply.
#[derive(Debug, Deserialize)]
struct GwasAssociationResponse {
    /// Contents of the JSON `_embedded` key; `None` when the key is absent.
    #[serde(rename = "_embedded")]
    embedded: Option<GwasEmbedded>,
}
921
/// Payload of the `_embedded` key in an association search reply.
#[derive(Debug, Deserialize)]
struct GwasEmbedded {
    /// Association records; an empty list when the key is missing.
    #[serde(default)]
    associations: Vec<GwasAssociation>,
}
927
928#[derive(Debug, Deserialize)]
929struct GwasAssociation {
930    #[serde(default)]
931    riskAllele: String,
932    #[serde(default)]
933    pvalue: f64,
934    #[serde(default, rename = "trait")]
935    trait_name: String,
936    #[serde(default)]
937    chromosomeName: String,
938    #[serde(default)]
939    chromosomePosition: i64,
940    #[serde(default)]
941    loci: Vec<GwasLocus>,
942}
943
944#[derive(Debug, Deserialize)]
945struct GwasLocus {
946    #[serde(default)]
947    authorReportedGene: String,
948}
949
/// GWAS study response
///
/// Top-level envelope for a reply that carries a list of studies.
#[derive(Debug, Deserialize)]
struct GwasStudyResponse {
    /// Contents of the JSON `_embedded` key; `None` when the key is absent.
    #[serde(rename = "_embedded")]
    embedded: Option<GwasStudyEmbedded>,
}
956
/// Payload of the `_embedded` key in a study list reply.
#[derive(Debug, Deserialize)]
struct GwasStudyEmbedded {
    /// Study records; an empty list when the key is missing.
    #[serde(default)]
    studies: Vec<GwasStudy>,
}
962
963#[derive(Debug, Deserialize)]
964struct GwasStudy {
965    #[serde(default)]
966    accessionId: String,
967    #[serde(default)]
968    publicationDate: Option<String>,
969    #[serde(default)]
970    diseaseTrait: String,
971    #[serde(default)]
972    initialSampleSize: String,
973}
974
/// Client for GWAS Catalog (EBI)
///
/// Fetches association and study records over HTTP and converts them into
/// `SemanticVector`s in the `Domain::Genomics` domain.
pub struct GwasClient {
    /// HTTP client used for every request.
    client: Client,
    /// Root URL that endpoint paths are appended to.
    base_url: String,
    /// Pause applied before each request.
    rate_limit_delay: Duration,
    /// Embedder that turns each record's text summary into a vector.
    embedder: Arc<SimpleEmbedder>,
}
982
983impl GwasClient {
984    /// Create a new GWAS Catalog client
985    pub fn new() -> Result<Self> {
986        let client = Client::builder()
987            .timeout(Duration::from_secs(30))
988            .user_agent("RuVector/1.0 (genomics discovery)")
989            .build()
990            .map_err(FrameworkError::Network)?;
991
992        Ok(Self {
993            client,
994            base_url: "https://www.ebi.ac.uk/gwas/rest/api".to_string(),
995            rate_limit_delay: Duration::from_millis(GWAS_RATE_LIMIT_MS),
996            embedder: Arc::new(SimpleEmbedder::new(384)),
997        })
998    }
999
1000    /// Search trait-gene associations
1001    ///
1002    /// # Arguments
1003    /// * `trait_name` - Disease or trait name (e.g., "diabetes", "height")
1004    pub async fn search_associations(&self, trait_name: &str) -> Result<Vec<SemanticVector>> {
1005        let url = format!(
1006            "{}/associations/search/findByEfoTrait?efoTrait={}&size=100",
1007            self.base_url,
1008            urlencoding::encode(trait_name)
1009        );
1010
1011        sleep(self.rate_limit_delay).await;
1012        let response = self.fetch_with_retry(&url).await?;
1013
1014        if response.status() == StatusCode::NOT_FOUND {
1015            return Ok(Vec::new());
1016        }
1017
1018        let assoc_response: GwasAssociationResponse = response.json().await?;
1019
1020        let mut vectors = Vec::new();
1021        if let Some(embedded) = assoc_response.embedded {
1022            for assoc in embedded.associations {
1023                let genes: Vec<String> = assoc.loci.iter()
1024                    .map(|l| l.authorReportedGene.clone())
1025                    .collect();
1026
1027                let text = format!(
1028                    "GWAS association trait {} genes {} chromosome {} position {} p-value {}",
1029                    assoc.trait_name,
1030                    genes.join(","),
1031                    assoc.chromosomeName,
1032                    assoc.chromosomePosition,
1033                    assoc.pvalue
1034                );
1035                let embedding = self.embedder.embed_text(&text);
1036
1037                let mut metadata = HashMap::new();
1038                metadata.insert("trait".to_string(), assoc.trait_name.clone());
1039                metadata.insert("genes".to_string(), genes.join(", "));
1040                metadata.insert("risk_allele".to_string(), assoc.riskAllele.clone());
1041                metadata.insert("pvalue".to_string(), assoc.pvalue.to_string());
1042                metadata.insert("chromosome".to_string(), assoc.chromosomeName.clone());
1043                metadata.insert("position".to_string(), assoc.chromosomePosition.to_string());
1044                metadata.insert("source".to_string(), "gwas_catalog".to_string());
1045
1046                vectors.push(SemanticVector {
1047                    id: format!("GWAS:{}_{}_{}", assoc.chromosomeName, assoc.chromosomePosition, assoc.pvalue),
1048                    embedding,
1049                    domain: Domain::Genomics,
1050                    timestamp: Utc::now(),
1051                    metadata,
1052                });
1053            }
1054        }
1055
1056        Ok(vectors)
1057    }
1058
1059    /// Get study details
1060    pub async fn get_study(&self, study_id: &str) -> Result<Option<SemanticVector>> {
1061        let url = format!("{}/studies/{}", self.base_url, study_id);
1062
1063        sleep(self.rate_limit_delay).await;
1064        let response = self.fetch_with_retry(&url).await?;
1065
1066        if response.status() == StatusCode::NOT_FOUND {
1067            return Ok(None);
1068        }
1069
1070        let study: GwasStudy = response.json().await?;
1071
1072        let text = format!(
1073            "GWAS study {} trait {} sample size {}",
1074            study.accessionId, study.diseaseTrait, study.initialSampleSize
1075        );
1076        let embedding = self.embedder.embed_text(&text);
1077
1078        let timestamp = study
1079            .publicationDate
1080            .as_ref()
1081            .and_then(|d| NaiveDate::parse_from_str(d, "%Y-%m-%d").ok())
1082            .and_then(|d| d.and_hms_opt(0, 0, 0))
1083            .map(|dt| dt.and_utc())
1084            .unwrap_or_else(Utc::now);
1085
1086        let mut metadata = HashMap::new();
1087        metadata.insert("study_id".to_string(), study.accessionId.clone());
1088        metadata.insert("trait".to_string(), study.diseaseTrait);
1089        metadata.insert("sample_size".to_string(), study.initialSampleSize);
1090        metadata.insert("source".to_string(), "gwas_study".to_string());
1091
1092        Ok(Some(SemanticVector {
1093            id: format!("GWAS_STUDY:{}", study.accessionId),
1094            embedding,
1095            domain: Domain::Genomics,
1096            timestamp,
1097            metadata,
1098        }))
1099    }
1100
1101    /// Search associations by gene
1102    pub async fn search_by_gene(&self, gene: &str) -> Result<Vec<SemanticVector>> {
1103        let url = format!(
1104            "{}/associations/search/findByGene?geneName={}&size=100",
1105            self.base_url,
1106            urlencoding::encode(gene)
1107        );
1108
1109        sleep(self.rate_limit_delay).await;
1110        let response = self.fetch_with_retry(&url).await?;
1111
1112        if response.status() == StatusCode::NOT_FOUND {
1113            return Ok(Vec::new());
1114        }
1115
1116        let assoc_response: GwasAssociationResponse = response.json().await?;
1117
1118        let mut vectors = Vec::new();
1119        if let Some(embedded) = assoc_response.embedded {
1120            for assoc in embedded.associations {
1121                let text = format!(
1122                    "Gene {} associated with trait {} p-value {}",
1123                    gene, assoc.trait_name, assoc.pvalue
1124                );
1125                let embedding = self.embedder.embed_text(&text);
1126
1127                let mut metadata = HashMap::new();
1128                metadata.insert("gene".to_string(), gene.to_string());
1129                metadata.insert("trait".to_string(), assoc.trait_name.clone());
1130                metadata.insert("pvalue".to_string(), assoc.pvalue.to_string());
1131                metadata.insert("chromosome".to_string(), assoc.chromosomeName.clone());
1132                metadata.insert("source".to_string(), "gwas_gene_association".to_string());
1133
1134                vectors.push(SemanticVector {
1135                    id: format!("GWAS_GENE:{}:{}", gene, assoc.trait_name),
1136                    embedding,
1137                    domain: Domain::Genomics,
1138                    timestamp: Utc::now(),
1139                    metadata,
1140                });
1141            }
1142        }
1143
1144        Ok(vectors)
1145    }
1146
1147    /// Fetch with retry logic
1148    async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
1149        let mut retries = 0;
1150        loop {
1151            match self.client.get(url).send().await {
1152                Ok(response) => {
1153                    if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
1154                        retries += 1;
1155                        sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
1156                        continue;
1157                    }
1158                    return Ok(response);
1159                }
1160                Err(_) if retries < MAX_RETRIES => {
1161                    retries += 1;
1162                    sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
1163                }
1164                Err(e) => return Err(FrameworkError::Network(e)),
1165            }
1166        }
1167    }
1168}
1169
1170impl Default for GwasClient {
1171    fn default() -> Self {
1172        Self::new().expect("Failed to create GWAS client")
1173    }
1174}
1175
1176// ============================================================================
1177// Tests
1178// ============================================================================
1179
#[cfg(test)]
mod tests {
    use super::*;

    /// An NCBI client can be constructed without an API key.
    #[tokio::test]
    async fn test_ncbi_client_creation() {
        assert!(NcbiClient::new(None).is_ok());
    }

    /// The NCBI request spacing depends on whether an API key was supplied.
    #[tokio::test]
    async fn test_ncbi_rate_limiting() {
        let cases = [
            (None, NCBI_RATE_LIMIT_MS),
            (Some("test_key".to_string()), NCBI_WITH_KEY_RATE_LIMIT_MS),
        ];

        for (api_key, expected_ms) in cases {
            let client = NcbiClient::new(api_key).unwrap();
            assert_eq!(client.rate_limit_delay, Duration::from_millis(expected_ms));
        }
    }

    /// A UniProt client can be constructed.
    #[tokio::test]
    async fn test_uniprot_client_creation() {
        assert!(UniProtClient::new().is_ok());
    }

    /// An Ensembl client can be constructed.
    #[tokio::test]
    async fn test_ensembl_client_creation() {
        assert!(EnsemblClient::new().is_ok());
    }

    /// A GWAS Catalog client can be constructed.
    #[tokio::test]
    async fn test_gwas_client_creation() {
        assert!(GwasClient::new().is_ok());
    }

    /// Compile-time check that the `Domain::Genomics` variant exists.
    #[test]
    fn test_genomics_domain() {
        let _ = Domain::Genomics;
    }
}