1use std::collections::HashMap;
8use std::sync::Arc;
9use std::time::Duration;
10
11use chrono::{NaiveDate, Utc};
12use reqwest::{Client, StatusCode};
13use serde::Deserialize;
14use tokio::time::sleep;
15
16use crate::api_clients::SimpleEmbedder;
17use crate::ruvector_native::{Domain, SemanticVector};
18use crate::{FrameworkError, Result};
19
/// Pause (ms) inserted before every NCBI request when no API key is configured.
const NCBI_RATE_LIMIT_MS: u64 = 334;

/// Pause (ms) inserted before every NCBI request when an API key is configured.
const NCBI_WITH_KEY_RATE_LIMIT_MS: u64 = 100;

/// Pause (ms) inserted before every UniProt request.
const UNIPROT_RATE_LIMIT_MS: u64 = 100;

/// Pause (ms) inserted before every Ensembl request.
const ENSEMBL_RATE_LIMIT_MS: u64 = 67;

/// Pause (ms) inserted before every GWAS Catalog request.
const GWAS_RATE_LIMIT_MS: u64 = 100;

/// Maximum number of retries performed by each client's `fetch_with_retry`.
const MAX_RETRIES: u32 = 3;

/// Base delay (ms) of the exponential back-off used between retries.
const RETRY_DELAY_MS: u64 = 1000;
28
/// Top-level JSON document of an `esearch.fcgi` response, as parsed by
/// `NcbiClient::search_database`.
#[derive(Debug, Deserialize)]
struct NcbiSearchResponse {
    /// The `esearchresult` object that carries the matched IDs.
    esearchresult: NcbiSearchResult,
}
38
/// Contents of the `esearchresult` object of an `esearch.fcgi` response.
#[derive(Debug, Deserialize)]
struct NcbiSearchResult {
    /// IDs of the matching records; empty when the key is absent.
    #[serde(default)]
    idlist: Vec<String>,
    /// Value of the `count` key, kept as a string; empty when the key is absent.
    /// NOTE(review): not read anywhere in the visible part of this file.
    #[serde(default)]
    count: String,
}
46
47#[derive(Debug, Deserialize)]
49struct NcbiGeneSummaryResponse {
50 result: HashMap<String, NcbiGeneSummary>,
51}
52
/// One gene entry of an `esummary.fcgi?db=gene` response.
///
/// Every field falls back to its `Default` value when the key is absent.
#[derive(Debug, Deserialize)]
struct NcbiGeneSummary {
    /// Gene ID; used for the `GENE:{uid}` vector ID and the `gene_id` metadata entry.
    #[serde(default)]
    uid: String,
    /// Stored as the `symbol` metadata entry and included in the embedded text.
    #[serde(default)]
    name: String,
    /// Fallback description, used only when `summary` is empty.
    #[serde(default)]
    description: String,
    /// Preferred description text when non-empty.
    #[serde(default)]
    summary: String,
    /// Organism names attached to the gene record.
    #[serde(default)]
    organism: NcbiOrganism,
    /// Stored as the `chromosome` metadata entry.
    #[serde(default)]
    chromosome: String,
    /// Stored as the `location` metadata entry.
    #[serde(default)]
    maplocation: String,
}
70
/// The `organism` object nested inside an NCBI gene summary.
#[derive(Debug, Deserialize, Default)]
struct NcbiOrganism {
    /// Stored as the `organism` metadata entry and included in the embedded text.
    #[serde(default)]
    scientificname: String,
    /// Stored as the `common_name` metadata entry.
    #[serde(default)]
    commonname: String,
}
78
79#[derive(Debug, Deserialize)]
81struct NcbiSnpResponse {
82 result: HashMap<String, NcbiSnpSummary>,
83}
84
/// One SNP entry of an `esummary.fcgi?db=snp` response.
///
/// Every field falls back to its `Default` value when the key is absent.
#[derive(Debug, Deserialize)]
struct NcbiSnpSummary {
    /// NOTE(review): not read by `fetch_snp_summaries`.
    #[serde(default)]
    uid: String,
    /// SNP identifier; the code prepends `rs` to it, so presumably the bare number.
    #[serde(default)]
    snp_id: String,
    /// Genes listed for the SNP.
    #[serde(default)]
    genes: Vec<NcbiGene>,
    /// Stored as the `chromosome` metadata entry.
    #[serde(default)]
    chr: String,
    /// Stored as the `position` metadata entry.
    #[serde(default)]
    chrpos: String,
    /// Stored as the `function` metadata entry.
    #[serde(default)]
    fxn_class: String,
}
100
/// A gene reference nested inside an NCBI SNP summary.
#[derive(Debug, Deserialize)]
struct NcbiGene {
    /// Gene name; empty when the key is absent.
    #[serde(default)]
    name: String,
}
106
/// Client for the NCBI E-utilities endpoints (`esearch.fcgi` / `esummary.fcgi`).
pub struct NcbiClient {
    /// HTTP client used for all requests.
    client: Client,
    /// Base URL that endpoint paths are appended to.
    base_url: String,
    /// Optional API key, appended to request URLs as `api_key` when present.
    api_key: Option<String>,
    /// Pause inserted before every request.
    rate_limit_delay: Duration,
    /// Turns record text into embedding vectors.
    embedder: Arc<SimpleEmbedder>,
}
115
116impl NcbiClient {
117 pub fn new(api_key: Option<String>) -> Result<Self> {
124 let client = Client::builder()
125 .timeout(Duration::from_secs(30))
126 .user_agent("RuVector/1.0 (genomics discovery)")
127 .build()
128 .map_err(FrameworkError::Network)?;
129
130 let rate_limit_delay = if api_key.is_some() {
131 Duration::from_millis(NCBI_WITH_KEY_RATE_LIMIT_MS)
132 } else {
133 Duration::from_millis(NCBI_RATE_LIMIT_MS)
134 };
135
136 Ok(Self {
137 client,
138 base_url: "https://eutils.ncbi.nlm.nih.gov/entrez/eutils".to_string(),
139 api_key,
140 rate_limit_delay,
141 embedder: Arc::new(SimpleEmbedder::new(384)),
142 })
143 }
144
145 pub async fn search_genes(
151 &self,
152 query: &str,
153 organism: Option<&str>,
154 ) -> Result<Vec<SemanticVector>> {
155 let mut search_query = query.to_string();
156 if let Some(org) = organism {
157 search_query.push_str(&format!(" AND {}[Organism]", org));
158 }
159
160 let gene_ids = self.search_database("gene", &search_query, 100).await?;
161 if gene_ids.is_empty() {
162 return Ok(Vec::new());
163 }
164
165 self.fetch_gene_summaries(&gene_ids).await
166 }
167
168 pub async fn get_gene(&self, gene_id: &str) -> Result<Option<SemanticVector>> {
170 let vectors = self.fetch_gene_summaries(&[gene_id.to_string()]).await?;
171 Ok(vectors.into_iter().next())
172 }
173
174 pub async fn search_proteins(&self, query: &str) -> Result<Vec<SemanticVector>> {
176 let protein_ids = self.search_database("protein", query, 100).await?;
177 if protein_ids.is_empty() {
178 return Ok(Vec::new());
179 }
180
181 self.fetch_protein_summaries(&protein_ids).await
182 }
183
184 pub async fn search_nucleotide(&self, query: &str) -> Result<Vec<SemanticVector>> {
186 let seq_ids = self.search_database("nucleotide", query, 100).await?;
187 if seq_ids.is_empty() {
188 return Ok(Vec::new());
189 }
190
191 self.fetch_nucleotide_summaries(&seq_ids).await
192 }
193
194 pub async fn get_snp(&self, rsid: &str) -> Result<Option<SemanticVector>> {
199 let clean_rsid = rsid.trim_start_matches("rs");
200 let snp_ids = self.search_database("snp", clean_rsid, 1).await?;
201
202 if snp_ids.is_empty() {
203 return Ok(None);
204 }
205
206 let vectors = self.fetch_snp_summaries(&snp_ids).await?;
207 Ok(vectors.into_iter().next())
208 }
209
210 async fn search_database(
212 &self,
213 db: &str,
214 query: &str,
215 max_results: usize,
216 ) -> Result<Vec<String>> {
217 let mut url = format!(
218 "{}/esearch.fcgi?db={}&term={}&retmode=json&retmax={}",
219 self.base_url,
220 db,
221 urlencoding::encode(query),
222 max_results
223 );
224
225 if let Some(key) = &self.api_key {
226 url.push_str(&format!("&api_key={}", key));
227 }
228
229 sleep(self.rate_limit_delay).await;
230 let response = self.fetch_with_retry(&url).await?;
231 let search_response: NcbiSearchResponse = response.json().await?;
232
233 Ok(search_response.esearchresult.idlist)
234 }
235
236 async fn fetch_gene_summaries(&self, gene_ids: &[String]) -> Result<Vec<SemanticVector>> {
238 if gene_ids.is_empty() {
239 return Ok(Vec::new());
240 }
241
242 let mut all_vectors = Vec::new();
243
244 for chunk in gene_ids.chunks(200) {
245 let id_list = chunk.join(",");
246 let mut url = format!(
247 "{}/esummary.fcgi?db=gene&id={}&retmode=json",
248 self.base_url, id_list
249 );
250
251 if let Some(key) = &self.api_key {
252 url.push_str(&format!("&api_key={}", key));
253 }
254
255 sleep(self.rate_limit_delay).await;
256 let response = self.fetch_with_retry(&url).await?;
257 let summary_response: NcbiGeneSummaryResponse = response.json().await?;
258
259 for (id, summary) in summary_response.result {
260 if id == "uids" {
261 continue; }
263
264 let description = if !summary.summary.is_empty() {
265 summary.summary.clone()
266 } else {
267 summary.description.clone()
268 };
269
270 let text = format!(
271 "{} {} {}",
272 summary.name, description, summary.organism.scientificname
273 );
274 let embedding = self.embedder.embed_text(&text);
275
276 let mut metadata = HashMap::new();
277 metadata.insert("gene_id".to_string(), summary.uid.clone());
278 metadata.insert("symbol".to_string(), summary.name.clone());
279 metadata.insert("description".to_string(), description);
280 metadata.insert("organism".to_string(), summary.organism.scientificname);
281 metadata.insert("common_name".to_string(), summary.organism.commonname);
282 metadata.insert("chromosome".to_string(), summary.chromosome);
283 metadata.insert("location".to_string(), summary.maplocation);
284 metadata.insert("source".to_string(), "ncbi_gene".to_string());
285
286 all_vectors.push(SemanticVector {
287 id: format!("GENE:{}", summary.uid),
288 embedding,
289 domain: Domain::Genomics,
290 timestamp: Utc::now(),
291 metadata,
292 });
293 }
294 }
295
296 Ok(all_vectors)
297 }
298
299 async fn fetch_protein_summaries(&self, protein_ids: &[String]) -> Result<Vec<SemanticVector>> {
301 let mut vectors = Vec::new();
304
305 for id in protein_ids {
306 let text = format!("Protein {}", id);
307 let embedding = self.embedder.embed_text(&text);
308
309 let mut metadata = HashMap::new();
310 metadata.insert("protein_id".to_string(), id.clone());
311 metadata.insert("source".to_string(), "ncbi_protein".to_string());
312
313 vectors.push(SemanticVector {
314 id: format!("PROTEIN:{}", id),
315 embedding,
316 domain: Domain::Genomics,
317 timestamp: Utc::now(),
318 metadata,
319 });
320 }
321
322 Ok(vectors)
323 }
324
325 async fn fetch_nucleotide_summaries(&self, seq_ids: &[String]) -> Result<Vec<SemanticVector>> {
327 let mut vectors = Vec::new();
328
329 for id in seq_ids {
330 let text = format!("Nucleotide sequence {}", id);
331 let embedding = self.embedder.embed_text(&text);
332
333 let mut metadata = HashMap::new();
334 metadata.insert("sequence_id".to_string(), id.clone());
335 metadata.insert("source".to_string(), "ncbi_nucleotide".to_string());
336
337 vectors.push(SemanticVector {
338 id: format!("NUCLEOTIDE:{}", id),
339 embedding,
340 domain: Domain::Genomics,
341 timestamp: Utc::now(),
342 metadata,
343 });
344 }
345
346 Ok(vectors)
347 }
348
349 async fn fetch_snp_summaries(&self, snp_ids: &[String]) -> Result<Vec<SemanticVector>> {
351 if snp_ids.is_empty() {
352 return Ok(Vec::new());
353 }
354
355 let mut all_vectors = Vec::new();
356
357 for chunk in snp_ids.chunks(200) {
358 let id_list = chunk.join(",");
359 let mut url = format!(
360 "{}/esummary.fcgi?db=snp&id={}&retmode=json",
361 self.base_url, id_list
362 );
363
364 if let Some(key) = &self.api_key {
365 url.push_str(&format!("&api_key={}", key));
366 }
367
368 sleep(self.rate_limit_delay).await;
369 let response = self.fetch_with_retry(&url).await?;
370 let snp_response: NcbiSnpResponse = response.json().await?;
371
372 for (id, summary) in snp_response.result {
373 if id == "uids" {
374 continue;
375 }
376
377 let gene_names: Vec<String> = summary.genes.iter()
378 .map(|g| g.name.clone())
379 .collect();
380
381 let text = format!(
382 "SNP rs{} chromosome {} position {} function {} genes {}",
383 summary.snp_id,
384 summary.chr,
385 summary.chrpos,
386 summary.fxn_class,
387 gene_names.join(",")
388 );
389 let embedding = self.embedder.embed_text(&text);
390
391 let mut metadata = HashMap::new();
392 metadata.insert("rsid".to_string(), format!("rs{}", summary.snp_id));
393 metadata.insert("chromosome".to_string(), summary.chr);
394 metadata.insert("position".to_string(), summary.chrpos);
395 metadata.insert("function".to_string(), summary.fxn_class);
396 metadata.insert("genes".to_string(), gene_names.join(", "));
397 metadata.insert("source".to_string(), "ncbi_snp".to_string());
398
399 all_vectors.push(SemanticVector {
400 id: format!("SNP:rs{}", summary.snp_id),
401 embedding,
402 domain: Domain::Genomics,
403 timestamp: Utc::now(),
404 metadata,
405 });
406 }
407 }
408
409 Ok(all_vectors)
410 }
411
412 async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
414 let mut retries = 0;
415 loop {
416 match self.client.get(url).send().await {
417 Ok(response) => {
418 if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
419 retries += 1;
420 sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
421 continue;
422 }
423 return Ok(response);
424 }
425 Err(_) if retries < MAX_RETRIES => {
426 retries += 1;
427 sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
428 }
429 Err(e) => return Err(FrameworkError::Network(e)),
430 }
431 }
432 }
433}
434
/// Top-level JSON document of a UniProt `/search` response, as parsed by
/// `UniProtClient::search_proteins`.
#[derive(Debug, Deserialize)]
struct UniProtSearchResponse {
    /// Matching entries.
    results: Vec<UniProtEntry>,
}
444
/// A single UniProt entry, as returned by `/search` or by a direct accession lookup.
#[derive(Debug, Deserialize)]
struct UniProtEntry {
    /// Accession (JSON key `primaryAccession`); the only required field.
    #[serde(rename = "primaryAccession")]
    primary_accession: String,
    /// Source organism, if present.
    #[serde(default)]
    organism: Option<UniProtOrganism>,
    /// Protein naming information (JSON key `proteinDescription`), if present.
    #[serde(rename = "proteinDescription", default)]
    protein_description: Option<UniProtDescription>,
    /// Associated genes; empty when the key is absent.
    #[serde(default)]
    genes: Vec<UniProtGene>,
    /// Annotation comments; `entry_to_vector` keeps only those of type `FUNCTION`.
    #[serde(default)]
    comments: Vec<UniProtComment>,
}
458
/// The `organism` object of a UniProt entry.
#[derive(Debug, Deserialize)]
struct UniProtOrganism {
    /// Scientific name (JSON key `scientificName`); empty when the key is absent.
    #[serde(rename = "scientificName", default)]
    scientific_name: String,
}
464
/// The `proteinDescription` object of a UniProt entry.
#[derive(Debug, Deserialize)]
struct UniProtDescription {
    /// Recommended name (JSON key `recommendedName`), if present.
    #[serde(rename = "recommendedName", default)]
    recommended_name: Option<UniProtName>,
}
470
/// A protein name object nested inside `UniProtDescription`.
#[derive(Debug, Deserialize)]
struct UniProtName {
    /// Full name wrapper (JSON key `fullName`), if present.
    #[serde(rename = "fullName", default)]
    full_name: Option<UniProtValue>,
}
476
/// Wrapper object carrying a single string under the `value` key.
#[derive(Debug, Deserialize)]
struct UniProtValue {
    /// The wrapped text; empty when the key is absent.
    #[serde(default)]
    value: String,
}
482
/// One element of a UniProt entry's `genes` array.
#[derive(Debug, Deserialize)]
struct UniProtGene {
    /// Gene name wrapper (JSON key `geneName`), if present.
    #[serde(rename = "geneName", default)]
    gene_name: Option<UniProtValue>,
}
488
/// One element of a UniProt entry's `comments` array.
#[derive(Debug, Deserialize)]
struct UniProtComment {
    /// Comment category (JSON key `commentType`); compared against `"FUNCTION"`.
    #[serde(rename = "commentType", default)]
    comment_type: String,
    /// Text fragments of the comment; empty when the key is absent.
    #[serde(default)]
    texts: Vec<UniProtValue>,
}
496
/// Client for the UniProtKB REST API.
pub struct UniProtClient {
    /// HTTP client used for all requests.
    client: Client,
    /// Base URL that endpoint paths are appended to.
    base_url: String,
    /// Pause inserted before every request.
    rate_limit_delay: Duration,
    /// Turns record text into embedding vectors.
    embedder: Arc<SimpleEmbedder>,
}
504
505impl UniProtClient {
506 pub fn new() -> Result<Self> {
508 let client = Client::builder()
509 .timeout(Duration::from_secs(30))
510 .user_agent("RuVector/1.0 (genomics discovery)")
511 .build()
512 .map_err(FrameworkError::Network)?;
513
514 Ok(Self {
515 client,
516 base_url: "https://rest.uniprot.org/uniprotkb".to_string(),
517 rate_limit_delay: Duration::from_millis(UNIPROT_RATE_LIMIT_MS),
518 embedder: Arc::new(SimpleEmbedder::new(384)),
519 })
520 }
521
522 pub async fn search_proteins(&self, query: &str, limit: usize) -> Result<Vec<SemanticVector>> {
528 let url = format!(
529 "{}/search?query={}&format=json&size={}",
530 self.base_url,
531 urlencoding::encode(query),
532 limit.min(500)
533 );
534
535 sleep(self.rate_limit_delay).await;
536 let response = self.fetch_with_retry(&url).await?;
537 let search_response: UniProtSearchResponse = response.json().await?;
538
539 let mut vectors = Vec::new();
540 for entry in search_response.results {
541 vectors.push(self.entry_to_vector(entry)?);
542 }
543
544 Ok(vectors)
545 }
546
547 pub async fn get_protein(&self, accession: &str) -> Result<Option<SemanticVector>> {
549 let url = format!("{}/{}.json", self.base_url, accession);
550
551 sleep(self.rate_limit_delay).await;
552 let response = self.fetch_with_retry(&url).await?;
553
554 if response.status() == StatusCode::NOT_FOUND {
555 return Ok(None);
556 }
557
558 let entry: UniProtEntry = response.json().await?;
559 Ok(Some(self.entry_to_vector(entry)?))
560 }
561
562 pub async fn search_by_organism(&self, organism: &str) -> Result<Vec<SemanticVector>> {
564 let query = format!("organism:{}", organism);
565 self.search_proteins(&query, 100).await
566 }
567
568 pub async fn search_by_function(&self, function: &str) -> Result<Vec<SemanticVector>> {
570 let query = format!("cc:{}", function); self.search_proteins(&query, 100).await
572 }
573
574 fn entry_to_vector(&self, entry: UniProtEntry) -> Result<SemanticVector> {
576 let protein_name = entry
577 .protein_description
578 .as_ref()
579 .and_then(|pd| pd.recommended_name.as_ref())
580 .and_then(|rn| rn.full_name.as_ref())
581 .map(|fn_| fn_.value.clone())
582 .unwrap_or_else(|| "Unnamed protein".to_string());
583
584 let organism = entry
585 .organism
586 .as_ref()
587 .map(|o| o.scientific_name.clone())
588 .unwrap_or_default();
589
590 let gene_names: Vec<String> = entry
591 .genes
592 .iter()
593 .filter_map(|g| g.gene_name.as_ref().map(|gn| gn.value.clone()))
594 .collect();
595
596 let function_text = entry
598 .comments
599 .iter()
600 .filter(|c| c.comment_type == "FUNCTION")
601 .flat_map(|c| c.texts.iter().map(|t| t.value.clone()))
602 .collect::<Vec<_>>()
603 .join(" ");
604
605 let text = format!(
606 "{} {} {} {}",
607 protein_name, organism, gene_names.join(","), function_text
608 );
609 let embedding = self.embedder.embed_text(&text);
610
611 let mut metadata = HashMap::new();
612 metadata.insert("accession".to_string(), entry.primary_accession.clone());
613 metadata.insert("protein_name".to_string(), protein_name);
614 metadata.insert("organism".to_string(), organism);
615 metadata.insert("genes".to_string(), gene_names.join(", "));
616 metadata.insert("function".to_string(), function_text);
617 metadata.insert("source".to_string(), "uniprot".to_string());
618
619 Ok(SemanticVector {
620 id: format!("UNIPROT:{}", entry.primary_accession),
621 embedding,
622 domain: Domain::Genomics,
623 timestamp: Utc::now(),
624 metadata,
625 })
626 }
627
628 async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
630 let mut retries = 0;
631 loop {
632 match self.client.get(url).send().await {
633 Ok(response) => {
634 if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
635 retries += 1;
636 sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
637 continue;
638 }
639 return Ok(response);
640 }
641 Err(_) if retries < MAX_RETRIES => {
642 retries += 1;
643 sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
644 }
645 Err(e) => return Err(FrameworkError::Network(e)),
646 }
647 }
648 }
649}
650
651impl Default for UniProtClient {
652 fn default() -> Self {
653 Self::new().expect("Failed to create UniProt client")
654 }
655}
656
/// Gene record parsed from an Ensembl `/lookup/id/{id}` response.
///
/// All fields except `id` fall back to their `Default` value when the key is absent.
#[derive(Debug, Deserialize)]
struct EnsemblGene {
    /// Identifier of the record; used for the `ENSEMBL:{id}` vector ID.
    id: String,
    /// Stored as the `symbol` metadata entry.
    #[serde(default)]
    display_name: String,
    /// Free-text description.
    #[serde(default)]
    description: String,
    /// Species the record belongs to.
    #[serde(default)]
    species: String,
    /// Biotype string as sent by the API.
    #[serde(default)]
    biotype: String,
    /// Stored as the `chromosome` metadata entry.
    #[serde(default)]
    seq_region_name: String,
    /// Start coordinate.
    #[serde(default)]
    start: i64,
    /// End coordinate.
    #[serde(default)]
    end: i64,
}
680
/// One element of the array returned by Ensembl `/overlap/id/{id}?feature=variation`.
#[derive(Debug, Deserialize)]
struct EnsemblVariant {
    /// Variant identifier; used for the `VARIANT:{id}` vector ID.
    #[serde(default)]
    id: String,
    /// Stored as the `chromosome` metadata entry.
    #[serde(default)]
    seq_region_name: String,
    /// Stored as the `position` metadata entry.
    #[serde(default)]
    start: i64,
    /// Stored as the `consequence` metadata entry.
    #[serde(default)]
    most_severe_consequence: String,
}
693
/// Top-level JSON document of an Ensembl `/homology/id/{id}` response.
#[derive(Debug, Deserialize)]
struct EnsemblHomologyResponse {
    /// Homology groups; empty when the key is absent.
    #[serde(default)]
    data: Vec<EnsemblHomology>,
}
700
/// One element of the `data` array of a homology response.
#[derive(Debug, Deserialize)]
struct EnsemblHomology {
    /// Individual homology records; empty when the key is absent.
    #[serde(default)]
    homologies: Vec<EnsemblHomologyEntry>,
}
706
/// A single homology record.
#[derive(Debug, Deserialize)]
struct EnsemblHomologyEntry {
    /// The homologous target; defaults to an all-empty target when absent.
    #[serde(default)]
    target: EnsemblTarget,
    /// Homology type string (JSON key `type`).
    #[serde(rename = "type", default)]
    homology_type: String,
}
714
/// Target side of a homology record.
#[derive(Debug, Deserialize, Default)]
struct EnsemblTarget {
    /// Stored as the `homolog_id` metadata entry.
    #[serde(default)]
    id: String,
    /// Species of the target.
    #[serde(default)]
    species: String,
    /// Stored as the `protein_id` metadata entry.
    #[serde(default)]
    protein_id: String,
}
724
/// Client for the Ensembl REST API.
pub struct EnsemblClient {
    /// HTTP client used for all requests.
    client: Client,
    /// Base URL that endpoint paths are appended to.
    base_url: String,
    /// Pause inserted before every request.
    rate_limit_delay: Duration,
    /// Turns record text into embedding vectors.
    embedder: Arc<SimpleEmbedder>,
}
732
733impl EnsemblClient {
734 pub fn new() -> Result<Self> {
736 let client = Client::builder()
737 .timeout(Duration::from_secs(30))
738 .user_agent("RuVector/1.0 (genomics discovery)")
739 .build()
740 .map_err(FrameworkError::Network)?;
741
742 Ok(Self {
743 client,
744 base_url: "https://rest.ensembl.org".to_string(),
745 rate_limit_delay: Duration::from_millis(ENSEMBL_RATE_LIMIT_MS),
746 embedder: Arc::new(SimpleEmbedder::new(384)),
747 })
748 }
749
750 pub async fn get_gene_info(&self, gene_id: &str) -> Result<Option<SemanticVector>> {
755 let url = format!("{}/lookup/id/{}?content-type=application/json", self.base_url, gene_id);
756
757 sleep(self.rate_limit_delay).await;
758 let response = self.fetch_with_retry(&url).await?;
759
760 if response.status() == StatusCode::NOT_FOUND {
761 return Ok(None);
762 }
763
764 let gene: EnsemblGene = response.json().await?;
765
766 let text = format!(
767 "{} {} {} {}",
768 gene.display_name, gene.description, gene.species, gene.biotype
769 );
770 let embedding = self.embedder.embed_text(&text);
771
772 let mut metadata = HashMap::new();
773 metadata.insert("ensembl_id".to_string(), gene.id.clone());
774 metadata.insert("symbol".to_string(), gene.display_name);
775 metadata.insert("description".to_string(), gene.description);
776 metadata.insert("species".to_string(), gene.species);
777 metadata.insert("biotype".to_string(), gene.biotype);
778 metadata.insert("chromosome".to_string(), gene.seq_region_name);
779 metadata.insert("start".to_string(), gene.start.to_string());
780 metadata.insert("end".to_string(), gene.end.to_string());
781 metadata.insert("source".to_string(), "ensembl".to_string());
782
783 Ok(Some(SemanticVector {
784 id: format!("ENSEMBL:{}", gene.id),
785 embedding,
786 domain: Domain::Genomics,
787 timestamp: Utc::now(),
788 metadata,
789 }))
790 }
791
792 pub async fn get_variants(&self, gene_id: &str) -> Result<Vec<SemanticVector>> {
794 let url = format!(
795 "{}/overlap/id/{}?feature=variation;content-type=application/json",
796 self.base_url, gene_id
797 );
798
799 sleep(self.rate_limit_delay).await;
800 let response = self.fetch_with_retry(&url).await?;
801
802 if response.status() == StatusCode::NOT_FOUND {
803 return Ok(Vec::new());
804 }
805
806 let variants: Vec<EnsemblVariant> = response.json().await?;
807
808 let mut vectors = Vec::new();
809 for variant in variants.into_iter().take(100) {
810 let text = format!(
811 "Variant {} chromosome {} position {} consequence {}",
812 variant.id, variant.seq_region_name, variant.start, variant.most_severe_consequence
813 );
814 let embedding = self.embedder.embed_text(&text);
815
816 let mut metadata = HashMap::new();
817 metadata.insert("variant_id".to_string(), variant.id.clone());
818 metadata.insert("chromosome".to_string(), variant.seq_region_name);
819 metadata.insert("position".to_string(), variant.start.to_string());
820 metadata.insert("consequence".to_string(), variant.most_severe_consequence);
821 metadata.insert("gene_id".to_string(), gene_id.to_string());
822 metadata.insert("source".to_string(), "ensembl_variant".to_string());
823
824 vectors.push(SemanticVector {
825 id: format!("VARIANT:{}", variant.id),
826 embedding,
827 domain: Domain::Genomics,
828 timestamp: Utc::now(),
829 metadata,
830 });
831 }
832
833 Ok(vectors)
834 }
835
836 pub async fn get_homologs(&self, gene_id: &str) -> Result<Vec<SemanticVector>> {
838 let url = format!(
839 "{}/homology/id/{}?content-type=application/json;format=condensed",
840 self.base_url, gene_id
841 );
842
843 sleep(self.rate_limit_delay).await;
844 let response = self.fetch_with_retry(&url).await?;
845
846 if response.status() == StatusCode::NOT_FOUND {
847 return Ok(Vec::new());
848 }
849
850 let homology_response: EnsemblHomologyResponse = response.json().await?;
851
852 let mut vectors = Vec::new();
853 for data in homology_response.data {
854 for homology in data.homologies {
855 let text = format!(
856 "Homolog {} in {} type {}",
857 homology.target.id, homology.target.species, homology.homology_type
858 );
859 let embedding = self.embedder.embed_text(&text);
860
861 let mut metadata = HashMap::new();
862 metadata.insert("homolog_id".to_string(), homology.target.id.clone());
863 metadata.insert("species".to_string(), homology.target.species);
864 metadata.insert("protein_id".to_string(), homology.target.protein_id);
865 metadata.insert("homology_type".to_string(), homology.homology_type);
866 metadata.insert("source_gene".to_string(), gene_id.to_string());
867 metadata.insert("source".to_string(), "ensembl_homology".to_string());
868
869 vectors.push(SemanticVector {
870 id: format!("HOMOLOG:{}:{}", gene_id, homology.target.id),
871 embedding,
872 domain: Domain::Genomics,
873 timestamp: Utc::now(),
874 metadata,
875 });
876 }
877 }
878
879 Ok(vectors)
880 }
881
882 async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
884 let mut retries = 0;
885 loop {
886 match self.client.get(url).send().await {
887 Ok(response) => {
888 if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
889 retries += 1;
890 sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
891 continue;
892 }
893 return Ok(response);
894 }
895 Err(_) if retries < MAX_RETRIES => {
896 retries += 1;
897 sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
898 }
899 Err(e) => return Err(FrameworkError::Network(e)),
900 }
901 }
902 }
903}
904
905impl Default for EnsemblClient {
906 fn default() -> Self {
907 Self::new().expect("Failed to create Ensembl client")
908 }
909}
910
/// Top-level JSON document of a GWAS Catalog association search response, as parsed by
/// `GwasClient::search_associations` and `GwasClient::search_by_gene`.
#[derive(Debug, Deserialize)]
struct GwasAssociationResponse {
    /// Contents of the `_embedded` key, if present.
    #[serde(rename = "_embedded")]
    embedded: Option<GwasEmbedded>,
}
921
/// The `_embedded` object of an association search response.
#[derive(Debug, Deserialize)]
struct GwasEmbedded {
    /// Associations returned by the search; empty when the key is absent.
    #[serde(default)]
    associations: Vec<GwasAssociation>,
}
927
928#[derive(Debug, Deserialize)]
929struct GwasAssociation {
930 #[serde(default)]
931 riskAllele: String,
932 #[serde(default)]
933 pvalue: f64,
934 #[serde(default, rename = "trait")]
935 trait_name: String,
936 #[serde(default)]
937 chromosomeName: String,
938 #[serde(default)]
939 chromosomePosition: i64,
940 #[serde(default)]
941 loci: Vec<GwasLocus>,
942}
943
944#[derive(Debug, Deserialize)]
945struct GwasLocus {
946 #[serde(default)]
947 authorReportedGene: String,
948}
949
/// Envelope for a list of GWAS studies under the `_embedded` key.
/// NOTE(review): not referenced by any method in the visible part of this file.
#[derive(Debug, Deserialize)]
struct GwasStudyResponse {
    /// Contents of the `_embedded` key, if present.
    #[serde(rename = "_embedded")]
    embedded: Option<GwasStudyEmbedded>,
}
956
/// The `_embedded` object of a study list response.
#[derive(Debug, Deserialize)]
struct GwasStudyEmbedded {
    /// Studies contained in the response; empty when the key is absent.
    #[serde(default)]
    studies: Vec<GwasStudy>,
}
962
963#[derive(Debug, Deserialize)]
964struct GwasStudy {
965 #[serde(default)]
966 accessionId: String,
967 #[serde(default)]
968 publicationDate: Option<String>,
969 #[serde(default)]
970 diseaseTrait: String,
971 #[serde(default)]
972 initialSampleSize: String,
973}
974
/// Client for the GWAS Catalog REST API.
pub struct GwasClient {
    /// HTTP client used for all requests.
    client: Client,
    /// Base URL that endpoint paths are appended to.
    base_url: String,
    /// Pause inserted before every request.
    rate_limit_delay: Duration,
    /// Turns record text into embedding vectors.
    embedder: Arc<SimpleEmbedder>,
}
982
983impl GwasClient {
984 pub fn new() -> Result<Self> {
986 let client = Client::builder()
987 .timeout(Duration::from_secs(30))
988 .user_agent("RuVector/1.0 (genomics discovery)")
989 .build()
990 .map_err(FrameworkError::Network)?;
991
992 Ok(Self {
993 client,
994 base_url: "https://www.ebi.ac.uk/gwas/rest/api".to_string(),
995 rate_limit_delay: Duration::from_millis(GWAS_RATE_LIMIT_MS),
996 embedder: Arc::new(SimpleEmbedder::new(384)),
997 })
998 }
999
1000 pub async fn search_associations(&self, trait_name: &str) -> Result<Vec<SemanticVector>> {
1005 let url = format!(
1006 "{}/associations/search/findByEfoTrait?efoTrait={}&size=100",
1007 self.base_url,
1008 urlencoding::encode(trait_name)
1009 );
1010
1011 sleep(self.rate_limit_delay).await;
1012 let response = self.fetch_with_retry(&url).await?;
1013
1014 if response.status() == StatusCode::NOT_FOUND {
1015 return Ok(Vec::new());
1016 }
1017
1018 let assoc_response: GwasAssociationResponse = response.json().await?;
1019
1020 let mut vectors = Vec::new();
1021 if let Some(embedded) = assoc_response.embedded {
1022 for assoc in embedded.associations {
1023 let genes: Vec<String> = assoc.loci.iter()
1024 .map(|l| l.authorReportedGene.clone())
1025 .collect();
1026
1027 let text = format!(
1028 "GWAS association trait {} genes {} chromosome {} position {} p-value {}",
1029 assoc.trait_name,
1030 genes.join(","),
1031 assoc.chromosomeName,
1032 assoc.chromosomePosition,
1033 assoc.pvalue
1034 );
1035 let embedding = self.embedder.embed_text(&text);
1036
1037 let mut metadata = HashMap::new();
1038 metadata.insert("trait".to_string(), assoc.trait_name.clone());
1039 metadata.insert("genes".to_string(), genes.join(", "));
1040 metadata.insert("risk_allele".to_string(), assoc.riskAllele.clone());
1041 metadata.insert("pvalue".to_string(), assoc.pvalue.to_string());
1042 metadata.insert("chromosome".to_string(), assoc.chromosomeName.clone());
1043 metadata.insert("position".to_string(), assoc.chromosomePosition.to_string());
1044 metadata.insert("source".to_string(), "gwas_catalog".to_string());
1045
1046 vectors.push(SemanticVector {
1047 id: format!("GWAS:{}_{}_{}", assoc.chromosomeName, assoc.chromosomePosition, assoc.pvalue),
1048 embedding,
1049 domain: Domain::Genomics,
1050 timestamp: Utc::now(),
1051 metadata,
1052 });
1053 }
1054 }
1055
1056 Ok(vectors)
1057 }
1058
1059 pub async fn get_study(&self, study_id: &str) -> Result<Option<SemanticVector>> {
1061 let url = format!("{}/studies/{}", self.base_url, study_id);
1062
1063 sleep(self.rate_limit_delay).await;
1064 let response = self.fetch_with_retry(&url).await?;
1065
1066 if response.status() == StatusCode::NOT_FOUND {
1067 return Ok(None);
1068 }
1069
1070 let study: GwasStudy = response.json().await?;
1071
1072 let text = format!(
1073 "GWAS study {} trait {} sample size {}",
1074 study.accessionId, study.diseaseTrait, study.initialSampleSize
1075 );
1076 let embedding = self.embedder.embed_text(&text);
1077
1078 let timestamp = study
1079 .publicationDate
1080 .as_ref()
1081 .and_then(|d| NaiveDate::parse_from_str(d, "%Y-%m-%d").ok())
1082 .and_then(|d| d.and_hms_opt(0, 0, 0))
1083 .map(|dt| dt.and_utc())
1084 .unwrap_or_else(Utc::now);
1085
1086 let mut metadata = HashMap::new();
1087 metadata.insert("study_id".to_string(), study.accessionId.clone());
1088 metadata.insert("trait".to_string(), study.diseaseTrait);
1089 metadata.insert("sample_size".to_string(), study.initialSampleSize);
1090 metadata.insert("source".to_string(), "gwas_study".to_string());
1091
1092 Ok(Some(SemanticVector {
1093 id: format!("GWAS_STUDY:{}", study.accessionId),
1094 embedding,
1095 domain: Domain::Genomics,
1096 timestamp,
1097 metadata,
1098 }))
1099 }
1100
1101 pub async fn search_by_gene(&self, gene: &str) -> Result<Vec<SemanticVector>> {
1103 let url = format!(
1104 "{}/associations/search/findByGene?geneName={}&size=100",
1105 self.base_url,
1106 urlencoding::encode(gene)
1107 );
1108
1109 sleep(self.rate_limit_delay).await;
1110 let response = self.fetch_with_retry(&url).await?;
1111
1112 if response.status() == StatusCode::NOT_FOUND {
1113 return Ok(Vec::new());
1114 }
1115
1116 let assoc_response: GwasAssociationResponse = response.json().await?;
1117
1118 let mut vectors = Vec::new();
1119 if let Some(embedded) = assoc_response.embedded {
1120 for assoc in embedded.associations {
1121 let text = format!(
1122 "Gene {} associated with trait {} p-value {}",
1123 gene, assoc.trait_name, assoc.pvalue
1124 );
1125 let embedding = self.embedder.embed_text(&text);
1126
1127 let mut metadata = HashMap::new();
1128 metadata.insert("gene".to_string(), gene.to_string());
1129 metadata.insert("trait".to_string(), assoc.trait_name.clone());
1130 metadata.insert("pvalue".to_string(), assoc.pvalue.to_string());
1131 metadata.insert("chromosome".to_string(), assoc.chromosomeName.clone());
1132 metadata.insert("source".to_string(), "gwas_gene_association".to_string());
1133
1134 vectors.push(SemanticVector {
1135 id: format!("GWAS_GENE:{}:{}", gene, assoc.trait_name),
1136 embedding,
1137 domain: Domain::Genomics,
1138 timestamp: Utc::now(),
1139 metadata,
1140 });
1141 }
1142 }
1143
1144 Ok(vectors)
1145 }
1146
1147 async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
1149 let mut retries = 0;
1150 loop {
1151 match self.client.get(url).send().await {
1152 Ok(response) => {
1153 if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
1154 retries += 1;
1155 sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
1156 continue;
1157 }
1158 return Ok(response);
1159 }
1160 Err(_) if retries < MAX_RETRIES => {
1161 retries += 1;
1162 sleep(Duration::from_millis(RETRY_DELAY_MS * 2_u64.pow(retries))).await;
1163 }
1164 Err(e) => return Err(FrameworkError::Network(e)),
1165 }
1166 }
1167 }
1168}
1169
1170impl Default for GwasClient {
1171 fn default() -> Self {
1172 Self::new().expect("Failed to create GWAS client")
1173 }
1174}
1175
#[cfg(test)]
mod tests {
    use super::*;

    /// An NCBI client can be constructed without an API key.
    #[tokio::test]
    async fn test_ncbi_client_creation() {
        assert!(NcbiClient::new(None).is_ok());
    }

    /// The NCBI rate-limit pause depends on whether an API key is supplied.
    #[tokio::test]
    async fn test_ncbi_rate_limiting() {
        let anonymous = NcbiClient::new(None).unwrap();
        let expected_anonymous = Duration::from_millis(NCBI_RATE_LIMIT_MS);
        assert_eq!(anonymous.rate_limit_delay, expected_anonymous);

        let keyed = NcbiClient::new(Some("test_key".to_string())).unwrap();
        let expected_keyed = Duration::from_millis(NCBI_WITH_KEY_RATE_LIMIT_MS);
        assert_eq!(keyed.rate_limit_delay, expected_keyed);
    }

    /// A UniProt client can be constructed.
    #[tokio::test]
    async fn test_uniprot_client_creation() {
        assert!(UniProtClient::new().is_ok());
    }

    /// An Ensembl client can be constructed.
    #[tokio::test]
    async fn test_ensembl_client_creation() {
        assert!(EnsemblClient::new().is_ok());
    }

    /// A GWAS client can be constructed.
    #[tokio::test]
    async fn test_gwas_client_creation() {
        assert!(GwasClient::new().is_ok());
    }

    /// Compile-time check that the `Genomics` domain variant exists.
    #[test]
    fn test_genomics_domain() {
        let _domain = Domain::Genomics;
    }
}