1use std::collections::HashMap;
17use std::sync::Arc;
18use std::time::Duration;
19
20use chrono::{Datelike, NaiveDate, Utc};
21use reqwest::{Client, StatusCode};
22use serde::{Deserialize, Serialize};
23use tokio::time::sleep;
24
25use crate::api_clients::SimpleEmbedder;
26use crate::ruvector_native::{Domain, SemanticVector};
27use crate::{FrameworkError, Result};
28
/// Delay, in milliseconds, that `CensusClient` sleeps before each request.
const CENSUS_RATE_LIMIT_MS: u64 = 1200;
/// Delay, in milliseconds, that `DataGovClient` sleeps before each request.
const DATAGOV_RATE_LIMIT_MS: u64 = 1000;
/// Delay, in milliseconds, that `EuOpenDataClient` sleeps before each request.
const EU_OPENDATA_RATE_LIMIT_MS: u64 = 500;
/// Delay, in milliseconds, that `UkGovClient` sleeps before each request.
const UK_GOV_RATE_LIMIT_MS: u64 = 500;
/// NOTE(review): presumably the per-request delay for a World Bank client
/// defined elsewhere in this file; confirm against its constructor.
const WORLDBANK_RATE_LIMIT_MS: u64 = 100;
/// NOTE(review): presumably the per-request delay for a UN Data client
/// defined elsewhere in this file; confirm against its constructor.
const UNDATA_RATE_LIMIT_MS: u64 = 1000;
/// Upper bound on the number of retries each `fetch_with_retry` performs.
const MAX_RETRIES: u32 = 3;
/// Base back-off in milliseconds; `fetch_with_retry` multiplies it by the
/// current retry count before sleeping.
const RETRY_DELAY_MS: u64 = 1000;
/// Embedding dimension passed to `SimpleEmbedder::new` by the `new()` constructors.
const DEFAULT_EMBEDDING_DIM: usize = 256;
39
/// Deserialization target holding a `dataset` array of JSON rows.
///
/// NOTE(review): no code visible in this part of the file references this
/// type; confirm whether it is still needed.
#[derive(Debug, Deserialize)]
struct CensusDataResponse {
    /// Rows of arbitrary JSON values; an absent key yields an empty vector.
    #[serde(default)]
    dataset: Vec<Vec<serde_json::Value>>,
}
50
/// Description of a single Census dataset.
///
/// Every field falls back to its type's default when the key is missing.
/// NOTE(review): no code visible in this part of the file references this
/// type; confirm whether it is still needed.
#[derive(Debug, Deserialize, Serialize, Clone)]
struct CensusDataset {
    /// Read from / written to the `c_vintage` key.
    #[serde(rename = "c_vintage", default)]
    vintage: String,
    /// Read from / written to the `c_dataset` key.
    #[serde(rename = "c_dataset", default)]
    dataset: Vec<String>,
    /// Dataset title.
    #[serde(default)]
    title: String,
    /// Dataset description.
    #[serde(default)]
    description: String,
}
63
64#[derive(Debug, Deserialize, Clone)]
66struct CensusVariable {
67 #[serde(default)]
68 name: String,
69 #[serde(default)]
70 label: String,
71 #[serde(default)]
72 concept: String,
73 #[serde(default)]
74 predicateType: String,
75}
76
/// Client for the Census data API that converts responses into
/// `SemanticVector`s.
pub struct CensusClient {
    /// HTTP client used for every request.
    client: Client,
    /// Root URL that all endpoint paths are appended to.
    base_url: String,
    /// Optional API key; when present it is appended as `&key=...` to data
    /// queries.
    api_key: Option<String>,
    /// Pause applied before each outgoing request.
    rate_limit_delay: Duration,
    /// Produces the embedding stored in each returned vector.
    embedder: Arc<SimpleEmbedder>,
    /// When `true`, public methods return canned data instead of calling the
    /// network.
    use_mock: bool,
}
102
103impl CensusClient {
    /// Creates a client with `DEFAULT_EMBEDDING_DIM` and mock mode disabled.
    ///
    /// `api_key` is forwarded unchanged to `with_config`.
    pub fn new(api_key: Option<String>) -> Self {
        Self::with_config(api_key, DEFAULT_EMBEDDING_DIM, false)
    }
112
113 pub fn with_config(api_key: Option<String>, embedding_dim: usize, use_mock: bool) -> Self {
120 Self {
121 client: Client::builder()
122 .user_agent("RuVector-Discovery/1.0")
123 .timeout(Duration::from_secs(30))
124 .build()
125 .expect("Failed to create HTTP client"),
126 base_url: "https://api.census.gov/data".to_string(),
127 api_key,
128 rate_limit_delay: Duration::from_millis(CENSUS_RATE_LIMIT_MS),
129 embedder: Arc::new(SimpleEmbedder::new(embedding_dim)),
130 use_mock,
131 }
132 }
133
134 pub async fn get_population(
149 &self,
150 year: u32,
151 geography: &str,
152 ) -> Result<Vec<SemanticVector>> {
153 let url = if year >= 2020 {
154 format!(
155 "{}/{}/dec/pl?get=NAME,P1_001N&for={}",
156 self.base_url, year, geography
157 )
158 } else {
159 format!(
160 "{}/{}/dec/sf1?get=NAME,P001001&for={}",
161 self.base_url, year, geography
162 )
163 };
164
165 self.fetch_census_data(&url, &format!("population_{}", year))
166 .await
167 }
168
169 pub async fn get_acs5(
186 &self,
187 year: u32,
188 variables: Vec<&str>,
189 geography: &str,
190 ) -> Result<Vec<SemanticVector>> {
191 let vars = variables.join(",");
192 let url = format!(
193 "{}/{}/acs/acs5?get=NAME,{}&for={}",
194 self.base_url, year, vars, geography
195 );
196
197 self.fetch_census_data(&url, &format!("acs5_{}", year))
198 .await
199 }
200
201 pub async fn get_available_datasets(&self) -> Result<Vec<SemanticVector>> {
211 if self.use_mock {
212 return Ok(self.get_mock_datasets());
213 }
214
215 let url = format!("{}.json", self.base_url);
216
217 sleep(self.rate_limit_delay).await;
218 let response = self.fetch_with_retry(&url).await?;
219 let text = response.text().await?;
220
221 let json: serde_json::Value = serde_json::from_str(&text)?;
223 let mut vectors = Vec::new();
224
225 if let Some(dataset_obj) = json.get("dataset") {
226 if let Some(datasets) = dataset_obj.as_array() {
227 for (idx, ds) in datasets.iter().enumerate() {
228 if let Some(title) = ds.get("title").and_then(|t| t.as_str()) {
229 let description = ds
230 .get("description")
231 .and_then(|d| d.as_str())
232 .unwrap_or("");
233 let vintage = ds.get("c_vintage").and_then(|v| v.as_str()).unwrap_or("");
234
235 let text = format!("{} {} {}", title, description, vintage);
236 let embedding = self.embedder.embed_text(&text);
237
238 let mut metadata = HashMap::new();
239 metadata.insert("title".to_string(), title.to_string());
240 metadata.insert("description".to_string(), description.to_string());
241 metadata.insert("vintage".to_string(), vintage.to_string());
242 metadata.insert("source".to_string(), "census_catalog".to_string());
243
244 vectors.push(SemanticVector {
245 id: format!("CENSUS_DS:{}", idx),
246 embedding,
247 domain: Domain::Government,
248 timestamp: Utc::now(),
249 metadata,
250 });
251 }
252 }
253 }
254 }
255
256 Ok(vectors)
257 }
258
259 pub async fn search_variables(
270 &self,
271 dataset: &str,
272 query: &str,
273 ) -> Result<Vec<SemanticVector>> {
274 if self.use_mock {
275 return Ok(self.get_mock_variables(query));
276 }
277
278 let url = format!("{}/2021/{}/variables.json", self.base_url, dataset);
279
280 sleep(self.rate_limit_delay).await;
281 let response = self.fetch_with_retry(&url).await?;
282 let text = response.text().await?;
283
284 let json: serde_json::Value = serde_json::from_str(&text)?;
285 let mut vectors = Vec::new();
286
287 if let Some(variables) = json.get("variables").and_then(|v| v.as_object()) {
288 for (var_name, var_data) in variables.iter() {
289 if let Some(label) = var_data.get("label").and_then(|l| l.as_str()) {
290 if !label.to_lowercase().contains(&query.to_lowercase())
292 && !var_name.to_lowercase().contains(&query.to_lowercase())
293 {
294 continue;
295 }
296
297 let concept = var_data
298 .get("concept")
299 .and_then(|c| c.as_str())
300 .unwrap_or("");
301
302 let text = format!("{} {} {}", var_name, label, concept);
303 let embedding = self.embedder.embed_text(&text);
304
305 let mut metadata = HashMap::new();
306 metadata.insert("variable_name".to_string(), var_name.clone());
307 metadata.insert("label".to_string(), label.to_string());
308 metadata.insert("concept".to_string(), concept.to_string());
309 metadata.insert("dataset".to_string(), dataset.to_string());
310 metadata.insert("source".to_string(), "census_variables".to_string());
311
312 vectors.push(SemanticVector {
313 id: format!("CENSUS_VAR:{}:{}", dataset, var_name),
314 embedding,
315 domain: Domain::Government,
316 timestamp: Utc::now(),
317 metadata,
318 });
319
320 if vectors.len() >= 50 {
321 break; }
323 }
324 }
325 }
326
327 Ok(vectors)
328 }
329
330 async fn fetch_census_data(&self, url: &str, dataset_name: &str) -> Result<Vec<SemanticVector>> {
332 if self.use_mock {
333 return Ok(self.get_mock_census_data(dataset_name));
334 }
335
336 let mut full_url = url.to_string();
337 if let Some(key) = &self.api_key {
338 full_url.push_str(&format!("&key={}", key));
339 }
340
341 sleep(self.rate_limit_delay).await;
342 let response = self.fetch_with_retry(&full_url).await?;
343 let text = response.text().await?;
344
345 let data: Vec<Vec<serde_json::Value>> = serde_json::from_str(&text)?;
347
348 if data.is_empty() {
349 return Ok(Vec::new());
350 }
351
352 let headers = &data[0];
353 let mut vectors = Vec::new();
354
355 for (idx, row) in data.iter().skip(1).enumerate() {
356 let mut record = HashMap::new();
357 for (i, value) in row.iter().enumerate() {
358 if let Some(header) = headers.get(i).and_then(|h| h.as_str()) {
359 record.insert(header.to_string(), value.to_string());
360 }
361 }
362
363 let name = record.get("NAME").map(|s| s.as_str()).unwrap_or("Unknown");
364 let text = format!("{} {}", dataset_name, name);
365 let embedding = self.embedder.embed_text(&text);
366
367 let mut metadata = record.clone();
368 metadata.insert("source".to_string(), "census".to_string());
369 metadata.insert("dataset".to_string(), dataset_name.to_string());
370
371 vectors.push(SemanticVector {
372 id: format!("CENSUS:{}:{}", dataset_name, idx),
373 embedding,
374 domain: Domain::Government,
375 timestamp: Utc::now(),
376 metadata,
377 });
378 }
379
380 Ok(vectors)
381 }
382
383 fn get_mock_census_data(&self, dataset_name: &str) -> Vec<SemanticVector> {
385 let mut vectors = Vec::new();
386 let items = vec![
387 ("California", "39538223"),
388 ("Texas", "29145505"),
389 ("Florida", "21538187"),
390 ];
391
392 for (idx, (name, population)) in items.iter().enumerate() {
393 let text = format!("{} {} population {}", dataset_name, name, population);
394 let embedding = self.embedder.embed_text(&text);
395
396 let mut metadata = HashMap::new();
397 metadata.insert("NAME".to_string(), name.to_string());
398 metadata.insert("population".to_string(), population.to_string());
399 metadata.insert("source".to_string(), "census_mock".to_string());
400
401 vectors.push(SemanticVector {
402 id: format!("CENSUS_MOCK:{}:{}", dataset_name, idx),
403 embedding,
404 domain: Domain::Government,
405 timestamp: Utc::now(),
406 metadata,
407 });
408 }
409
410 vectors
411 }
412
413 fn get_mock_datasets(&self) -> Vec<SemanticVector> {
415 vec![self.create_mock_dataset(
416 "Decennial Census",
417 "Population and housing counts",
418 "2020",
419 )]
420 }
421
422 fn get_mock_variables(&self, query: &str) -> Vec<SemanticVector> {
424 vec![self.create_mock_variable(
425 "B19013_001E",
426 "Median Household Income",
427 "Income",
428 )]
429 }
430
431 fn create_mock_dataset(&self, title: &str, description: &str, vintage: &str) -> SemanticVector {
432 let text = format!("{} {} {}", title, description, vintage);
433 let embedding = self.embedder.embed_text(&text);
434
435 let mut metadata = HashMap::new();
436 metadata.insert("title".to_string(), title.to_string());
437 metadata.insert("description".to_string(), description.to_string());
438 metadata.insert("vintage".to_string(), vintage.to_string());
439 metadata.insert("source".to_string(), "census_mock".to_string());
440
441 SemanticVector {
442 id: "CENSUS_MOCK_DS:1".to_string(),
443 embedding,
444 domain: Domain::Government,
445 timestamp: Utc::now(),
446 metadata,
447 }
448 }
449
450 fn create_mock_variable(&self, name: &str, label: &str, concept: &str) -> SemanticVector {
451 let text = format!("{} {} {}", name, label, concept);
452 let embedding = self.embedder.embed_text(&text);
453
454 let mut metadata = HashMap::new();
455 metadata.insert("variable_name".to_string(), name.to_string());
456 metadata.insert("label".to_string(), label.to_string());
457 metadata.insert("concept".to_string(), concept.to_string());
458 metadata.insert("source".to_string(), "census_mock".to_string());
459
460 SemanticVector {
461 id: format!("CENSUS_MOCK_VAR:{}", name),
462 embedding,
463 domain: Domain::Government,
464 timestamp: Utc::now(),
465 metadata,
466 }
467 }
468
469 async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
471 let mut retries = 0;
472 loop {
473 match self.client.get(url).send().await {
474 Ok(response) => {
475 if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
476 retries += 1;
477 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
478 continue;
479 }
480 if !response.status().is_success() && self.use_mock {
481 return Err(FrameworkError::Network(
482 reqwest::Error::from(response.error_for_status().unwrap_err()),
483 ));
484 }
485 return Ok(response);
486 }
487 Err(e) if retries < MAX_RETRIES && self.use_mock => {
488 retries += 1;
489 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
490 }
491 Err(e) => return Err(FrameworkError::Network(e)),
492 }
493 }
494 }
495}
496
/// Envelope deserialized from the `action/package_search` endpoint.
#[derive(Debug, Deserialize)]
struct DataGovSearchResponse {
    /// Defaults to `false` when the key is missing.
    #[serde(default)]
    success: bool,
    /// Search payload; required, so deserialization fails if it is absent.
    result: DataGovResult,
}
508
/// Payload of a dataset search: a count and the returned datasets.
#[derive(Debug, Deserialize)]
struct DataGovResult {
    /// Defaults to `0` when the key is missing.
    #[serde(default)]
    count: u64,
    /// Returned datasets; defaults to an empty list when the key is missing.
    #[serde(default)]
    results: Vec<DataGovDataset>,
}
516
/// One dataset record, deserialized from search results and from the
/// `result` object of `action/package_show`.
///
/// Only `id` is required; every other field defaults when missing.
#[derive(Debug, Deserialize)]
struct DataGovDataset {
    /// Dataset identifier, used to build the vector id `DATAGOV:<id>`.
    id: String,
    /// Dataset title.
    #[serde(default)]
    title: String,
    /// Free-text notes; exposed as the `description` metadata entry.
    #[serde(default)]
    notes: String,
    /// Publishing organization, if any.
    #[serde(default)]
    organization: Option<DataGovOrganization>,
    /// Tags attached to the dataset.
    #[serde(default)]
    tags: Vec<DataGovTag>,
    /// Exposed as the `created` metadata entry.
    #[serde(default)]
    metadata_created: String,
    /// Exposed as the `modified` metadata entry.
    #[serde(default)]
    metadata_modified: String,
}
533
/// Organization summary embedded in a dataset record.
#[derive(Debug, Deserialize)]
struct DataGovOrganization {
    /// Organization name; defaults to an empty string when missing.
    #[serde(default)]
    name: String,
    /// Organization title; this is the value shown as a dataset's
    /// `organization` metadata entry.
    #[serde(default)]
    title: String,
}
541
/// A tag attached to a dataset record.
#[derive(Debug, Deserialize)]
struct DataGovTag {
    /// Tag text; defaults to an empty string when missing.
    #[serde(default)]
    name: String,
}
547
/// Envelope deserialized from `action/organization_list?all_fields=true`.
#[derive(Debug, Deserialize)]
struct DataGovOrganizationList {
    /// Defaults to `false` when the key is missing.
    #[serde(default)]
    success: bool,
    /// Listed organizations; defaults to an empty list when missing.
    #[serde(default)]
    result: Vec<DataGovOrganizationInfo>,
}
555
/// Full organization record, deserialized from the organization list and
/// from the `result` object of `action/organization_show`.
///
/// `id`, `name` and `title` are required; deserialization fails without them.
#[derive(Debug, Deserialize)]
struct DataGovOrganizationInfo {
    /// Organization identifier, used to build the id `DATAGOV_ORG:<id>`.
    id: String,
    /// Organization name.
    name: String,
    /// Organization title.
    title: String,
    /// Description; defaults to an empty string when missing.
    #[serde(default)]
    description: String,
    /// Package count; defaults to `0` when missing.
    #[serde(default)]
    package_count: u64,
}
566
/// Client for the Data.gov catalog API that converts responses into
/// `SemanticVector`s.
pub struct DataGovClient {
    /// HTTP client used for every request.
    client: Client,
    /// Root URL that all `action/...` paths are appended to.
    base_url: String,
    /// Pause applied before each outgoing request.
    rate_limit_delay: Duration,
    /// Produces the embedding stored in each returned vector.
    embedder: Arc<SimpleEmbedder>,
    /// When `true`, public methods return canned data instead of calling the
    /// network.
    use_mock: bool,
}
590
591impl DataGovClient {
    /// Creates a client with `DEFAULT_EMBEDDING_DIM` and mock mode disabled.
    pub fn new() -> Self {
        Self::with_config(DEFAULT_EMBEDDING_DIM, false)
    }
596
597 pub fn with_config(embedding_dim: usize, use_mock: bool) -> Self {
599 Self {
600 client: Client::builder()
601 .user_agent("RuVector-Discovery/1.0")
602 .timeout(Duration::from_secs(30))
603 .build()
604 .expect("Failed to create HTTP client"),
605 base_url: "https://catalog.data.gov/api/3".to_string(),
606 rate_limit_delay: Duration::from_millis(DATAGOV_RATE_LIMIT_MS),
607 embedder: Arc::new(SimpleEmbedder::new(embedding_dim)),
608 use_mock,
609 }
610 }
611
612 pub async fn search_datasets(&self, query: &str) -> Result<Vec<SemanticVector>> {
622 if self.use_mock {
623 return Ok(self.get_mock_datagov_datasets(query));
624 }
625
626 let url = format!(
627 "{}/action/package_search?q={}&rows=50",
628 self.base_url,
629 urlencoding::encode(query)
630 );
631
632 sleep(self.rate_limit_delay).await;
633 let response = self.fetch_with_retry(&url).await?;
634 let search_response: DataGovSearchResponse = response.json().await?;
635
636 let mut vectors = Vec::new();
637 for dataset in search_response.result.results {
638 let org_name = dataset
639 .organization
640 .as_ref()
641 .map(|o| o.title.as_str())
642 .unwrap_or("Unknown");
643
644 let tags = dataset
645 .tags
646 .iter()
647 .map(|t| t.name.clone())
648 .collect::<Vec<_>>()
649 .join(", ");
650
651 let text = format!("{} {} {} {}", dataset.title, dataset.notes, org_name, tags);
652 let embedding = self.embedder.embed_text(&text);
653
654 let mut metadata = HashMap::new();
655 metadata.insert("title".to_string(), dataset.title);
656 metadata.insert("description".to_string(), dataset.notes);
657 metadata.insert("organization".to_string(), org_name.to_string());
658 metadata.insert("tags".to_string(), tags);
659 metadata.insert("created".to_string(), dataset.metadata_created);
660 metadata.insert("modified".to_string(), dataset.metadata_modified);
661 metadata.insert("source".to_string(), "datagov".to_string());
662
663 vectors.push(SemanticVector {
664 id: format!("DATAGOV:{}", dataset.id),
665 embedding,
666 domain: Domain::Government,
667 timestamp: Utc::now(),
668 metadata,
669 });
670 }
671
672 Ok(vectors)
673 }
674
675 pub async fn get_dataset(&self, id: &str) -> Result<Option<SemanticVector>> {
685 if self.use_mock {
686 return Ok(Some(self.get_mock_datagov_datasets("mock").into_iter().next().unwrap()));
687 }
688
689 let url = format!("{}/action/package_show?id={}", self.base_url, id);
690
691 sleep(self.rate_limit_delay).await;
692 let response = self.fetch_with_retry(&url).await?;
693 let json: serde_json::Value = response.json().await?;
694
695 if let Some(result) = json.get("result") {
696 let dataset: DataGovDataset = serde_json::from_value(result.clone())?;
697
698 let org_name = dataset
699 .organization
700 .as_ref()
701 .map(|o| o.title.as_str())
702 .unwrap_or("Unknown");
703
704 let text = format!("{} {}", dataset.title, dataset.notes);
705 let embedding = self.embedder.embed_text(&text);
706
707 let mut metadata = HashMap::new();
708 metadata.insert("title".to_string(), dataset.title);
709 metadata.insert("description".to_string(), dataset.notes);
710 metadata.insert("organization".to_string(), org_name.to_string());
711 metadata.insert("source".to_string(), "datagov".to_string());
712
713 return Ok(Some(SemanticVector {
714 id: format!("DATAGOV:{}", dataset.id),
715 embedding,
716 domain: Domain::Government,
717 timestamp: Utc::now(),
718 metadata,
719 }));
720 }
721
722 Ok(None)
723 }
724
725 pub async fn list_organizations(&self) -> Result<Vec<SemanticVector>> {
732 if self.use_mock {
733 return Ok(self.get_mock_organizations());
734 }
735
736 let url = format!("{}/action/organization_list?all_fields=true", self.base_url);
737
738 sleep(self.rate_limit_delay).await;
739 let response = self.fetch_with_retry(&url).await?;
740 let org_list: DataGovOrganizationList = response.json().await?;
741
742 let mut vectors = Vec::new();
743 for org in org_list.result.iter().take(100) {
744 let text = format!("{} {}", org.title, org.description);
745 let embedding = self.embedder.embed_text(&text);
746
747 let mut metadata = HashMap::new();
748 metadata.insert("name".to_string(), org.name.clone());
749 metadata.insert("title".to_string(), org.title.clone());
750 metadata.insert("description".to_string(), org.description.clone());
751 metadata.insert("package_count".to_string(), org.package_count.to_string());
752 metadata.insert("source".to_string(), "datagov_org".to_string());
753
754 vectors.push(SemanticVector {
755 id: format!("DATAGOV_ORG:{}", org.id),
756 embedding,
757 domain: Domain::Government,
758 timestamp: Utc::now(),
759 metadata,
760 });
761 }
762
763 Ok(vectors)
764 }
765
766 pub async fn get_organization(&self, id: &str) -> Result<Option<SemanticVector>> {
776 if self.use_mock {
777 return Ok(Some(self.get_mock_organizations().into_iter().next().unwrap()));
778 }
779
780 let url = format!("{}/action/organization_show?id={}", self.base_url, id);
781
782 sleep(self.rate_limit_delay).await;
783 let response = self.fetch_with_retry(&url).await?;
784 let json: serde_json::Value = response.json().await?;
785
786 if let Some(result) = json.get("result") {
787 let org: DataGovOrganizationInfo = serde_json::from_value(result.clone())?;
788
789 let text = format!("{} {}", org.title, org.description);
790 let embedding = self.embedder.embed_text(&text);
791
792 let mut metadata = HashMap::new();
793 metadata.insert("name".to_string(), org.name);
794 metadata.insert("title".to_string(), org.title);
795 metadata.insert("description".to_string(), org.description);
796 metadata.insert("source".to_string(), "datagov_org".to_string());
797
798 return Ok(Some(SemanticVector {
799 id: format!("DATAGOV_ORG:{}", org.id),
800 embedding,
801 domain: Domain::Government,
802 timestamp: Utc::now(),
803 metadata,
804 }));
805 }
806
807 Ok(None)
808 }
809
810 pub async fn list_tags(&self) -> Result<Vec<SemanticVector>> {
817 if self.use_mock {
818 return Ok(self.get_mock_tags());
819 }
820
821 let url = format!("{}/action/tag_list", self.base_url);
822
823 sleep(self.rate_limit_delay).await;
824 let response = self.fetch_with_retry(&url).await?;
825 let json: serde_json::Value = response.json().await?;
826
827 let mut vectors = Vec::new();
828 if let Some(result) = json.get("result").and_then(|r| r.as_array()) {
829 for (idx, tag) in result.iter().take(100).enumerate() {
830 if let Some(tag_name) = tag.as_str() {
831 let embedding = self.embedder.embed_text(tag_name);
832
833 let mut metadata = HashMap::new();
834 metadata.insert("tag".to_string(), tag_name.to_string());
835 metadata.insert("source".to_string(), "datagov_tag".to_string());
836
837 vectors.push(SemanticVector {
838 id: format!("DATAGOV_TAG:{}", idx),
839 embedding,
840 domain: Domain::Government,
841 timestamp: Utc::now(),
842 metadata,
843 });
844 }
845 }
846 }
847
848 Ok(vectors)
849 }
850
851 fn get_mock_datagov_datasets(&self, query: &str) -> Vec<SemanticVector> {
852 let text = format!("Mock dataset for {}", query);
853 let embedding = self.embedder.embed_text(&text);
854
855 let mut metadata = HashMap::new();
856 metadata.insert("title".to_string(), "Mock Dataset".to_string());
857 metadata.insert("description".to_string(), "Mock data for testing".to_string());
858 metadata.insert("source".to_string(), "datagov_mock".to_string());
859
860 vec![SemanticVector {
861 id: "DATAGOV_MOCK:1".to_string(),
862 embedding,
863 domain: Domain::Government,
864 timestamp: Utc::now(),
865 metadata,
866 }]
867 }
868
869 fn get_mock_organizations(&self) -> Vec<SemanticVector> {
870 let text = "NASA National Aeronautics and Space Administration";
871 let embedding = self.embedder.embed_text(text);
872
873 let mut metadata = HashMap::new();
874 metadata.insert("name".to_string(), "nasa-gov".to_string());
875 metadata.insert("title".to_string(), "NASA".to_string());
876 metadata.insert("source".to_string(), "datagov_mock".to_string());
877
878 vec![SemanticVector {
879 id: "DATAGOV_ORG_MOCK:1".to_string(),
880 embedding,
881 domain: Domain::Government,
882 timestamp: Utc::now(),
883 metadata,
884 }]
885 }
886
887 fn get_mock_tags(&self) -> Vec<SemanticVector> {
888 let text = "climate";
889 let embedding = self.embedder.embed_text(text);
890
891 let mut metadata = HashMap::new();
892 metadata.insert("tag".to_string(), "climate".to_string());
893 metadata.insert("source".to_string(), "datagov_mock".to_string());
894
895 vec![SemanticVector {
896 id: "DATAGOV_TAG_MOCK:1".to_string(),
897 embedding,
898 domain: Domain::Government,
899 timestamp: Utc::now(),
900 metadata,
901 }]
902 }
903
904 async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
905 let mut retries = 0;
906 loop {
907 match self.client.get(url).send().await {
908 Ok(response) => {
909 if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
910 retries += 1;
911 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
912 continue;
913 }
914 return Ok(response);
915 }
916 Err(e) if retries < MAX_RETRIES => {
917 retries += 1;
918 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
919 }
920 Err(e) => return Err(FrameworkError::Network(e)),
921 }
922 }
923 }
924}
925
/// `Default` delegates to [`DataGovClient::new`].
impl Default for DataGovClient {
    fn default() -> Self {
        Self::new()
    }
}
931
/// Envelope with a `result` object for dataset searches.
///
/// NOTE(review): the visible client methods parse responses through
/// `serde_json::Value` and do not reference this type; confirm it is needed.
#[derive(Debug, Deserialize)]
struct EuOpenDataResponse {
    /// Defaults to an empty `EuOpenDataResult` when the key is missing.
    #[serde(default)]
    result: EuOpenDataResult,
}
942
/// Search payload: a count and the returned datasets.
#[derive(Debug, Default, Deserialize)]
struct EuOpenDataResult {
    /// Defaults to `0` when the key is missing.
    #[serde(default)]
    count: u64,
    /// Returned datasets; defaults to an empty list when the key is missing.
    #[serde(default)]
    results: Vec<EuDataset>,
}
950
/// One dataset record; every field defaults when its key is missing.
#[derive(Debug, Deserialize)]
struct EuDataset {
    /// Dataset identifier.
    #[serde(default)]
    id: String,
    /// Title as a string-to-string map.
    /// NOTE(review): presumably keyed by language code (the client reads the
    /// `en` entry of `title`); confirm.
    #[serde(default)]
    title: HashMap<String, String>,
    /// Description as a string-to-string map; presumably keyed like `title`.
    #[serde(default)]
    description: HashMap<String, String>,
    /// Keywords attached to the dataset.
    #[serde(default)]
    keywords: Vec<String>,
    /// Catalog reference, if present.
    #[serde(default)]
    catalog: Option<String>,
}
964
/// Client for the data.europa.eu search API that converts responses into
/// `SemanticVector`s.
pub struct EuOpenDataClient {
    /// HTTP client used for every request.
    client: Client,
    /// Root URL that all endpoint paths are appended to.
    base_url: String,
    /// Pause applied before each outgoing request.
    rate_limit_delay: Duration,
    /// Produces the embedding stored in each returned vector.
    embedder: Arc<SimpleEmbedder>,
    /// When `true`, public methods return canned data instead of calling the
    /// network.
    use_mock: bool,
}
987
988impl EuOpenDataClient {
    /// Creates a client with `DEFAULT_EMBEDDING_DIM` and mock mode disabled.
    pub fn new() -> Self {
        Self::with_config(DEFAULT_EMBEDDING_DIM, false)
    }
993
994 pub fn with_config(embedding_dim: usize, use_mock: bool) -> Self {
996 Self {
997 client: Client::builder()
998 .user_agent("RuVector-Discovery/1.0")
999 .timeout(Duration::from_secs(30))
1000 .build()
1001 .expect("Failed to create HTTP client"),
1002 base_url: "https://data.europa.eu/api/hub/search".to_string(),
1003 rate_limit_delay: Duration::from_millis(EU_OPENDATA_RATE_LIMIT_MS),
1004 embedder: Arc::new(SimpleEmbedder::new(embedding_dim)),
1005 use_mock,
1006 }
1007 }
1008
1009 pub async fn search_datasets(&self, query: &str) -> Result<Vec<SemanticVector>> {
1019 if self.use_mock {
1020 return Ok(self.get_mock_eu_datasets(query));
1021 }
1022
1023 let url = format!(
1024 "{}/datasets?q={}&limit=50",
1025 self.base_url,
1026 urlencoding::encode(query)
1027 );
1028
1029 sleep(self.rate_limit_delay).await;
1030 let response = self.fetch_with_retry(&url).await?;
1031 let json: serde_json::Value = response.json().await?;
1032
1033 let mut vectors = Vec::new();
1034
1035 if let Some(results) = json.get("result").and_then(|r| r.get("results")).and_then(|r| r.as_array()) {
1036 for dataset in results {
1037 let id = dataset.get("id").and_then(|i| i.as_str()).unwrap_or("");
1038 let title = dataset
1039 .get("title")
1040 .and_then(|t| t.get("en"))
1041 .and_then(|e| e.as_str())
1042 .or_else(|| dataset.get("title").and_then(|t| t.as_str()))
1043 .unwrap_or("");
1044
1045 let description = dataset
1046 .get("description")
1047 .and_then(|d| d.get("en"))
1048 .and_then(|e| e.as_str())
1049 .or_else(|| dataset.get("description").and_then(|d| d.as_str()))
1050 .unwrap_or("");
1051
1052 let text = format!("{} {}", title, description);
1053 let embedding = self.embedder.embed_text(&text);
1054
1055 let mut metadata = HashMap::new();
1056 metadata.insert("title".to_string(), title.to_string());
1057 metadata.insert("description".to_string(), description.to_string());
1058 metadata.insert("source".to_string(), "eu_opendata".to_string());
1059
1060 vectors.push(SemanticVector {
1061 id: format!("EU:{}", id),
1062 embedding,
1063 domain: Domain::Government,
1064 timestamp: Utc::now(),
1065 metadata,
1066 });
1067 }
1068 }
1069
1070 Ok(vectors)
1071 }
1072
1073 pub async fn get_dataset(&self, id: &str) -> Result<Option<SemanticVector>> {
1078 if self.use_mock {
1079 return Ok(Some(self.get_mock_eu_datasets("mock").into_iter().next().unwrap()));
1080 }
1081
1082 let url = format!("{}/datasets/{}", self.base_url, id);
1083
1084 sleep(self.rate_limit_delay).await;
1085 let response = self.fetch_with_retry(&url).await?;
1086 let dataset: serde_json::Value = response.json().await?;
1087
1088 let title = dataset
1089 .get("title")
1090 .and_then(|t| t.get("en"))
1091 .and_then(|e| e.as_str())
1092 .unwrap_or("");
1093
1094 let text = title.to_string();
1095 let embedding = self.embedder.embed_text(&text);
1096
1097 let mut metadata = HashMap::new();
1098 metadata.insert("title".to_string(), title.to_string());
1099 metadata.insert("source".to_string(), "eu_opendata".to_string());
1100
1101 Ok(Some(SemanticVector {
1102 id: format!("EU:{}", id),
1103 embedding,
1104 domain: Domain::Government,
1105 timestamp: Utc::now(),
1106 metadata,
1107 }))
1108 }
1109
1110 pub async fn list_catalogs(&self) -> Result<Vec<SemanticVector>> {
1112 if self.use_mock {
1113 return Ok(self.get_mock_catalogs());
1114 }
1115
1116 let url = format!("{}/catalogues", self.base_url);
1117
1118 sleep(self.rate_limit_delay).await;
1119 let response = self.fetch_with_retry(&url).await?;
1120 let json: serde_json::Value = response.json().await?;
1121
1122 let mut vectors = Vec::new();
1123
1124 if let Some(catalogs) = json.get("result").and_then(|r| r.as_array()) {
1125 for (idx, catalog) in catalogs.iter().take(50).enumerate() {
1126 let id = catalog.get("id").and_then(|i| i.as_str()).unwrap_or("");
1127 let title = catalog.get("title").and_then(|t| t.as_str()).unwrap_or("");
1128
1129 let embedding = self.embedder.embed_text(title);
1130
1131 let mut metadata = HashMap::new();
1132 metadata.insert("title".to_string(), title.to_string());
1133 metadata.insert("source".to_string(), "eu_catalog".to_string());
1134
1135 vectors.push(SemanticVector {
1136 id: format!("EU_CAT:{}", id),
1137 embedding,
1138 domain: Domain::Government,
1139 timestamp: Utc::now(),
1140 metadata,
1141 });
1142 }
1143 }
1144
1145 Ok(vectors)
1146 }
1147
1148 pub async fn get_catalog(&self, id: &str) -> Result<Option<SemanticVector>> {
1150 if self.use_mock {
1151 return Ok(Some(self.get_mock_catalogs().into_iter().next().unwrap()));
1152 }
1153
1154 let url = format!("{}/catalogues/{}", self.base_url, id);
1155
1156 sleep(self.rate_limit_delay).await;
1157 let response = self.fetch_with_retry(&url).await?;
1158 let catalog: serde_json::Value = response.json().await?;
1159
1160 let title = catalog.get("title").and_then(|t| t.as_str()).unwrap_or("");
1161 let embedding = self.embedder.embed_text(title);
1162
1163 let mut metadata = HashMap::new();
1164 metadata.insert("title".to_string(), title.to_string());
1165 metadata.insert("source".to_string(), "eu_catalog".to_string());
1166
1167 Ok(Some(SemanticVector {
1168 id: format!("EU_CAT:{}", id),
1169 embedding,
1170 domain: Domain::Government,
1171 timestamp: Utc::now(),
1172 metadata,
1173 }))
1174 }
1175
1176 pub async fn search_by_theme(&self, theme: &str) -> Result<Vec<SemanticVector>> {
1178 if self.use_mock {
1179 return Ok(self.get_mock_eu_datasets(theme));
1180 }
1181
1182 let url = format!(
1183 "{}/datasets?facets[theme][]={}&limit=50",
1184 self.base_url,
1185 urlencoding::encode(theme)
1186 );
1187
1188 sleep(self.rate_limit_delay).await;
1189 let response = self.fetch_with_retry(&url).await?;
1190 let json: serde_json::Value = response.json().await?;
1191
1192 self.parse_eu_datasets(&json)
1194 }
1195
1196 fn parse_eu_datasets(&self, json: &serde_json::Value) -> Result<Vec<SemanticVector>> {
1197 let mut vectors = Vec::new();
1198
1199 if let Some(results) = json.get("result").and_then(|r| r.get("results")).and_then(|r| r.as_array()) {
1200 for dataset in results {
1201 let id = dataset.get("id").and_then(|i| i.as_str()).unwrap_or("");
1202 let title = dataset.get("title").and_then(|t| t.as_str()).unwrap_or("");
1203
1204 let embedding = self.embedder.embed_text(title);
1205
1206 let mut metadata = HashMap::new();
1207 metadata.insert("title".to_string(), title.to_string());
1208 metadata.insert("source".to_string(), "eu_opendata".to_string());
1209
1210 vectors.push(SemanticVector {
1211 id: format!("EU:{}", id),
1212 embedding,
1213 domain: Domain::Government,
1214 timestamp: Utc::now(),
1215 metadata,
1216 });
1217 }
1218 }
1219
1220 Ok(vectors)
1221 }
1222
1223 fn get_mock_eu_datasets(&self, query: &str) -> Vec<SemanticVector> {
1224 let text = format!("EU dataset about {}", query);
1225 let embedding = self.embedder.embed_text(&text);
1226
1227 let mut metadata = HashMap::new();
1228 metadata.insert("title".to_string(), "Mock EU Dataset".to_string());
1229 metadata.insert("source".to_string(), "eu_mock".to_string());
1230
1231 vec![SemanticVector {
1232 id: "EU_MOCK:1".to_string(),
1233 embedding,
1234 domain: Domain::Government,
1235 timestamp: Utc::now(),
1236 metadata,
1237 }]
1238 }
1239
1240 fn get_mock_catalogs(&self) -> Vec<SemanticVector> {
1241 let text = "European Commission Data Catalog";
1242 let embedding = self.embedder.embed_text(text);
1243
1244 let mut metadata = HashMap::new();
1245 metadata.insert("title".to_string(), "EC Catalog".to_string());
1246 metadata.insert("source".to_string(), "eu_mock".to_string());
1247
1248 vec![SemanticVector {
1249 id: "EU_CAT_MOCK:1".to_string(),
1250 embedding,
1251 domain: Domain::Government,
1252 timestamp: Utc::now(),
1253 metadata,
1254 }]
1255 }
1256
1257 async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
1258 let mut retries = 0;
1259 loop {
1260 match self.client.get(url).send().await {
1261 Ok(response) => {
1262 if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
1263 retries += 1;
1264 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
1265 continue;
1266 }
1267 return Ok(response);
1268 }
1269 Err(e) if retries < MAX_RETRIES => {
1270 retries += 1;
1271 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
1272 }
1273 Err(e) => return Err(FrameworkError::Network(e)),
1274 }
1275 }
1276 }
1277}
1278
/// `Default` delegates to [`EuOpenDataClient::new`].
impl Default for EuOpenDataClient {
    fn default() -> Self {
        Self::new()
    }
}
1284
/// Envelope deserialized from the data.gov.uk `package_search` endpoint.
#[derive(Debug, Deserialize)]
struct UkGovSearchResponse {
    /// Defaults to `false` when the key is missing.
    #[serde(default)]
    success: bool,
    /// Search payload; required, so deserialization fails if it is absent.
    result: UkGovResult,
}
1296
/// Search payload: a count and the returned datasets.
#[derive(Debug, Deserialize)]
struct UkGovResult {
    /// Defaults to `0` when the key is missing.
    #[serde(default)]
    count: u64,
    /// Returned datasets; defaults to an empty list when the key is missing.
    #[serde(default)]
    results: Vec<UkGovDataset>,
}
1304
/// One dataset record; only `id` is required.
#[derive(Debug, Deserialize)]
struct UkGovDataset {
    /// Dataset identifier.
    id: String,
    /// Dataset title; defaults to an empty string when missing.
    #[serde(default)]
    title: String,
    /// Free-text notes; defaults to an empty string when missing.
    #[serde(default)]
    notes: String,
    /// Publishing organization, if any.
    #[serde(default)]
    organization: Option<UkGovOrganization>,
}
1315
/// Organization object embedded in a [`UkGovDataset`].
#[derive(Debug, Deserialize)]
struct UkGovOrganization {
    /// Organization title; empty when absent. Used as the dataset's publisher name.
    #[serde(default)]
    title: String,
}
1321
/// Client for the data.gov.uk action API that turns datasets and publishers
/// into [`SemanticVector`]s.
pub struct UkGovClient {
    /// HTTP client used for every request.
    client: Client,
    /// Base URL of the action API, without a trailing slash.
    base_url: String,
    /// Delay slept before each live request.
    rate_limit_delay: Duration,
    /// Embedder that converts record text into embedding vectors.
    embedder: Arc<SimpleEmbedder>,
    /// When `true`, public methods return canned data instead of calling the API.
    use_mock: bool,
}
1344
1345impl UkGovClient {
1346 pub fn new() -> Self {
1348 Self::with_config(DEFAULT_EMBEDDING_DIM, false)
1349 }
1350
1351 pub fn with_config(embedding_dim: usize, use_mock: bool) -> Self {
1353 Self {
1354 client: Client::builder()
1355 .user_agent("RuVector-Discovery/1.0")
1356 .timeout(Duration::from_secs(30))
1357 .build()
1358 .expect("Failed to create HTTP client"),
1359 base_url: "https://data.gov.uk/api/action".to_string(),
1360 rate_limit_delay: Duration::from_millis(UK_GOV_RATE_LIMIT_MS),
1361 embedder: Arc::new(SimpleEmbedder::new(embedding_dim)),
1362 use_mock,
1363 }
1364 }
1365
1366 pub async fn search_datasets(&self, query: &str) -> Result<Vec<SemanticVector>> {
1368 if self.use_mock {
1369 return Ok(self.get_mock_uk_datasets(query));
1370 }
1371
1372 let url = format!(
1373 "{}/package_search?q={}&rows=50",
1374 self.base_url,
1375 urlencoding::encode(query)
1376 );
1377
1378 sleep(self.rate_limit_delay).await;
1379 let response = self.fetch_with_retry(&url).await?;
1380 let search_response: UkGovSearchResponse = response.json().await?;
1381
1382 let mut vectors = Vec::new();
1383 for dataset in search_response.result.results {
1384 let org_name = dataset
1385 .organization
1386 .as_ref()
1387 .map(|o| o.title.as_str())
1388 .unwrap_or("Unknown");
1389
1390 let text = format!("{} {} {}", dataset.title, dataset.notes, org_name);
1391 let embedding = self.embedder.embed_text(&text);
1392
1393 let mut metadata = HashMap::new();
1394 metadata.insert("title".to_string(), dataset.title);
1395 metadata.insert("description".to_string(), dataset.notes);
1396 metadata.insert("publisher".to_string(), org_name.to_string());
1397 metadata.insert("source".to_string(), "ukgov".to_string());
1398
1399 vectors.push(SemanticVector {
1400 id: format!("UKGOV:{}", dataset.id),
1401 embedding,
1402 domain: Domain::Government,
1403 timestamp: Utc::now(),
1404 metadata,
1405 });
1406 }
1407
1408 Ok(vectors)
1409 }
1410
1411 pub async fn get_dataset(&self, id: &str) -> Result<Option<SemanticVector>> {
1413 if self.use_mock {
1414 return Ok(Some(self.get_mock_uk_datasets("mock").into_iter().next().unwrap()));
1415 }
1416
1417 let url = format!("{}/package_show?id={}", self.base_url, id);
1418
1419 sleep(self.rate_limit_delay).await;
1420 let response = self.fetch_with_retry(&url).await?;
1421 let json: serde_json::Value = response.json().await?;
1422
1423 if let Some(result) = json.get("result") {
1424 let dataset: UkGovDataset = serde_json::from_value(result.clone())?;
1425
1426 let text = format!("{} {}", dataset.title, dataset.notes);
1427 let embedding = self.embedder.embed_text(&text);
1428
1429 let mut metadata = HashMap::new();
1430 metadata.insert("title".to_string(), dataset.title);
1431 metadata.insert("description".to_string(), dataset.notes);
1432 metadata.insert("source".to_string(), "ukgov".to_string());
1433
1434 return Ok(Some(SemanticVector {
1435 id: format!("UKGOV:{}", dataset.id),
1436 embedding,
1437 domain: Domain::Government,
1438 timestamp: Utc::now(),
1439 metadata,
1440 }));
1441 }
1442
1443 Ok(None)
1444 }
1445
1446 pub async fn list_publishers(&self) -> Result<Vec<SemanticVector>> {
1448 if self.use_mock {
1449 return Ok(self.get_mock_publishers());
1450 }
1451
1452 let url = format!("{}/organization_list?all_fields=true", self.base_url);
1453
1454 sleep(self.rate_limit_delay).await;
1455 let response = self.fetch_with_retry(&url).await?;
1456 let json: serde_json::Value = response.json().await?;
1457
1458 let mut vectors = Vec::new();
1459
1460 if let Some(result) = json.get("result").and_then(|r| r.as_array()) {
1461 for (idx, pub_data) in result.iter().take(50).enumerate() {
1462 let title = pub_data.get("title").and_then(|t| t.as_str()).unwrap_or("");
1463 let name = pub_data.get("name").and_then(|n| n.as_str()).unwrap_or("");
1464
1465 let embedding = self.embedder.embed_text(title);
1466
1467 let mut metadata = HashMap::new();
1468 metadata.insert("title".to_string(), title.to_string());
1469 metadata.insert("name".to_string(), name.to_string());
1470 metadata.insert("source".to_string(), "ukgov_publisher".to_string());
1471
1472 vectors.push(SemanticVector {
1473 id: format!("UKGOV_PUB:{}", idx),
1474 embedding,
1475 domain: Domain::Government,
1476 timestamp: Utc::now(),
1477 metadata,
1478 });
1479 }
1480 }
1481
1482 Ok(vectors)
1483 }
1484
1485 pub async fn get_publisher(&self, id: &str) -> Result<Option<SemanticVector>> {
1487 if self.use_mock {
1488 return Ok(Some(self.get_mock_publishers().into_iter().next().unwrap()));
1489 }
1490
1491 let url = format!("{}/organization_show?id={}", self.base_url, id);
1492
1493 sleep(self.rate_limit_delay).await;
1494 let response = self.fetch_with_retry(&url).await?;
1495 let json: serde_json::Value = response.json().await?;
1496
1497 if let Some(result) = json.get("result") {
1498 let title = result.get("title").and_then(|t| t.as_str()).unwrap_or("");
1499 let embedding = self.embedder.embed_text(title);
1500
1501 let mut metadata = HashMap::new();
1502 metadata.insert("title".to_string(), title.to_string());
1503 metadata.insert("source".to_string(), "ukgov_publisher".to_string());
1504
1505 return Ok(Some(SemanticVector {
1506 id: format!("UKGOV_PUB:{}", id),
1507 embedding,
1508 domain: Domain::Government,
1509 timestamp: Utc::now(),
1510 metadata,
1511 }));
1512 }
1513
1514 Ok(None)
1515 }
1516
1517 fn get_mock_uk_datasets(&self, query: &str) -> Vec<SemanticVector> {
1518 let text = format!("UK dataset about {}", query);
1519 let embedding = self.embedder.embed_text(&text);
1520
1521 let mut metadata = HashMap::new();
1522 metadata.insert("title".to_string(), "Mock UK Dataset".to_string());
1523 metadata.insert("source".to_string(), "ukgov_mock".to_string());
1524
1525 vec![SemanticVector {
1526 id: "UKGOV_MOCK:1".to_string(),
1527 embedding,
1528 domain: Domain::Government,
1529 timestamp: Utc::now(),
1530 metadata,
1531 }]
1532 }
1533
1534 fn get_mock_publishers(&self) -> Vec<SemanticVector> {
1535 let text = "Department of Health and Social Care";
1536 let embedding = self.embedder.embed_text(text);
1537
1538 let mut metadata = HashMap::new();
1539 metadata.insert("title".to_string(), "DHSC".to_string());
1540 metadata.insert("source".to_string(), "ukgov_mock".to_string());
1541
1542 vec![SemanticVector {
1543 id: "UKGOV_PUB_MOCK:1".to_string(),
1544 embedding,
1545 domain: Domain::Government,
1546 timestamp: Utc::now(),
1547 metadata,
1548 }]
1549 }
1550
1551 async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
1552 let mut retries = 0;
1553 loop {
1554 match self.client.get(url).send().await {
1555 Ok(response) => {
1556 if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
1557 retries += 1;
1558 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
1559 continue;
1560 }
1561 return Ok(response);
1562 }
1563 Err(e) if retries < MAX_RETRIES => {
1564 retries += 1;
1565 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
1566 }
1567 Err(e) => return Err(FrameworkError::Network(e)),
1568 }
1569 }
1570 }
1571}
1572
1573impl Default for UkGovClient {
1574 fn default() -> Self {
1575 Self::new()
1576 }
1577}
1578
/// One country entry of the World Bank `country` listing, as deserialized by
/// `WorldBankClient::get_countries`.
#[derive(Debug, Deserialize)]
struct WbCountry {
    /// Country code. Required; used in the `WB_COUNTRY:{id}` vector id.
    id: String,
    /// Country name; empty when absent.
    #[serde(default)]
    name: String,
    /// Capital city; empty when absent.
    #[serde(default)]
    capitalCity: String,
    /// Longitude, kept as the string the API sends. Not read in this section.
    #[serde(default)]
    longitude: String,
    /// Latitude, kept as the string the API sends. Not read in this section.
    #[serde(default)]
    latitude: String,
}
1596
/// One indicator definition of the World Bank `indicator` listing, as
/// deserialized by `WorldBankClient::get_indicators`.
#[derive(Debug, Deserialize)]
struct WbIndicator {
    /// Indicator code. Required; used in the `WB_IND:{id}` vector id.
    id: String,
    /// Indicator name; empty when absent.
    #[serde(default)]
    name: String,
    /// Descriptive note; stored as the `description` metadata entry. Empty when absent.
    #[serde(default)]
    sourceNote: String,
}
1606
/// One observation of an indicator for a country and date, as deserialized by
/// `WorldBankClient::get_indicator_data`.
#[derive(Debug, Deserialize)]
struct WbIndicatorData {
    /// Indicator the observation belongs to; default (empty) when absent.
    #[serde(default)]
    indicator: WbIndicatorInfo,
    /// Country the observation belongs to; default (empty) when absent.
    #[serde(default)]
    country: WbCountryInfo,
    /// Country code reported with the observation; empty when absent.
    #[serde(default)]
    countryiso3code: String,
    /// Observation date as sent by the API; `get_indicator_data` parses it as a year.
    #[serde(default)]
    date: String,
    /// Observed value; `None` observations are skipped by `get_indicator_data`.
    #[serde(default)]
    value: Option<f64>,
}
1621
/// `indicator` object nested inside a [`WbIndicatorData`] observation.
///
/// Both fields are required whenever the object is present.
#[derive(Debug, Default, Deserialize)]
struct WbIndicatorInfo {
    /// Indicator code; stored as the `indicator_id` metadata entry.
    id: String,
    /// Indicator display text; stored as the `indicator` metadata entry.
    value: String,
}
1627
/// `country` object nested inside a [`WbIndicatorData`] observation.
///
/// Both fields are required whenever the object is present.
#[derive(Debug, Default, Deserialize)]
struct WbCountryInfo {
    /// Country code as reported inside the observation.
    id: String,
    /// Country display text; stored as the `country` metadata entry.
    value: String,
}
1633
/// Client for the World Bank v2 API that turns countries, indicators and
/// indicator observations into [`SemanticVector`]s.
pub struct WorldBankClient {
    /// HTTP client used for every request.
    client: Client,
    /// Base URL of the API, without a trailing slash.
    base_url: String,
    /// Delay slept before each live request.
    rate_limit_delay: Duration,
    /// Embedder that converts record text into embedding vectors.
    embedder: Arc<SimpleEmbedder>,
    /// When `true`, public methods return canned data instead of calling the API.
    use_mock: bool,
}
1658
1659impl WorldBankClient {
1660 pub fn new() -> Result<Self> {
1662 Ok(Self::with_config(DEFAULT_EMBEDDING_DIM, false)?)
1663 }
1664
1665 pub fn with_config(embedding_dim: usize, use_mock: bool) -> Result<Self> {
1667 let client = Client::builder()
1668 .user_agent("RuVector-Discovery/1.0")
1669 .timeout(Duration::from_secs(30))
1670 .build()
1671 .map_err(FrameworkError::Network)?;
1672
1673 Ok(Self {
1674 client,
1675 base_url: "https://api.worldbank.org/v2".to_string(),
1676 rate_limit_delay: Duration::from_millis(WORLDBANK_RATE_LIMIT_MS),
1677 embedder: Arc::new(SimpleEmbedder::new(embedding_dim)),
1678 use_mock,
1679 })
1680 }
1681
1682 pub async fn get_countries(&self) -> Result<Vec<SemanticVector>> {
1684 if self.use_mock {
1685 return Ok(self.get_mock_countries());
1686 }
1687
1688 let url = format!("{}/country?format=json&per_page=300", self.base_url);
1689
1690 sleep(self.rate_limit_delay).await;
1691 let response = self.fetch_with_retry(&url).await?;
1692 let text = response.text().await?;
1693
1694 let json_values: Vec<serde_json::Value> = serde_json::from_str(&text)?;
1695 if json_values.len() < 2 {
1696 return Ok(Vec::new());
1697 }
1698
1699 let countries: Vec<WbCountry> = serde_json::from_value(json_values[1].clone())?;
1700
1701 let mut vectors = Vec::new();
1702 for country in countries {
1703 let text = format!("{} {}", country.name, country.capitalCity);
1704 let embedding = self.embedder.embed_text(&text);
1705
1706 let mut metadata = HashMap::new();
1707 metadata.insert("country_code".to_string(), country.id.clone());
1708 metadata.insert("name".to_string(), country.name);
1709 metadata.insert("capital".to_string(), country.capitalCity);
1710 metadata.insert("source".to_string(), "worldbank_country".to_string());
1711
1712 vectors.push(SemanticVector {
1713 id: format!("WB_COUNTRY:{}", country.id),
1714 embedding,
1715 domain: Domain::Government,
1716 timestamp: Utc::now(),
1717 metadata,
1718 });
1719 }
1720
1721 Ok(vectors)
1722 }
1723
1724 pub async fn get_indicators(&self, topic: &str) -> Result<Vec<SemanticVector>> {
1726 if self.use_mock {
1727 return Ok(self.get_mock_indicators(topic));
1728 }
1729
1730 let url = format!(
1731 "{}/indicator?format=json&per_page=100&source=2",
1732 self.base_url
1733 );
1734
1735 sleep(self.rate_limit_delay).await;
1736 let response = self.fetch_with_retry(&url).await?;
1737 let text = response.text().await?;
1738
1739 let json_values: Vec<serde_json::Value> = serde_json::from_str(&text)?;
1740 if json_values.len() < 2 {
1741 return Ok(Vec::new());
1742 }
1743
1744 let indicators: Vec<WbIndicator> = serde_json::from_value(json_values[1].clone())?;
1745
1746 let mut vectors = Vec::new();
1747 for indicator in indicators.into_iter().take(50) {
1748 if !topic.is_empty()
1750 && !indicator.name.to_lowercase().contains(&topic.to_lowercase())
1751 && !indicator.sourceNote.to_lowercase().contains(&topic.to_lowercase())
1752 {
1753 continue;
1754 }
1755
1756 let text = format!("{} {}", indicator.name, indicator.sourceNote);
1757 let embedding = self.embedder.embed_text(&text);
1758
1759 let mut metadata = HashMap::new();
1760 metadata.insert("indicator_id".to_string(), indicator.id.clone());
1761 metadata.insert("name".to_string(), indicator.name);
1762 metadata.insert("description".to_string(), indicator.sourceNote);
1763 metadata.insert("source".to_string(), "worldbank_indicator".to_string());
1764
1765 vectors.push(SemanticVector {
1766 id: format!("WB_IND:{}", indicator.id),
1767 embedding,
1768 domain: Domain::Government,
1769 timestamp: Utc::now(),
1770 metadata,
1771 });
1772 }
1773
1774 Ok(vectors)
1775 }
1776
1777 pub async fn get_indicator(
1785 &self,
1786 country: &str,
1787 indicator: &str,
1788 ) -> Result<Vec<SemanticVector>> {
1789 let current_year = chrono::Utc::now().year();
1790 let start_year = current_year - 10;
1791 let date_range = format!("{}:{}", start_year, current_year);
1792 self.get_indicator_data(indicator, country, &date_range).await
1793 }
1794
1795 pub async fn get_indicator_data(
1802 &self,
1803 indicator: &str,
1804 country: &str,
1805 date_range: &str,
1806 ) -> Result<Vec<SemanticVector>> {
1807 if self.use_mock {
1808 return Ok(self.get_mock_indicator_data(indicator, country));
1809 }
1810
1811 let url = format!(
1812 "{}/country/{}/indicator/{}?format=json&date={}&per_page=100",
1813 self.base_url, country, indicator, date_range
1814 );
1815
1816 sleep(self.rate_limit_delay).await;
1817 let response = self.fetch_with_retry(&url).await?;
1818 let text = response.text().await?;
1819
1820 let json_values: Vec<serde_json::Value> = serde_json::from_str(&text)?;
1821 if json_values.len() < 2 {
1822 return Ok(Vec::new());
1823 }
1824
1825 let data_points: Vec<WbIndicatorData> = serde_json::from_value(json_values[1].clone())?;
1826
1827 let mut vectors = Vec::new();
1828 for data in data_points {
1829 let value = match data.value {
1830 Some(v) => v,
1831 None => continue,
1832 };
1833
1834 let year = data.date.parse::<i32>().unwrap_or(2020);
1835 let date = NaiveDate::from_ymd_opt(year, 1, 1)
1836 .and_then(|d| d.and_hms_opt(0, 0, 0))
1837 .map(|dt| dt.and_utc())
1838 .unwrap_or_else(Utc::now);
1839
1840 let text = format!(
1841 "{} {} in {}: {}",
1842 data.country.value, data.indicator.value, data.date, value
1843 );
1844 let embedding = self.embedder.embed_text(&text);
1845
1846 let mut metadata = HashMap::new();
1847 metadata.insert("country".to_string(), data.country.value);
1848 metadata.insert("country_code".to_string(), data.countryiso3code.clone());
1849 metadata.insert("indicator".to_string(), data.indicator.value);
1850 metadata.insert("indicator_id".to_string(), data.indicator.id);
1851 metadata.insert("year".to_string(), data.date.clone());
1852 metadata.insert("value".to_string(), value.to_string());
1853 metadata.insert("source".to_string(), "worldbank".to_string());
1854
1855 vectors.push(SemanticVector {
1856 id: format!("WB:{}:{}:{}", country, indicator, data.date),
1857 embedding,
1858 domain: Domain::Government,
1859 timestamp: date,
1860 metadata,
1861 });
1862 }
1863
1864 Ok(vectors)
1865 }
1866
1867 pub async fn search_indicators(&self, query: &str) -> Result<Vec<SemanticVector>> {
1869 self.get_indicators(query).await
1871 }
1872
1873 fn get_mock_countries(&self) -> Vec<SemanticVector> {
1874 let text = "United States Washington D.C.";
1875 let embedding = self.embedder.embed_text(text);
1876
1877 let mut metadata = HashMap::new();
1878 metadata.insert("country_code".to_string(), "USA".to_string());
1879 metadata.insert("name".to_string(), "United States".to_string());
1880 metadata.insert("source".to_string(), "worldbank_mock".to_string());
1881
1882 vec![SemanticVector {
1883 id: "WB_COUNTRY_MOCK:USA".to_string(),
1884 embedding,
1885 domain: Domain::Government,
1886 timestamp: Utc::now(),
1887 metadata,
1888 }]
1889 }
1890
1891 fn get_mock_indicators(&self, topic: &str) -> Vec<SemanticVector> {
1892 let text = format!("GDP indicator {}", topic);
1893 let embedding = self.embedder.embed_text(&text);
1894
1895 let mut metadata = HashMap::new();
1896 metadata.insert("indicator_id".to_string(), "NY.GDP.MKTP.CD".to_string());
1897 metadata.insert("name".to_string(), "GDP".to_string());
1898 metadata.insert("source".to_string(), "worldbank_mock".to_string());
1899
1900 vec![SemanticVector {
1901 id: "WB_IND_MOCK:1".to_string(),
1902 embedding,
1903 domain: Domain::Government,
1904 timestamp: Utc::now(),
1905 metadata,
1906 }]
1907 }
1908
1909 fn get_mock_indicator_data(&self, indicator: &str, country: &str) -> Vec<SemanticVector> {
1910 let text = format!("{} {} GDP: 20000000000000", country, indicator);
1911 let embedding = self.embedder.embed_text(&text);
1912
1913 let mut metadata = HashMap::new();
1914 metadata.insert("country_code".to_string(), country.to_string());
1915 metadata.insert("indicator_id".to_string(), indicator.to_string());
1916 metadata.insert("value".to_string(), "20000000000000".to_string());
1917 metadata.insert("source".to_string(), "worldbank_mock".to_string());
1918
1919 vec![SemanticVector {
1920 id: format!("WB_MOCK:{}:{}:2020", country, indicator),
1921 embedding,
1922 domain: Domain::Government,
1923 timestamp: Utc::now(),
1924 metadata,
1925 }]
1926 }
1927
1928 async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
1929 let mut retries = 0;
1930 loop {
1931 match self.client.get(url).send().await {
1932 Ok(response) => {
1933 if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
1934 retries += 1;
1935 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
1936 continue;
1937 }
1938 return Ok(response);
1939 }
1940 Err(e) if retries < MAX_RETRIES => {
1941 retries += 1;
1942 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
1943 }
1944 Err(e) => return Err(FrameworkError::Network(e)),
1945 }
1946 }
1947 }
1948}
1949
1950impl Default for WorldBankClient {
1951 fn default() -> Self {
1952 Self::new().expect("Failed to create WorldBank client")
1953 }
1954}
1955
/// Envelope for a UN Data response.
///
/// Not referenced by any code visible in this section.
#[derive(Debug, Deserialize)]
struct UnDataResponse {
    /// Records of the response; empty when the field is absent.
    #[serde(default)]
    data: Vec<UnDataRecord>,
}
1966
/// One record inside a [`UnDataResponse`]. Every field is kept as a string
/// and defaults to empty when absent.
#[derive(Debug, Deserialize)]
struct UnDataRecord {
    /// Indicator the record belongs to.
    #[serde(default)]
    indicator: String,
    /// Country the record belongs to.
    #[serde(default)]
    country: String,
    /// Year of the record.
    #[serde(default)]
    year: String,
    /// Recorded value.
    #[serde(default)]
    value: String,
}
1978
/// Client for UN Data that produces [`SemanticVector`]s.
///
/// NOTE(review): in the code visible here every public method returns canned
/// data regardless of `use_mock`; the HTTP-related fields are not yet used by
/// those methods.
pub struct UNDataClient {
    /// HTTP client, used by `fetch_with_retry`.
    client: Client,
    /// Base URL of the service, without a trailing slash.
    base_url: String,
    /// Configured delay between requests.
    rate_limit_delay: Duration,
    /// Embedder that converts record text into embedding vectors.
    embedder: Arc<SimpleEmbedder>,
    /// Mock-mode switch supplied at construction.
    use_mock: bool,
}
2002
2003impl UNDataClient {
2004 pub fn new() -> Self {
2006 Self::with_config(DEFAULT_EMBEDDING_DIM, false)
2007 }
2008
2009 pub fn with_config(embedding_dim: usize, use_mock: bool) -> Self {
2011 Self {
2012 client: Client::builder()
2013 .user_agent("RuVector-Discovery/1.0")
2014 .timeout(Duration::from_secs(30))
2015 .build()
2016 .expect("Failed to create HTTP client"),
2017 base_url: "https://data.un.org/ws/rest".to_string(),
2018 rate_limit_delay: Duration::from_millis(UNDATA_RATE_LIMIT_MS),
2019 embedder: Arc::new(SimpleEmbedder::new(embedding_dim)),
2020 use_mock,
2021 }
2022 }
2023
2024 pub async fn get_indicators(&self) -> Result<Vec<SemanticVector>> {
2026 if self.use_mock {
2027 return Ok(self.get_mock_un_indicators());
2028 }
2029
2030 Ok(self.get_mock_un_indicators())
2032 }
2033
2034 pub async fn get_data(&self, indicator: &str, country: &str) -> Result<Vec<SemanticVector>> {
2036 if self.use_mock {
2037 return Ok(self.get_mock_un_data(indicator, country));
2038 }
2039
2040 Ok(self.get_mock_un_data(indicator, country))
2042 }
2043
2044 pub async fn search_datasets(&self, query: &str) -> Result<Vec<SemanticVector>> {
2046 if self.use_mock {
2047 return Ok(self.get_mock_un_datasets(query));
2048 }
2049
2050 Ok(self.get_mock_un_datasets(query))
2051 }
2052
2053 fn get_mock_un_indicators(&self) -> Vec<SemanticVector> {
2054 let indicators = vec![
2055 ("Population", "Total population"),
2056 ("GDP", "Gross Domestic Product"),
2057 ("CO2 Emissions", "Carbon dioxide emissions"),
2058 ];
2059
2060 let mut vectors = Vec::new();
2061 for (idx, (name, description)) in indicators.iter().enumerate() {
2062 let text = format!("{} {}", name, description);
2063 let embedding = self.embedder.embed_text(&text);
2064
2065 let mut metadata = HashMap::new();
2066 metadata.insert("indicator".to_string(), name.to_string());
2067 metadata.insert("description".to_string(), description.to_string());
2068 metadata.insert("source".to_string(), "undata_mock".to_string());
2069
2070 vectors.push(SemanticVector {
2071 id: format!("UN_IND_MOCK:{}", idx),
2072 embedding,
2073 domain: Domain::Government,
2074 timestamp: Utc::now(),
2075 metadata,
2076 });
2077 }
2078
2079 vectors
2080 }
2081
2082 fn get_mock_un_data(&self, indicator: &str, country: &str) -> Vec<SemanticVector> {
2083 let text = format!("{} {} 2020: 100000", country, indicator);
2084 let embedding = self.embedder.embed_text(&text);
2085
2086 let mut metadata = HashMap::new();
2087 metadata.insert("indicator".to_string(), indicator.to_string());
2088 metadata.insert("country".to_string(), country.to_string());
2089 metadata.insert("year".to_string(), "2020".to_string());
2090 metadata.insert("value".to_string(), "100000".to_string());
2091 metadata.insert("source".to_string(), "undata_mock".to_string());
2092
2093 vec![SemanticVector {
2094 id: format!("UN_MOCK:{}:{}:2020", country, indicator),
2095 embedding,
2096 domain: Domain::Government,
2097 timestamp: Utc::now(),
2098 metadata,
2099 }]
2100 }
2101
2102 fn get_mock_un_datasets(&self, query: &str) -> Vec<SemanticVector> {
2103 let text = format!("UN dataset about {}", query);
2104 let embedding = self.embedder.embed_text(&text);
2105
2106 let mut metadata = HashMap::new();
2107 metadata.insert("title".to_string(), format!("UN {} Data", query));
2108 metadata.insert("description".to_string(), "Mock UN dataset".to_string());
2109 metadata.insert("source".to_string(), "undata_mock".to_string());
2110
2111 vec![SemanticVector {
2112 id: "UN_DS_MOCK:1".to_string(),
2113 embedding,
2114 domain: Domain::Government,
2115 timestamp: Utc::now(),
2116 metadata,
2117 }]
2118 }
2119
2120 async fn fetch_with_retry(&self, url: &str) -> Result<reqwest::Response> {
2121 let mut retries = 0;
2122 loop {
2123 match self.client.get(url).send().await {
2124 Ok(response) => {
2125 if response.status() == StatusCode::TOO_MANY_REQUESTS && retries < MAX_RETRIES {
2126 retries += 1;
2127 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
2128 continue;
2129 }
2130 return Ok(response);
2131 }
2132 Err(e) if retries < MAX_RETRIES => {
2133 retries += 1;
2134 sleep(Duration::from_millis(RETRY_DELAY_MS * retries as u64)).await;
2135 }
2136 Err(e) => return Err(FrameworkError::Network(e)),
2137 }
2138 }
2139 }
2140}
2141
2142impl Default for UNDataClient {
2143 fn default() -> Self {
2144 Self::new()
2145 }
2146}
2147
/// Unit tests for the government-data clients.
///
/// Construction tests check the configured base URLs and rate limits; the
/// async tests run every client with `use_mock = true`.
#[cfg(test)]
mod tests {
    use super::*;

    /// Embedding dimension used by every mock-mode client below.
    const DIM: usize = 256;

    // ---- Census ----

    #[test]
    fn test_census_client_creation() {
        let client = CensusClient::new(None);
        assert_eq!(client.base_url.as_str(), "https://api.census.gov/data");
    }

    #[test]
    fn test_census_client_with_key() {
        let key = Some("test_key".to_string());
        let client = CensusClient::new(key);
        assert!(client.api_key.is_some());
    }

    #[tokio::test]
    async fn test_census_mock_population() {
        let client = CensusClient::with_config(None, DIM, true);
        let vectors = client
            .get_population(2020, "state:*")
            .await
            .expect("mock population lookup should succeed");
        assert!(!vectors.is_empty());
        assert_eq!(vectors[0].domain, Domain::Government);
    }

    #[tokio::test]
    async fn test_census_mock_datasets() {
        let client = CensusClient::with_config(None, DIM, true);
        let vectors = client
            .get_available_datasets()
            .await
            .expect("mock dataset listing should succeed");
        assert!(!vectors.is_empty());
    }

    #[tokio::test]
    async fn test_census_mock_variables() {
        let client = CensusClient::with_config(None, DIM, true);
        assert!(client.search_variables("acs/acs5", "income").await.is_ok());
    }

    // ---- Data.gov ----

    #[test]
    fn test_datagov_client_creation() {
        let client = DataGovClient::new();
        assert_eq!(client.base_url.as_str(), "https://catalog.data.gov/api/3");
    }

    #[tokio::test]
    async fn test_datagov_mock_search() {
        let client = DataGovClient::with_config(DIM, true);
        let vectors = client
            .search_datasets("climate")
            .await
            .expect("mock dataset search should succeed");
        assert!(!vectors.is_empty());
        assert_eq!(vectors[0].domain, Domain::Government);
    }

    #[tokio::test]
    async fn test_datagov_mock_organizations() {
        let client = DataGovClient::with_config(DIM, true);
        assert!(client.list_organizations().await.is_ok());
    }

    #[tokio::test]
    async fn test_datagov_mock_tags() {
        let client = DataGovClient::with_config(DIM, true);
        assert!(client.list_tags().await.is_ok());
    }

    // ---- EU Open Data ----

    #[test]
    fn test_eu_client_creation() {
        let client = EuOpenDataClient::new();
        assert_eq!(
            client.base_url.as_str(),
            "https://data.europa.eu/api/hub/search"
        );
    }

    #[tokio::test]
    async fn test_eu_mock_search() {
        let client = EuOpenDataClient::with_config(DIM, true);
        let vectors = client
            .search_datasets("environment")
            .await
            .expect("mock dataset search should succeed");
        assert!(!vectors.is_empty());
    }

    #[tokio::test]
    async fn test_eu_mock_catalogs() {
        let client = EuOpenDataClient::with_config(DIM, true);
        assert!(client.list_catalogs().await.is_ok());
    }

    // ---- UK Government ----

    #[test]
    fn test_ukgov_client_creation() {
        let client = UkGovClient::new();
        assert_eq!(client.base_url.as_str(), "https://data.gov.uk/api/action");
    }

    #[tokio::test]
    async fn test_ukgov_mock_search() {
        let client = UkGovClient::with_config(DIM, true);
        let vectors = client
            .search_datasets("health")
            .await
            .expect("mock dataset search should succeed");
        assert!(!vectors.is_empty());
    }

    #[tokio::test]
    async fn test_ukgov_mock_publishers() {
        let client = UkGovClient::with_config(DIM, true);
        assert!(client.list_publishers().await.is_ok());
    }

    // ---- World Bank ----

    #[test]
    fn test_worldbank_client_creation() {
        let client = WorldBankClient::new().expect("client construction should succeed");
        assert_eq!(client.base_url.as_str(), "https://api.worldbank.org/v2");
    }

    #[tokio::test]
    async fn test_worldbank_mock_countries() {
        let client = WorldBankClient::with_config(DIM, true).unwrap();
        let vectors = client
            .get_countries()
            .await
            .expect("mock country listing should succeed");
        assert!(!vectors.is_empty());
    }

    #[tokio::test]
    async fn test_worldbank_mock_indicators() {
        let client = WorldBankClient::with_config(DIM, true).unwrap();
        assert!(client.get_indicators("gdp").await.is_ok());
    }

    #[tokio::test]
    async fn test_worldbank_mock_data() {
        let client = WorldBankClient::with_config(DIM, true).unwrap();
        let outcome = client
            .get_indicator_data("NY.GDP.MKTP.CD", "USA", "2020")
            .await;
        assert!(outcome.is_ok());
    }

    // ---- UN Data ----

    #[test]
    fn test_undata_client_creation() {
        let client = UNDataClient::new();
        assert_eq!(client.base_url.as_str(), "https://data.un.org/ws/rest");
    }

    #[tokio::test]
    async fn test_undata_mock_indicators() {
        let client = UNDataClient::with_config(DIM, true);
        let vectors = client
            .get_indicators()
            .await
            .expect("mock indicator listing should succeed");
        assert!(!vectors.is_empty());
    }

    #[tokio::test]
    async fn test_undata_mock_data() {
        let client = UNDataClient::with_config(DIM, true);
        assert!(client.get_data("population", "USA").await.is_ok());
    }

    #[tokio::test]
    async fn test_undata_mock_search() {
        let client = UNDataClient::with_config(DIM, true);
        assert!(client.search_datasets("climate").await.is_ok());
    }

    // ---- Rate limiting ----

    #[test]
    fn test_rate_limits() {
        let millis = Duration::from_millis;

        assert_eq!(
            CensusClient::new(None).rate_limit_delay,
            millis(CENSUS_RATE_LIMIT_MS)
        );
        assert_eq!(
            DataGovClient::new().rate_limit_delay,
            millis(DATAGOV_RATE_LIMIT_MS)
        );
        assert_eq!(
            EuOpenDataClient::new().rate_limit_delay,
            millis(EU_OPENDATA_RATE_LIMIT_MS)
        );
        assert_eq!(
            UkGovClient::new().rate_limit_delay,
            millis(UK_GOV_RATE_LIMIT_MS)
        );
        assert_eq!(
            WorldBankClient::new().unwrap().rate_limit_delay,
            millis(WORLDBANK_RATE_LIMIT_MS)
        );
        assert_eq!(
            UNDataClient::new().rate_limit_delay,
            millis(UNDATA_RATE_LIMIT_MS)
        );
    }
}