1use serde::{Deserialize, Serialize};
16use std::fmt;
17use std::path::{Path, PathBuf};
18use std::str::FromStr;
19use std::time::Duration;
20
21use crate::circuit_breaker::CircuitBreakerConfig;
22use crate::error::AppError;
23
24#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
32#[serde(rename_all = "lowercase")]
33pub enum EmbeddingProviderType {
34 #[default]
36 Gemini,
37 OpenAI,
39}
40
41impl fmt::Display for EmbeddingProviderType {
42 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
43 match self {
44 Self::Gemini => write!(f, "gemini"),
45 Self::OpenAI => write!(f, "openai"),
46 }
47 }
48}
49
50impl FromStr for EmbeddingProviderType {
51 type Err = AppError;
52
53 fn from_str(s: &str) -> Result<Self, Self::Err> {
54 match s.to_lowercase().as_str() {
55 "gemini" => Ok(Self::Gemini),
56 "openai" => Ok(Self::OpenAI),
57 _ => Err(AppError::ConfigError(format!(
58 "Unknown embedding provider: '{}'. Valid options: gemini, openai",
59 s
60 ))),
61 }
62 }
63}
64
65#[derive(Debug, Clone, Serialize, Deserialize)]
67pub struct GeminiEmbeddingConfig {
68 #[serde(default = "default_gemini_model")]
70 pub model: String,
71}
72
/// Returns the Gemini model name used when none is configured.
fn default_gemini_model() -> String {
    String::from("gemini-embedding-001")
}
76
77impl Default for GeminiEmbeddingConfig {
78 fn default() -> Self {
79 Self {
80 model: default_gemini_model(),
81 }
82 }
83}
84
85#[derive(Debug, Clone, Serialize, Deserialize)]
87pub struct OpenAIEmbeddingConfig {
88 #[serde(default = "default_openai_model")]
90 pub model: String,
91 pub endpoint: Option<String>,
93}
94
/// Returns the OpenAI model name used when none is configured.
fn default_openai_model() -> String {
    String::from("text-embedding-3-small")
}
98
99impl Default for OpenAIEmbeddingConfig {
100 fn default() -> Self {
101 Self {
102 model: default_openai_model(),
103 endpoint: None,
104 }
105 }
106}
107
108pub fn embedding_dimension(provider: EmbeddingProviderType, model: Option<&str>) -> usize {
115 match provider {
116 EmbeddingProviderType::Gemini => 768, EmbeddingProviderType::OpenAI => match model.unwrap_or("text-embedding-3-small") {
118 "text-embedding-3-large" => 3072,
119 _ => 1536, },
121 }
122}
123
/// Database settings.
#[derive(Debug, Clone)]
pub struct DbConfig {
    /// Connection limit (default: 5).
    // NOTE(review): presumably the pool's upper bound — confirm at the call site.
    pub max_connections: u32,
}

impl Default for DbConfig {
    /// Returns a config with `max_connections` set to 5.
    fn default() -> Self {
        Self { max_connections: 5 }
    }
}
134
/// HTTP client settings.
#[derive(Debug, Clone)]
pub struct HttpConfig {
    /// Timeout value (default: 30 seconds).
    pub timeout: Duration,
    /// Retry count (default: 3).
    pub max_retries: u32,
    /// Base delay between retries (default: 500 ms).
    // NOTE(review): the backoff strategy built on this delay lives elsewhere — confirm.
    pub retry_base_delay: Duration,
}

impl Default for HttpConfig {
    /// Returns a 30 s timeout, 3 retries and a 500 ms base retry delay.
    fn default() -> Self {
        Self {
            timeout: Duration::from_secs(30),
            max_retries: 3,
            retry_base_delay: Duration::from_millis(500),
        }
    }
}
151
/// Settings for a harvest run.
#[derive(Debug, Clone)]
pub struct HarvestConfig {
    /// Concurrency level (default: 10).
    pub concurrency: usize,
    /// Batch size for upserts (default: 500).
    pub upsert_batch_size: usize,
    /// Forces a full sync when `true` (default: `false`).
    pub force_full_sync: bool,
    /// Enables dry-run mode when `true` (default: `false`).
    pub dry_run: bool,
}

impl Default for HarvestConfig {
    /// Returns concurrency 10, upsert batches of 500, and both flags off.
    fn default() -> Self {
        Self {
            concurrency: 10,
            upsert_batch_size: 500,
            force_full_sync: false,
            dry_run: false,
        }
    }
}

impl HarvestConfig {
    /// Consumes the config and returns it with `force_full_sync` enabled.
    pub fn with_full_sync(mut self) -> Self {
        self.force_full_sync = true;
        self
    }

    /// Consumes the config and returns it with `dry_run` enabled.
    pub fn with_dry_run(mut self) -> Self {
        self.dry_run = true;
        self
    }
}
192
193#[derive(Clone)]
198pub struct EmbeddingServiceConfig {
199 pub batch_size: usize,
202 pub circuit_breaker: CircuitBreakerConfig,
204}
205
206impl Default for EmbeddingServiceConfig {
207 fn default() -> Self {
208 Self {
209 batch_size: 64,
210 circuit_breaker: CircuitBreakerConfig::default(),
211 }
212 }
213}
214
215#[derive(Clone)]
225pub struct SyncConfig {
226 pub concurrency: usize,
228 pub embedding_batch_size: usize,
231 pub upsert_batch_size: usize,
233 pub force_full_sync: bool,
235 pub dry_run: bool,
237 pub circuit_breaker: CircuitBreakerConfig,
239}
240
241impl Default for SyncConfig {
242 fn default() -> Self {
243 Self {
245 concurrency: 10,
246 embedding_batch_size: 64,
247 upsert_batch_size: 500,
248 force_full_sync: false,
249 dry_run: false,
250 circuit_breaker: CircuitBreakerConfig::default(),
251 }
252 }
253}
254
255impl SyncConfig {
256 pub fn with_full_sync(mut self) -> Self {
258 self.force_full_sync = true;
259 self
260 }
261
262 pub fn with_dry_run(mut self) -> Self {
264 self.dry_run = true;
265 self
266 }
267
268 pub fn with_embedding_batch_size(mut self, size: usize) -> Self {
270 self.embedding_batch_size = size.max(1);
271 self
272 }
273
274 pub fn with_circuit_breaker(mut self, config: CircuitBreakerConfig) -> Self {
276 self.circuit_breaker = config;
277 self
278 }
279
280 pub fn harvest_config(&self) -> HarvestConfig {
282 HarvestConfig {
283 concurrency: self.concurrency,
284 upsert_batch_size: self.upsert_batch_size,
285 force_full_sync: self.force_full_sync,
286 dry_run: self.dry_run,
287 }
288 }
289
290 pub fn embedding_service_config(&self) -> EmbeddingServiceConfig {
292 EmbeddingServiceConfig {
293 batch_size: self.embedding_batch_size,
294 circuit_breaker: self.circuit_breaker.clone(),
295 }
296 }
297}
298
299#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
307#[serde(rename_all = "lowercase")]
308pub enum PortalType {
309 #[default]
311 Ckan,
312 Socrata,
314 Dcat,
316}
317
318impl fmt::Display for PortalType {
319 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
320 match self {
321 Self::Ckan => write!(f, "ckan"),
322 Self::Socrata => write!(f, "socrata"),
323 Self::Dcat => write!(f, "dcat"),
324 }
325 }
326}
327
328impl FromStr for PortalType {
329 type Err = AppError;
330
331 fn from_str(s: &str) -> Result<Self, Self::Err> {
332 match s.to_lowercase().as_str() {
333 "ckan" => Ok(Self::Ckan),
334 "socrata" => Ok(Self::Socrata),
335 "dcat" => Ok(Self::Dcat),
336 _ => Err(AppError::ConfigError(format!(
337 "Unknown portal type: '{}'. Valid options: ckan, socrata, dcat",
338 s
339 ))),
340 }
341 }
342}
343
/// Serde default for `PortalEntry::enabled`: portals are enabled unless the
/// config says otherwise.
fn default_enabled() -> bool {
    true
}
348
349#[derive(Debug, Clone, Serialize, Deserialize)]
369pub struct PortalsConfig {
370 pub portals: Vec<PortalEntry>,
372}
373
374impl PortalsConfig {
375 pub fn enabled_portals(&self) -> Vec<&PortalEntry> {
379 self.portals.iter().filter(|p| p.enabled).collect()
380 }
381
382 pub fn find_by_name(&self, name: &str) -> Option<&PortalEntry> {
390 self.portals
391 .iter()
392 .find(|p| p.name.eq_ignore_ascii_case(name))
393 }
394}
395
396#[derive(Debug, Clone, Serialize, Deserialize)]
401pub struct PortalEntry {
402 pub name: String,
406
407 pub url: String,
411
412 #[serde(rename = "type", default)]
416 pub portal_type: PortalType,
417
418 #[serde(default = "default_enabled")]
422 pub enabled: bool,
423
424 pub description: Option<String>,
426
427 pub url_template: Option<String>,
435
436 pub language: Option<String>,
442}
443
444impl PortalEntry {
445 pub fn language(&self) -> &str {
447 self.language.as_deref().unwrap_or("en")
448 }
449}
450
/// File name of the portals configuration inside the config directory.
pub const CONFIG_FILE_NAME: &str = "portals.toml";
453
454pub fn default_config_dir() -> Option<PathBuf> {
458 dirs::config_dir().map(|p| p.join("ceres"))
459}
460
461pub fn default_config_path() -> Option<PathBuf> {
465 default_config_dir().map(|p| p.join(CONFIG_FILE_NAME))
466}
467
/// TOML written to the default config path when no config file exists yet.
///
/// It must stay valid TOML that deserializes into `PortalsConfig`, because
/// `load_portals_config` reads the file back right after creating it.
const DEFAULT_CONFIG_TEMPLATE: &str = r#"# Ceres Portal Configuration
#
# Usage:
#   ceres harvest                   # Harvest all enabled portals
#   ceres harvest --portal milano   # Harvest specific portal by name
#   ceres harvest https://...       # Harvest single URL (ignores this file)
#
# Set enabled = false to skip a portal during batch harvest.
# Use url_template for portals with non-standard frontends:
#   url_template = "https://example.com/dataset?id={id}"
#   Placeholders: {id} = dataset UUID, {name} = dataset slug

# City of Milan open data
[[portals]]
name = "milano"
url = "https://dati.comune.milano.it"
type = "ckan"
description = "Open data del Comune di Milano"

# Sicily Region open data
[[portals]]
name = "sicilia"
url = "https://dati.regione.sicilia.it"
type = "ckan"
description = "Open data della Regione Siciliana"
"#;
498
499pub fn load_portals_config(path: Option<PathBuf>) -> Result<Option<PortalsConfig>, AppError> {
513 let using_default_path = path.is_none();
514 let config_path = match path {
515 Some(p) => p,
516 None => match default_config_path() {
517 Some(p) => p,
518 None => return Ok(None),
519 },
520 };
521
522 if !config_path.exists() {
523 if using_default_path {
525 match create_default_config(&config_path) {
526 Ok(()) => {
527 tracing::info!(
530 "Config file created at {}. Starting harvest with default portals...",
531 config_path.display()
532 );
533 }
535 Err(e) => {
536 tracing::warn!("Could not create default config template: {}", e);
538 return Ok(None);
539 }
540 }
541 } else {
542 return Err(AppError::ConfigError(format!(
544 "Config file not found: {}",
545 config_path.display()
546 )));
547 }
548 }
549
550 let content = std::fs::read_to_string(&config_path).map_err(|e| {
551 AppError::ConfigError(format!(
552 "Failed to read config file '{}': {}",
553 config_path.display(),
554 e
555 ))
556 })?;
557
558 let config: PortalsConfig = toml::from_str(&content).map_err(|e| {
559 AppError::ConfigError(format!(
560 "Invalid TOML in '{}': {}",
561 config_path.display(),
562 e
563 ))
564 })?;
565
566 Ok(Some(config))
567}
568
569fn create_default_config(path: &Path) -> std::io::Result<()> {
576 if let Some(parent) = path.parent() {
578 std::fs::create_dir_all(parent)?;
579 }
580
581 std::fs::write(path, DEFAULT_CONFIG_TEMPLATE)?;
582 tracing::info!("Created default config template at: {}", path.display());
583
584 Ok(())
585}
586
#[cfg(test)]
mod tests {
    use super::*;

    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Parses an inline TOML document into a `PortalsConfig`, panicking on error.
    fn parse(raw: &str) -> PortalsConfig {
        toml::from_str(raw).unwrap()
    }

    /// Writes `contents` plus a trailing newline to a fresh temporary file.
    fn temp_config(contents: &str) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        writeln!(file, "{}", contents).unwrap();
        file
    }

    /// Loads the config stored in `file`, unwrapping the `Result` and the `Option`.
    fn load(file: &NamedTempFile) -> PortalsConfig {
        load_portals_config(Some(file.path().to_path_buf()))
            .unwrap()
            .unwrap()
    }

    #[test]
    fn test_db_config_defaults() {
        let config = DbConfig::default();
        assert_eq!(config.max_connections, 5);
    }

    #[test]
    fn test_http_config_defaults() {
        let config = HttpConfig::default();
        assert_eq!(config.timeout, Duration::from_secs(30));
        assert_eq!(config.max_retries, 3);
        assert_eq!(config.retry_base_delay, Duration::from_millis(500));
    }

    #[test]
    fn test_sync_config_defaults() {
        let config = SyncConfig::default();
        assert_eq!(config.concurrency, 10);
        assert_eq!(config.upsert_batch_size, 500);
    }

    #[test]
    fn test_sync_config_harvest_config_upsert_batch_size() {
        let config = SyncConfig::default();
        let harvest = config.harvest_config();
        assert_eq!(harvest.upsert_batch_size, 500);
        // The harvest config must carry the upsert size, not the embedding size.
        assert_ne!(harvest.upsert_batch_size, config.embedding_batch_size);
    }

    #[test]
    fn test_portals_config_deserialize() {
        let config = parse(
            r#"
[[portals]]
name = "test-portal"
url = "https://example.com"
type = "ckan"
"#,
        );
        assert_eq!(config.portals.len(), 1);
        assert_eq!(config.portals[0].name, "test-portal");
        assert_eq!(config.portals[0].url, "https://example.com");
        assert_eq!(config.portals[0].portal_type, PortalType::Ckan);
        assert!(config.portals[0].enabled);
        assert!(config.portals[0].description.is_none());
    }

    #[test]
    fn test_portals_config_defaults() {
        let config = parse(
            r#"
[[portals]]
name = "minimal"
url = "https://example.com"
"#,
        );
        assert_eq!(config.portals[0].portal_type, PortalType::Ckan);
        assert!(config.portals[0].enabled);
    }

    #[test]
    fn test_portals_config_enabled_filter() {
        let config = parse(
            r#"
[[portals]]
name = "enabled-portal"
url = "https://a.com"

[[portals]]
name = "disabled-portal"
url = "https://b.com"
enabled = false
"#,
        );
        let enabled = config.enabled_portals();
        assert_eq!(enabled.len(), 1);
        assert_eq!(enabled[0].name, "enabled-portal");
    }

    #[test]
    fn test_portals_config_find_by_name() {
        let config = parse(
            r#"
[[portals]]
name = "Milano"
url = "https://dati.comune.milano.it"
"#,
        );

        assert!(config.find_by_name("milano").is_some());
        assert!(config.find_by_name("MILANO").is_some());
        assert!(config.find_by_name("Milano").is_some());

        assert!(config.find_by_name("roma").is_none());
    }

    #[test]
    fn test_portals_config_with_description() {
        let config = parse(
            r#"
[[portals]]
name = "test"
url = "https://example.com"
description = "A test portal"
"#,
        );
        assert_eq!(
            config.portals[0].description,
            Some("A test portal".to_string())
        );
    }

    #[test]
    fn test_portals_config_multiple_portals() {
        let config = parse(
            r#"
[[portals]]
name = "portal-1"
url = "https://a.com"

[[portals]]
name = "portal-2"
url = "https://b.com"

[[portals]]
name = "portal-3"
url = "https://c.com"
enabled = false
"#,
        );
        assert_eq!(config.portals.len(), 3);
        assert_eq!(config.enabled_portals().len(), 2);
    }

    #[test]
    fn test_default_config_path() {
        // The platform may not expose a config directory; only check when it does.
        if let Some(p) = default_config_path() {
            assert!(p.ends_with("portals.toml"));
        }
    }

    #[test]
    fn test_load_portals_config_valid_file() {
        let file = temp_config(
            r#"
[[portals]]
name = "test"
url = "https://test.com"
"#,
        );

        let config = load(&file);

        assert_eq!(config.portals.len(), 1);
        assert_eq!(config.portals[0].name, "test");
        assert_eq!(config.portals[0].url, "https://test.com");
    }

    #[test]
    fn test_load_portals_config_custom_path_not_found() {
        let result = load_portals_config(Some("/nonexistent/path/to/config.toml".into()));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, AppError::ConfigError(_)));
    }

    #[test]
    fn test_load_portals_config_invalid_toml() {
        let file = temp_config("this is not valid toml {{");

        let result = load_portals_config(Some(file.path().to_path_buf()));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(matches!(err, AppError::ConfigError(_)));
    }

    #[test]
    fn test_load_portals_config_multiple_portals_with_enabled_filter() {
        let file = temp_config(
            r#"
[[portals]]
name = "enabled-portal"
url = "https://a.com"

[[portals]]
name = "disabled-portal"
url = "https://b.com"
enabled = false

[[portals]]
name = "another-enabled"
url = "https://c.com"
enabled = true
"#,
        );

        let config = load(&file);

        assert_eq!(config.portals.len(), 3);
        assert_eq!(config.enabled_portals().len(), 2);
    }

    #[test]
    fn test_load_portals_config_with_all_fields() {
        let file = temp_config(
            r#"
[[portals]]
name = "full-config"
url = "https://example.com"
type = "ckan"
enabled = true
description = "A fully configured portal"
"#,
        );

        let config = load(&file);

        let portal = &config.portals[0];
        assert_eq!(portal.name, "full-config");
        assert_eq!(portal.url, "https://example.com");
        assert_eq!(portal.portal_type, PortalType::Ckan);
        assert!(portal.enabled);
        assert_eq!(
            portal.description,
            Some("A fully configured portal".to_string())
        );
    }

    #[test]
    fn test_load_portals_config_empty_portals_array() {
        let file = temp_config("portals = []");

        let config = load(&file);

        assert!(config.portals.is_empty());
        assert!(config.enabled_portals().is_empty());
    }

    #[test]
    fn test_embedding_provider_type_from_str() {
        assert_eq!(
            "gemini".parse::<EmbeddingProviderType>().unwrap(),
            EmbeddingProviderType::Gemini
        );
        assert_eq!(
            "openai".parse::<EmbeddingProviderType>().unwrap(),
            EmbeddingProviderType::OpenAI
        );
        assert_eq!(
            "GEMINI".parse::<EmbeddingProviderType>().unwrap(),
            EmbeddingProviderType::Gemini
        );
        assert_eq!(
            "OpenAI".parse::<EmbeddingProviderType>().unwrap(),
            EmbeddingProviderType::OpenAI
        );
    }

    #[test]
    fn test_embedding_provider_type_invalid() {
        let result = "invalid".parse::<EmbeddingProviderType>();
        assert!(result.is_err());
    }

    #[test]
    fn test_embedding_provider_type_display() {
        assert_eq!(EmbeddingProviderType::Gemini.to_string(), "gemini");
        assert_eq!(EmbeddingProviderType::OpenAI.to_string(), "openai");
    }

    #[test]
    fn test_embedding_dimension() {
        assert_eq!(
            embedding_dimension(EmbeddingProviderType::Gemini, None),
            768
        );
        assert_eq!(
            embedding_dimension(EmbeddingProviderType::Gemini, Some("gemini-embedding-001")),
            768
        );

        assert_eq!(
            embedding_dimension(EmbeddingProviderType::OpenAI, None),
            1536
        );
        assert_eq!(
            embedding_dimension(
                EmbeddingProviderType::OpenAI,
                Some("text-embedding-3-small")
            ),
            1536
        );
        assert_eq!(
            embedding_dimension(
                EmbeddingProviderType::OpenAI,
                Some("text-embedding-3-large")
            ),
            3072
        );
    }

    #[test]
    fn test_gemini_embedding_config_default() {
        let config = GeminiEmbeddingConfig::default();
        assert_eq!(config.model, "gemini-embedding-001");
    }

    #[test]
    fn test_openai_embedding_config_default() {
        let config = OpenAIEmbeddingConfig::default();
        assert_eq!(config.model, "text-embedding-3-small");
        assert!(config.endpoint.is_none());
    }
}