Skip to main content

ceres_core/
config.rs

1//! Configuration types for Ceres components.
2//!
3//! # Configuration Improvements
4//!
5//! TODO(config): Make all configuration values environment-configurable
6//! Currently all defaults are hardcoded. Should support:
7//! - `DB_MAX_CONNECTIONS` for database pool size
8//! - `SYNC_CONCURRENCY` for parallel dataset processing
9//! - `HTTP_TIMEOUT` for API request timeout
10//! - `HTTP_MAX_RETRIES` for retry attempts
11//!
12//! Consider using the `config` crate for layered configuration:
13//! defaults -> config file -> environment variables -> CLI args
14
15use serde::{Deserialize, Serialize};
16use std::fmt;
17use std::path::{Path, PathBuf};
18use std::str::FromStr;
19use std::time::Duration;
20
21use crate::circuit_breaker::CircuitBreakerConfig;
22use crate::error::AppError;
23
24// =============================================================================
25// Embedding Provider Configuration
26// =============================================================================
27
28/// Embedding provider type.
29///
30/// Determines which embedding API to use for generating text embeddings.
31#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
32#[serde(rename_all = "lowercase")]
33pub enum EmbeddingProviderType {
34    /// Google Gemini gemini-embedding-001 (768 dimensions).
35    #[default]
36    Gemini,
37    /// OpenAI text-embedding-3-small (1536d) or text-embedding-3-large (3072d).
38    OpenAI,
39}
40
41impl fmt::Display for EmbeddingProviderType {
42    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
43        match self {
44            Self::Gemini => write!(f, "gemini"),
45            Self::OpenAI => write!(f, "openai"),
46        }
47    }
48}
49
50impl FromStr for EmbeddingProviderType {
51    type Err = AppError;
52
53    fn from_str(s: &str) -> Result<Self, Self::Err> {
54        match s.to_lowercase().as_str() {
55            "gemini" => Ok(Self::Gemini),
56            "openai" => Ok(Self::OpenAI),
57            _ => Err(AppError::ConfigError(format!(
58                "Unknown embedding provider: '{}'. Valid options: gemini, openai",
59                s
60            ))),
61        }
62    }
63}
64
65/// Gemini embedding provider configuration.
66#[derive(Debug, Clone, Serialize, Deserialize)]
67pub struct GeminiEmbeddingConfig {
68    /// Gemini model name.
69    #[serde(default = "default_gemini_model")]
70    pub model: String,
71}
72
/// Model name used when a Gemini configuration does not specify one.
fn default_gemini_model() -> String {
    String::from("gemini-embedding-001")
}
76
77impl Default for GeminiEmbeddingConfig {
78    fn default() -> Self {
79        Self {
80            model: default_gemini_model(),
81        }
82    }
83}
84
85/// OpenAI embedding provider configuration.
86#[derive(Debug, Clone, Serialize, Deserialize)]
87pub struct OpenAIEmbeddingConfig {
88    /// OpenAI model name.
89    #[serde(default = "default_openai_model")]
90    pub model: String,
91    /// Custom API endpoint (for Azure OpenAI or proxies).
92    pub endpoint: Option<String>,
93}
94
/// Model name used when an OpenAI configuration does not specify one.
fn default_openai_model() -> String {
    String::from("text-embedding-3-small")
}
98
99impl Default for OpenAIEmbeddingConfig {
100    fn default() -> Self {
101        Self {
102            model: default_openai_model(),
103            endpoint: None,
104        }
105    }
106}
107
108/// Returns the embedding dimension for a given provider and model.
109///
110/// # Arguments
111///
112/// * `provider` - The embedding provider type
113/// * `model` - The model name (optional, uses default if None)
114pub fn embedding_dimension(provider: EmbeddingProviderType, model: Option<&str>) -> usize {
115    match provider {
116        EmbeddingProviderType::Gemini => 768, // gemini-embedding-001 with output_dimensionality=768
117        EmbeddingProviderType::OpenAI => match model.unwrap_or("text-embedding-3-small") {
118            "text-embedding-3-large" => 3072,
119            _ => 1536, // text-embedding-3-small and ada-002
120        },
121    }
122}
123
/// Database connection pool configuration.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so it can be logged,
/// duplicated and compared.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DbConfig {
    /// Upper bound on the number of pooled database connections.
    pub max_connections: u32,
}

impl Default for DbConfig {
    /// Returns a pool configuration capped at 5 connections.
    fn default() -> Self {
        Self { max_connections: 5 }
    }
}
134
/// HTTP client configuration for external API calls.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so it can be logged,
/// duplicated and compared.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HttpConfig {
    /// Request timeout.
    pub timeout: Duration,
    /// Maximum number of retry attempts.
    pub max_retries: u32,
    /// Base delay between retries.
    pub retry_base_delay: Duration,
}

impl Default for HttpConfig {
    /// Returns a 30 s timeout, 3 retries and a 500 ms base retry delay.
    fn default() -> Self {
        Self {
            timeout: Duration::from_secs(30),
            max_retries: 3,
            retry_base_delay: Duration::from_millis(500),
        }
    }
}
151
/// Harvest-only configuration (metadata fetching + delta detection).
///
/// Used by [`crate::HarvestService`] when running without an embedding provider.
/// Contains only the settings relevant to portal fetching and persistence.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HarvestConfig {
    /// Number of concurrent dataset processing tasks.
    pub concurrency: usize,
    /// Maximum number of datasets per DB upsert batch.
    pub upsert_batch_size: usize,
    /// Force full sync even if incremental sync is available.
    pub force_full_sync: bool,
    /// Preview mode: fetch and compare datasets without writing to DB.
    pub dry_run: bool,
}

impl Default for HarvestConfig {
    /// Returns 10 concurrent tasks, 500-row upsert batches, and both
    /// `force_full_sync` and `dry_run` switched off.
    fn default() -> Self {
        Self {
            concurrency: 10,
            upsert_batch_size: 500,
            force_full_sync: false,
            dry_run: false,
        }
    }
}

impl HarvestConfig {
    /// Returns this configuration with `force_full_sync` enabled.
    ///
    /// Consumes `self`; the returned value must be used, otherwise the
    /// change is lost.
    #[must_use]
    pub fn with_full_sync(self) -> Self {
        Self {
            force_full_sync: true,
            ..self
        }
    }

    /// Returns this configuration with `dry_run` enabled.
    ///
    /// Consumes `self`; the returned value must be used, otherwise the
    /// change is lost.
    #[must_use]
    pub fn with_dry_run(self) -> Self {
        Self {
            dry_run: true,
            ..self
        }
    }
}
192
193/// Embedding service configuration.
194///
195/// Used by [`crate::EmbeddingService`] for standalone embedding passes.
196/// Contains only the settings relevant to embedding API calls.
197#[derive(Clone)]
198pub struct EmbeddingServiceConfig {
199    /// Maximum number of texts per embedding API batch call.
200    /// The actual batch size is `min(this, provider.max_batch_size())`.
201    pub batch_size: usize,
202    /// Circuit breaker configuration for embedding API resilience.
203    pub circuit_breaker: CircuitBreakerConfig,
204}
205
206impl Default for EmbeddingServiceConfig {
207    fn default() -> Self {
208        Self {
209            batch_size: 64,
210            circuit_breaker: CircuitBreakerConfig::default(),
211        }
212    }
213}
214
215/// Portal synchronization configuration (combined harvest + embed).
216///
217/// This is a convenience type that composes [`HarvestConfig`] and
218/// [`EmbeddingServiceConfig`] for the common harvest-then-embed workflow.
219/// Used by [`crate::HarvestPipeline`] and kept for backward compatibility.
220///
221/// TODO(config): Support CLI arg `--concurrency` and env var `SYNC_CONCURRENCY`
222/// Optimal value depends on portal rate limits and system resources.
223/// Consider auto-tuning based on API response times.
224#[derive(Clone)]
225pub struct SyncConfig {
226    /// Number of concurrent dataset processing tasks.
227    pub concurrency: usize,
228    /// Maximum number of texts per embedding API batch call.
229    /// The actual batch size is `min(this, provider.max_batch_size())`.
230    pub embedding_batch_size: usize,
231    /// Maximum number of datasets per DB upsert batch.
232    pub upsert_batch_size: usize,
233    /// Force full sync even if incremental sync is available.
234    pub force_full_sync: bool,
235    /// Preview mode: fetch and compare datasets without writing to DB or calling embedding API.
236    pub dry_run: bool,
237    /// Circuit breaker configuration for API resilience.
238    pub circuit_breaker: CircuitBreakerConfig,
239}
240
241impl Default for SyncConfig {
242    fn default() -> Self {
243        // TODO(config): Read from SYNC_CONCURRENCY env var
244        Self {
245            concurrency: 10,
246            embedding_batch_size: 64,
247            upsert_batch_size: 500,
248            force_full_sync: false,
249            dry_run: false,
250            circuit_breaker: CircuitBreakerConfig::default(),
251        }
252    }
253}
254
255impl SyncConfig {
256    /// Creates a new SyncConfig with force_full_sync enabled.
257    pub fn with_full_sync(mut self) -> Self {
258        self.force_full_sync = true;
259        self
260    }
261
262    /// Creates a new SyncConfig with dry_run enabled.
263    pub fn with_dry_run(mut self) -> Self {
264        self.dry_run = true;
265        self
266    }
267
268    /// Creates a new SyncConfig with a custom embedding batch size.
269    pub fn with_embedding_batch_size(mut self, size: usize) -> Self {
270        self.embedding_batch_size = size.max(1);
271        self
272    }
273
274    /// Creates a new SyncConfig with custom circuit breaker configuration.
275    pub fn with_circuit_breaker(mut self, config: CircuitBreakerConfig) -> Self {
276        self.circuit_breaker = config;
277        self
278    }
279
280    /// Extracts the harvest-only configuration.
281    pub fn harvest_config(&self) -> HarvestConfig {
282        HarvestConfig {
283            concurrency: self.concurrency,
284            upsert_batch_size: self.upsert_batch_size,
285            force_full_sync: self.force_full_sync,
286            dry_run: self.dry_run,
287        }
288    }
289
290    /// Extracts the embedding service configuration.
291    pub fn embedding_service_config(&self) -> EmbeddingServiceConfig {
292        EmbeddingServiceConfig {
293            batch_size: self.embedding_batch_size,
294            circuit_breaker: self.circuit_breaker.clone(),
295        }
296    }
297}
298
299// =============================================================================
300// Portal Configuration (portals.toml)
301// =============================================================================
302
303/// Portal type identifier.
304///
305/// Determines which portal API client to use for harvesting.
306#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
307#[serde(rename_all = "lowercase")]
308pub enum PortalType {
309    /// CKAN open data portal (default).
310    #[default]
311    Ckan,
312    /// Socrata open data portal (US cities: NYC, Chicago, SF).
313    Socrata,
314    /// DCAT-AP / SPARQL endpoint (EU portals, data.europa.eu).
315    Dcat,
316}
317
318impl fmt::Display for PortalType {
319    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
320        match self {
321            Self::Ckan => write!(f, "ckan"),
322            Self::Socrata => write!(f, "socrata"),
323            Self::Dcat => write!(f, "dcat"),
324        }
325    }
326}
327
328impl FromStr for PortalType {
329    type Err = AppError;
330
331    fn from_str(s: &str) -> Result<Self, Self::Err> {
332        match s.to_lowercase().as_str() {
333            "ckan" => Ok(Self::Ckan),
334            "socrata" => Ok(Self::Socrata),
335            "dcat" => Ok(Self::Dcat),
336            _ => Err(AppError::ConfigError(format!(
337                "Unknown portal type: '{}'. Valid options: ckan, socrata, dcat",
338                s
339            ))),
340        }
341    }
342}
343
/// Serde default for `PortalEntry::enabled`: portals are enabled unless the
/// configuration says otherwise.
fn default_enabled() -> bool {
    true
}
348
349/// Root configuration structure for portals.toml.
350///
351/// This structure represents the entire configuration file containing
352/// an array of portal definitions.
353///
354/// # Example
355///
356/// ```toml
357/// [[portals]]
358/// name = "dati-gov-it"
359/// url = "https://dati.gov.it"
360/// type = "ckan"
361/// description = "Italian national open data portal"
362///
363/// [[portals]]
364/// name = "milano"
365/// url = "https://dati.comune.milano.it"
366/// enabled = true
367/// ```
368#[derive(Debug, Clone, Serialize, Deserialize)]
369pub struct PortalsConfig {
370    /// Array of portal configurations.
371    pub portals: Vec<PortalEntry>,
372}
373
374impl PortalsConfig {
375    /// Returns only enabled portals.
376    ///
377    /// Portals with `enabled = false` are excluded from batch harvesting.
378    pub fn enabled_portals(&self) -> Vec<&PortalEntry> {
379        self.portals.iter().filter(|p| p.enabled).collect()
380    }
381
382    /// Find a portal by name (case-insensitive).
383    ///
384    /// # Arguments
385    /// * `name` - The portal name to search for.
386    ///
387    /// # Returns
388    /// The matching portal entry, or None if not found.
389    pub fn find_by_name(&self, name: &str) -> Option<&PortalEntry> {
390        self.portals
391            .iter()
392            .find(|p| p.name.eq_ignore_ascii_case(name))
393    }
394}
395
396/// A single portal entry in the configuration file.
397///
398/// Each portal entry defines a CKAN portal to harvest, including
399/// its URL, type, and whether it's enabled for batch harvesting.
400#[derive(Debug, Clone, Serialize, Deserialize)]
401pub struct PortalEntry {
402    /// Human-readable portal name.
403    ///
404    /// Used for `--portal <name>` lookup and logging.
405    pub name: String,
406
407    /// Base URL of the CKAN portal.
408    ///
409    /// Example: "<https://dati.comune.milano.it>"
410    pub url: String,
411
412    /// Portal type: ckan, socrata, or dcat.
413    ///
414    /// Defaults to `Ckan` if not specified.
415    #[serde(rename = "type", default)]
416    pub portal_type: PortalType,
417
418    /// Whether this portal is enabled for batch harvesting.
419    ///
420    /// Defaults to `true` if not specified.
421    #[serde(default = "default_enabled")]
422    pub enabled: bool,
423
424    /// Optional description of the portal.
425    pub description: Option<String>,
426
427    /// Optional URL template for dataset landing pages.
428    ///
429    /// Supports placeholders:
430    /// - `{id}` — dataset UUID from the CKAN API
431    /// - `{name}` — dataset slug/name
432    ///
433    /// If not set, defaults to `{portal_url}/dataset/{name}`.
434    pub url_template: Option<String>,
435
436    /// Preferred language for multilingual portals (e.g., `"en"`, `"de"`, `"fr"`).
437    ///
438    /// Some portals return title and description as language-keyed objects.
439    /// This field controls which language is selected when resolving those fields.
440    /// Defaults to `"en"` when not specified.
441    pub language: Option<String>,
442}
443
444impl PortalEntry {
445    /// Returns the preferred language, defaulting to `"en"`.
446    pub fn language(&self) -> &str {
447        self.language.as_deref().unwrap_or("en")
448    }
449}
450
/// File name of the portal configuration inside the configuration directory.
pub const CONFIG_FILE_NAME: &str = "portals.toml";
453
454/// Returns the default configuration directory path.
455///
456/// Uses XDG Base Directory specification: `~/.config/ceres/`
457pub fn default_config_dir() -> Option<PathBuf> {
458    dirs::config_dir().map(|p| p.join("ceres"))
459}
460
461/// Returns the default configuration file path.
462///
463/// Path: `~/.config/ceres/portals.toml`
464pub fn default_config_path() -> Option<PathBuf> {
465    default_config_dir().map(|p| p.join(CONFIG_FILE_NAME))
466}
467
/// Contents written to a freshly created `portals.toml`.
///
/// Ships with two pre-configured Italian open data portals (Milan and
/// Sicily) so that `ceres harvest` works without manual configuration.
const DEFAULT_CONFIG_TEMPLATE: &str = r#"# Ceres Portal Configuration
#
# Usage:
#   ceres harvest                 # Harvest all enabled portals
#   ceres harvest --portal milano # Harvest specific portal by name
#   ceres harvest https://...     # Harvest single URL (ignores this file)
#
# Set enabled = false to skip a portal during batch harvest.
# Use url_template for portals with non-standard frontends:
#   url_template = "https://example.com/dataset?id={id}"
#   Placeholders: {id} = dataset UUID, {name} = dataset slug

# City of Milan open data
[[portals]]
name = "milano"
url = "https://dati.comune.milano.it"
type = "ckan"
description = "Open data del Comune di Milano"

# Sicily Region open data
[[portals]]
name = "sicilia"
url = "https://dati.regione.sicilia.it"
type = "ckan"
description = "Open data della Regione Siciliana"
"#;
498
499/// Load portal configuration from a TOML file.
500///
501/// # Arguments
502/// * `path` - Optional custom path. If `None`, uses default XDG path.
503///
504/// # Returns
505/// * `Ok(Some(config))` - Configuration loaded successfully
506/// * `Ok(None)` - No configuration file found (not an error for backward compatibility)
507/// * `Err(e)` - Configuration file exists but is invalid
508///
509/// # Behavior
510/// If no configuration file exists at the default path, a template file
511/// is automatically created to help users get started.
512pub fn load_portals_config(path: Option<PathBuf>) -> Result<Option<PortalsConfig>, AppError> {
513    let using_default_path = path.is_none();
514    let config_path = match path {
515        Some(p) => p,
516        None => match default_config_path() {
517            Some(p) => p,
518            None => return Ok(None),
519        },
520    };
521
522    if !config_path.exists() {
523        // Auto-create template if using default path
524        if using_default_path {
525            match create_default_config(&config_path) {
526                Ok(()) => {
527                    // Template created successfully - read it and return the config
528                    // This allows the user to immediately harvest without re-running
529                    tracing::info!(
530                        "Config file created at {}. Starting harvest with default portals...",
531                        config_path.display()
532                    );
533                    // Continue to read the newly created file below
534                }
535                Err(e) => {
536                    // Log warning but don't fail - user might not have write permissions
537                    tracing::warn!("Could not create default config template: {}", e);
538                    return Ok(None);
539                }
540            }
541        } else {
542            // Custom path specified but doesn't exist - that's an error
543            return Err(AppError::ConfigError(format!(
544                "Config file not found: {}",
545                config_path.display()
546            )));
547        }
548    }
549
550    let content = std::fs::read_to_string(&config_path).map_err(|e| {
551        AppError::ConfigError(format!(
552            "Failed to read config file '{}': {}",
553            config_path.display(),
554            e
555        ))
556    })?;
557
558    let config: PortalsConfig = toml::from_str(&content).map_err(|e| {
559        AppError::ConfigError(format!(
560            "Invalid TOML in '{}': {}",
561            config_path.display(),
562            e
563        ))
564    })?;
565
566    Ok(Some(config))
567}
568
569/// Create a default configuration file with a template.
570///
571/// Creates the parent directory if it doesn't exist.
572///
573/// # Arguments
574/// * `path` - The path where the config file should be created.
575fn create_default_config(path: &Path) -> std::io::Result<()> {
576    // Create parent directory if needed
577    if let Some(parent) = path.parent() {
578        std::fs::create_dir_all(parent)?;
579    }
580
581    std::fs::write(path, DEFAULT_CONFIG_TEMPLATE)?;
582    tracing::info!("Created default config template at: {}", path.display());
583
584    Ok(())
585}
586
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Parses an inline TOML snippet into a [`PortalsConfig`], panicking on invalid input.
    fn parse_portals(src: &str) -> PortalsConfig {
        toml::from_str(src).unwrap()
    }

    /// Writes `contents` followed by a newline to a fresh temporary file.
    fn temp_config(contents: &str) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        writeln!(file, "{}", contents).unwrap();
        file
    }

    /// Loads the config stored in `file`, expecting it to load and be present.
    fn load_from(file: &NamedTempFile) -> PortalsConfig {
        load_portals_config(Some(file.path().to_path_buf()))
            .unwrap()
            .unwrap()
    }

    #[test]
    fn test_db_config_defaults() {
        assert_eq!(DbConfig::default().max_connections, 5);
    }

    #[test]
    fn test_http_config_defaults() {
        let http = HttpConfig::default();
        assert_eq!(http.timeout, Duration::from_secs(30));
        assert_eq!(http.max_retries, 3);
        assert_eq!(http.retry_base_delay, Duration::from_millis(500));
    }

    #[test]
    fn test_sync_config_defaults() {
        let sync = SyncConfig::default();
        assert_eq!(sync.concurrency, 10);
        assert_eq!(sync.upsert_batch_size, 500);
    }

    #[test]
    fn test_sync_config_harvest_config_upsert_batch_size() {
        let sync = SyncConfig::default();
        let harvest = sync.harvest_config();
        assert_eq!(harvest.upsert_batch_size, 500);
        assert_ne!(harvest.upsert_batch_size, sync.embedding_batch_size);
    }

    // =========================================================================
    // Portal Configuration Tests
    // =========================================================================

    #[test]
    fn test_portals_config_deserialize() {
        let config = parse_portals(
            r#"
[[portals]]
name = "test-portal"
url = "https://example.com"
type = "ckan"
"#,
        );
        assert_eq!(config.portals.len(), 1);

        let portal = &config.portals[0];
        assert_eq!(portal.name, "test-portal");
        assert_eq!(portal.url, "https://example.com");
        assert_eq!(portal.portal_type, PortalType::Ckan);
        assert!(portal.enabled); // default
        assert!(portal.description.is_none());
    }

    #[test]
    fn test_portals_config_defaults() {
        let config = parse_portals(
            r#"
[[portals]]
name = "minimal"
url = "https://example.com"
"#,
        );
        let portal = &config.portals[0];
        assert_eq!(portal.portal_type, PortalType::Ckan); // default type
        assert!(portal.enabled); // default enabled
    }

    #[test]
    fn test_portals_config_enabled_filter() {
        let config = parse_portals(
            r#"
[[portals]]
name = "enabled-portal"
url = "https://a.com"

[[portals]]
name = "disabled-portal"
url = "https://b.com"
enabled = false
"#,
        );
        let enabled = config.enabled_portals();
        assert_eq!(enabled.len(), 1);
        assert_eq!(enabled[0].name, "enabled-portal");
    }

    #[test]
    fn test_portals_config_find_by_name() {
        let config = parse_portals(
            r#"
[[portals]]
name = "Milano"
url = "https://dati.comune.milano.it"
"#,
        );

        // Case-insensitive search
        for spelling in ["milano", "MILANO", "Milano"] {
            assert!(config.find_by_name(spelling).is_some());
        }

        // Not found
        assert!(config.find_by_name("roma").is_none());
    }

    #[test]
    fn test_portals_config_with_description() {
        let config = parse_portals(
            r#"
[[portals]]
name = "test"
url = "https://example.com"
description = "A test portal"
"#,
        );
        assert_eq!(
            config.portals[0].description,
            Some("A test portal".to_string())
        );
    }

    #[test]
    fn test_portals_config_multiple_portals() {
        let config = parse_portals(
            r#"
[[portals]]
name = "portal-1"
url = "https://a.com"

[[portals]]
name = "portal-2"
url = "https://b.com"

[[portals]]
name = "portal-3"
url = "https://c.com"
enabled = false
"#,
        );
        assert_eq!(config.portals.len(), 3);
        assert_eq!(config.enabled_portals().len(), 2);
    }

    #[test]
    fn test_default_config_path() {
        // Only checks that the call does not panic; the actual path is
        // platform-dependent.
        if let Some(resolved) = default_config_path() {
            assert!(resolved.ends_with("portals.toml"));
        }
    }

    // =========================================================================
    // load_portals_config() tests with real files
    // =========================================================================

    #[test]
    fn test_load_portals_config_valid_file() {
        let file = temp_config(
            r#"
[[portals]]
name = "test"
url = "https://test.com"
"#,
        );

        let config = load_from(&file);

        assert_eq!(config.portals.len(), 1);
        assert_eq!(config.portals[0].name, "test");
        assert_eq!(config.portals[0].url, "https://test.com");
    }

    #[test]
    fn test_load_portals_config_custom_path_not_found() {
        let outcome = load_portals_config(Some("/nonexistent/path/to/config.toml".into()));
        assert!(outcome.is_err());
        assert!(matches!(outcome.unwrap_err(), AppError::ConfigError(_)));
    }

    #[test]
    fn test_load_portals_config_invalid_toml() {
        let file = temp_config("this is not valid toml {{");

        let outcome = load_portals_config(Some(file.path().to_path_buf()));
        assert!(outcome.is_err());
        assert!(matches!(outcome.unwrap_err(), AppError::ConfigError(_)));
    }

    #[test]
    fn test_load_portals_config_multiple_portals_with_enabled_filter() {
        let file = temp_config(
            r#"
[[portals]]
name = "enabled-portal"
url = "https://a.com"

[[portals]]
name = "disabled-portal"
url = "https://b.com"
enabled = false

[[portals]]
name = "another-enabled"
url = "https://c.com"
enabled = true
"#,
        );

        let config = load_from(&file);

        assert_eq!(config.portals.len(), 3);
        assert_eq!(config.enabled_portals().len(), 2);
    }

    #[test]
    fn test_load_portals_config_with_all_fields() {
        let file = temp_config(
            r#"
[[portals]]
name = "full-config"
url = "https://example.com"
type = "ckan"
enabled = true
description = "A fully configured portal"
"#,
        );

        let config = load_from(&file);

        let portal = &config.portals[0];
        assert_eq!(portal.name, "full-config");
        assert_eq!(portal.url, "https://example.com");
        assert_eq!(portal.portal_type, PortalType::Ckan);
        assert!(portal.enabled);
        assert_eq!(
            portal.description,
            Some("A fully configured portal".to_string())
        );
    }

    #[test]
    fn test_load_portals_config_empty_portals_array() {
        let file = temp_config("portals = []");

        let config = load_from(&file);

        assert!(config.portals.is_empty());
        assert!(config.enabled_portals().is_empty());
    }

    // =========================================================================
    // Embedding Provider Configuration Tests
    // =========================================================================

    #[test]
    fn test_embedding_provider_type_from_str() {
        let cases = [
            ("gemini", EmbeddingProviderType::Gemini),
            ("openai", EmbeddingProviderType::OpenAI),
            ("GEMINI", EmbeddingProviderType::Gemini),
            ("OpenAI", EmbeddingProviderType::OpenAI),
        ];
        for (input, expected) in cases {
            assert_eq!(input.parse::<EmbeddingProviderType>().unwrap(), expected);
        }
    }

    #[test]
    fn test_embedding_provider_type_invalid() {
        assert!("invalid".parse::<EmbeddingProviderType>().is_err());
    }

    #[test]
    fn test_embedding_provider_type_display() {
        assert_eq!(EmbeddingProviderType::Gemini.to_string(), "gemini");
        assert_eq!(EmbeddingProviderType::OpenAI.to_string(), "openai");
    }

    #[test]
    fn test_embedding_dimension() {
        let gemini = EmbeddingProviderType::Gemini;
        let openai = EmbeddingProviderType::OpenAI;

        // Gemini is always 768
        assert_eq!(embedding_dimension(gemini, None), 768);
        assert_eq!(
            embedding_dimension(gemini, Some("gemini-embedding-001")),
            768
        );

        // OpenAI defaults to 1536
        assert_eq!(embedding_dimension(openai, None), 1536);
        assert_eq!(
            embedding_dimension(openai, Some("text-embedding-3-small")),
            1536
        );
        assert_eq!(
            embedding_dimension(openai, Some("text-embedding-3-large")),
            3072
        );
    }

    #[test]
    fn test_gemini_embedding_config_default() {
        assert_eq!(
            GeminiEmbeddingConfig::default().model,
            "gemini-embedding-001"
        );
    }

    #[test]
    fn test_openai_embedding_config_default() {
        let config = OpenAIEmbeddingConfig::default();
        assert_eq!(config.model, "text-embedding-3-small");
        assert!(config.endpoint.is_none());
    }
}