Skip to main content

ceres_core/
config.rs

1//! Configuration types for Ceres components.
2//!
3//! # Configuration Improvements
4//!
5//! TODO(config): Make all configuration values environment-configurable
6//! Currently all defaults are hardcoded. Should support:
7//! - `DB_MAX_CONNECTIONS` for database pool size
8//! - `SYNC_CONCURRENCY` for parallel dataset processing
9//! - `HTTP_TIMEOUT` for API request timeout
10//! - `HTTP_MAX_RETRIES` for retry attempts
11//!
12//! Consider using the `config` crate for layered configuration:
13//! defaults -> config file -> environment variables -> CLI args
14
15use serde::{Deserialize, Serialize};
16use std::fmt;
17use std::path::{Path, PathBuf};
18use std::str::FromStr;
19use std::time::Duration;
20
21use crate::circuit_breaker::CircuitBreakerConfig;
22use crate::error::AppError;
23
24// =============================================================================
25// Embedding Provider Configuration
26// =============================================================================
27
/// Embedding provider type.
///
/// Determines which embedding API to use for generating text embeddings.
///
/// With `rename_all = "lowercase"` the serde representation is the lowercase
/// variant name (`"gemini"`, `"openai"`, `"ollama"`), the same spelling the
/// `Display` and `FromStr` impls in this module use. `Gemini` is the
/// `Default` variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum EmbeddingProviderType {
    /// Google Gemini gemini-embedding-001 (768 dimensions).
    #[default]
    Gemini,
    /// OpenAI text-embedding-3-small (1536d) or text-embedding-3-large (3072d).
    OpenAI,
    /// Ollama local embedding (default: nomic-embed-text, 768 dimensions).
    Ollama,
}
42
43impl fmt::Display for EmbeddingProviderType {
44    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
45        match self {
46            Self::Gemini => write!(f, "gemini"),
47            Self::OpenAI => write!(f, "openai"),
48            Self::Ollama => write!(f, "ollama"),
49        }
50    }
51}
52
53impl FromStr for EmbeddingProviderType {
54    type Err = AppError;
55
56    fn from_str(s: &str) -> Result<Self, Self::Err> {
57        match s.to_lowercase().as_str() {
58            "gemini" => Ok(Self::Gemini),
59            "openai" => Ok(Self::OpenAI),
60            "ollama" => Ok(Self::Ollama),
61            _ => Err(AppError::ConfigError(format!(
62                "Unknown embedding provider: '{}'. Valid options: gemini, openai, ollama",
63                s
64            ))),
65        }
66    }
67}
68
/// Gemini embedding provider configuration.
///
/// When deserialized, a missing `model` key falls back to the value returned
/// by `default_gemini_model`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GeminiEmbeddingConfig {
    /// Gemini model name (default: `"gemini-embedding-001"`).
    #[serde(default = "default_gemini_model")]
    pub model: String,
}
76
/// Model name used when a Gemini configuration does not specify one.
fn default_gemini_model() -> String {
    String::from("gemini-embedding-001")
}
80
81impl Default for GeminiEmbeddingConfig {
82    fn default() -> Self {
83        Self {
84            model: default_gemini_model(),
85        }
86    }
87}
88
/// OpenAI embedding provider configuration.
///
/// When deserialized, a missing `model` key falls back to the value returned
/// by `default_openai_model`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OpenAIEmbeddingConfig {
    /// OpenAI model name (default: `"text-embedding-3-small"`).
    #[serde(default = "default_openai_model")]
    pub model: String,
    /// Custom API endpoint (for Azure OpenAI or proxies).
    ///
    /// `None` means no custom endpoint is configured.
    pub endpoint: Option<String>,
}
98
/// Model name used when an OpenAI configuration does not specify one.
fn default_openai_model() -> String {
    String::from("text-embedding-3-small")
}
102
103impl Default for OpenAIEmbeddingConfig {
104    fn default() -> Self {
105        Self {
106            model: default_openai_model(),
107            endpoint: None,
108        }
109    }
110}
111
/// Ollama embedding provider configuration.
///
/// When deserialized, missing keys fall back to `default_ollama_model` and
/// `default_ollama_endpoint` respectively.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OllamaEmbeddingConfig {
    /// Ollama model name (default: `"nomic-embed-text"`).
    #[serde(default = "default_ollama_model")]
    pub model: String,
    /// Ollama API endpoint (default: `"http://localhost:11434"`).
    #[serde(default = "default_ollama_endpoint")]
    pub endpoint: String,
}
122
/// Model name used when an Ollama configuration does not specify one.
fn default_ollama_model() -> String {
    String::from("nomic-embed-text")
}
126
/// Endpoint used when an Ollama configuration does not specify one.
fn default_ollama_endpoint() -> String {
    String::from("http://localhost:11434")
}
130
131impl Default for OllamaEmbeddingConfig {
132    fn default() -> Self {
133        Self {
134            model: default_ollama_model(),
135            endpoint: default_ollama_endpoint(),
136        }
137    }
138}
139
140/// Returns the embedding dimension for a given provider and model.
141///
142/// # Arguments
143///
144/// * `provider` - The embedding provider type
145/// * `model` - The model name (optional, uses default if None)
146pub fn embedding_dimension(provider: EmbeddingProviderType, model: Option<&str>) -> usize {
147    match provider {
148        EmbeddingProviderType::Gemini => 768, // gemini-embedding-001 with output_dimensionality=768
149        EmbeddingProviderType::OpenAI => match model.unwrap_or("text-embedding-3-small") {
150            "text-embedding-3-large" => 3072,
151            _ => 1536, // text-embedding-3-small and ada-002
152        },
153        EmbeddingProviderType::Ollama => {
154            // Normalize tagged model identifiers like "snowflake-arctic-embed:335m"
155            let normalized = model
156                .map(|m| m.split(':').next().unwrap_or(m))
157                .unwrap_or("nomic-embed-text");
158
159            match normalized {
160                "mxbai-embed-large" | "snowflake-arctic-embed" => 1024,
161                "all-minilm" => 384,
162                _ => 768, // nomic-embed-text and unknown models
163            }
164        }
165    }
166}
167
/// Database connection pool configuration.
///
/// Derives `Debug` and `Clone`, like the other configuration types in this
/// module, so the value can be logged and duplicated.
#[derive(Debug, Clone)]
pub struct DbConfig {
    /// Maximum number of connections in the pool (default: 5).
    pub max_connections: u32,
}
172
173impl Default for DbConfig {
174    fn default() -> Self {
175        Self { max_connections: 5 }
176    }
177}
178
/// HTTP client configuration for external API calls.
///
/// Derives `Debug` and `Clone`, like the other configuration types in this
/// module, so the value can be logged and duplicated.
#[derive(Debug, Clone)]
pub struct HttpConfig {
    /// Timeout applied to an API request (default: 30 s).
    pub timeout: Duration,
    /// Number of retry attempts (default: 3).
    pub max_retries: u32,
    /// Base delay used when scheduling retries (default: 500 ms).
    pub retry_base_delay: Duration,
}
185
186impl Default for HttpConfig {
187    fn default() -> Self {
188        Self {
189            timeout: Duration::from_secs(30),
190            max_retries: 3,
191            retry_base_delay: Duration::from_millis(500),
192        }
193    }
194}
195
/// Harvest-only configuration (metadata fetching + delta detection).
///
/// Used by [`crate::HarvestService`] when running without an embedding provider.
/// Contains only the settings relevant to portal fetching and persistence.
///
/// Derives `Debug` in addition to `Clone` so the active settings can be
/// logged.
#[derive(Debug, Clone)]
pub struct HarvestConfig {
    /// Number of concurrent dataset processing tasks (default: 10).
    pub concurrency: usize,
    /// Maximum number of datasets per DB upsert batch (default: 500).
    pub upsert_batch_size: usize,
    /// Force full sync even if incremental sync is available (default: `false`).
    pub force_full_sync: bool,
    /// Preview mode: fetch and compare datasets without writing to DB
    /// (default: `false`).
    pub dry_run: bool,
}
211
212impl Default for HarvestConfig {
213    fn default() -> Self {
214        Self {
215            concurrency: 10,
216            upsert_batch_size: 500,
217            force_full_sync: false,
218            dry_run: false,
219        }
220    }
221}
222
223impl HarvestConfig {
224    /// Creates a new HarvestConfig with force_full_sync enabled.
225    pub fn with_full_sync(mut self) -> Self {
226        self.force_full_sync = true;
227        self
228    }
229
230    /// Creates a new HarvestConfig with dry_run enabled.
231    pub fn with_dry_run(mut self) -> Self {
232        self.dry_run = true;
233        self
234    }
235}
236
/// Embedding service configuration.
///
/// Used by [`crate::EmbeddingService`] for standalone embedding passes.
/// Contains only the settings relevant to embedding API calls.
///
/// `SyncConfig::embedding_service_config` builds this type from the combined
/// sync settings.
#[derive(Clone)]
pub struct EmbeddingServiceConfig {
    /// Maximum number of texts per embedding API batch call (default: 64).
    /// The actual batch size is `min(this, provider.max_batch_size())`.
    pub batch_size: usize,
    /// Circuit breaker configuration for embedding API resilience.
    pub circuit_breaker: CircuitBreakerConfig,
}
249
250impl Default for EmbeddingServiceConfig {
251    fn default() -> Self {
252        Self {
253            batch_size: 64,
254            circuit_breaker: CircuitBreakerConfig::default(),
255        }
256    }
257}
258
/// Portal synchronization configuration (combined harvest + embed).
///
/// This is a convenience type that composes [`HarvestConfig`] and
/// [`EmbeddingServiceConfig`] for the common harvest-then-embed workflow.
/// Used by [`crate::HarvestPipeline`] and kept for backward compatibility.
///
/// The `harvest_config` and `embedding_service_config` methods split the
/// combined settings back into the two narrower types.
///
/// TODO(config): Support CLI arg `--concurrency` and env var `SYNC_CONCURRENCY`
/// Optimal value depends on portal rate limits and system resources.
/// Consider auto-tuning based on API response times.
#[derive(Clone)]
pub struct SyncConfig {
    /// Number of concurrent dataset processing tasks (default: 10).
    pub concurrency: usize,
    /// Maximum number of texts per embedding API batch call (default: 64).
    /// The actual batch size is `min(this, provider.max_batch_size())`.
    pub embedding_batch_size: usize,
    /// Maximum number of datasets per DB upsert batch (default: 500).
    pub upsert_batch_size: usize,
    /// Force full sync even if incremental sync is available (default: `false`).
    pub force_full_sync: bool,
    /// Preview mode: fetch and compare datasets without writing to DB or calling embedding API.
    pub dry_run: bool,
    /// Circuit breaker configuration for API resilience.
    pub circuit_breaker: CircuitBreakerConfig,
}
284
285impl Default for SyncConfig {
286    fn default() -> Self {
287        // TODO(config): Read from SYNC_CONCURRENCY env var
288        Self {
289            concurrency: 10,
290            embedding_batch_size: 64,
291            upsert_batch_size: 500,
292            force_full_sync: false,
293            dry_run: false,
294            circuit_breaker: CircuitBreakerConfig::default(),
295        }
296    }
297}
298
299impl SyncConfig {
300    /// Creates a new SyncConfig with force_full_sync enabled.
301    pub fn with_full_sync(mut self) -> Self {
302        self.force_full_sync = true;
303        self
304    }
305
306    /// Creates a new SyncConfig with dry_run enabled.
307    pub fn with_dry_run(mut self) -> Self {
308        self.dry_run = true;
309        self
310    }
311
312    /// Creates a new SyncConfig with a custom embedding batch size.
313    pub fn with_embedding_batch_size(mut self, size: usize) -> Self {
314        self.embedding_batch_size = size.max(1);
315        self
316    }
317
318    /// Creates a new SyncConfig with custom circuit breaker configuration.
319    pub fn with_circuit_breaker(mut self, config: CircuitBreakerConfig) -> Self {
320        self.circuit_breaker = config;
321        self
322    }
323
324    /// Extracts the harvest-only configuration.
325    pub fn harvest_config(&self) -> HarvestConfig {
326        HarvestConfig {
327            concurrency: self.concurrency,
328            upsert_batch_size: self.upsert_batch_size,
329            force_full_sync: self.force_full_sync,
330            dry_run: self.dry_run,
331        }
332    }
333
334    /// Extracts the embedding service configuration.
335    pub fn embedding_service_config(&self) -> EmbeddingServiceConfig {
336        EmbeddingServiceConfig {
337            batch_size: self.embedding_batch_size,
338            circuit_breaker: self.circuit_breaker.clone(),
339        }
340    }
341}
342
343// =============================================================================
344// Portal Configuration (portals.toml)
345// =============================================================================
346
/// Portal type identifier.
///
/// Determines which portal API client to use for harvesting.
///
/// With `rename_all = "lowercase"` the serde representation is the lowercase
/// variant name (`"ckan"`, `"socrata"`, `"dcat"`), the same spelling the
/// `Display` and `FromStr` impls in this module use.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum PortalType {
    /// CKAN open data portal (default).
    #[default]
    Ckan,
    /// Socrata open data portal (US cities: NYC, Chicago, SF).
    Socrata,
    /// DCAT-AP / SPARQL endpoint (EU portals, data.europa.eu).
    Dcat,
}
361
362impl fmt::Display for PortalType {
363    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
364        match self {
365            Self::Ckan => write!(f, "ckan"),
366            Self::Socrata => write!(f, "socrata"),
367            Self::Dcat => write!(f, "dcat"),
368        }
369    }
370}
371
372impl FromStr for PortalType {
373    type Err = AppError;
374
375    fn from_str(s: &str) -> Result<Self, Self::Err> {
376        match s.to_lowercase().as_str() {
377            "ckan" => Ok(Self::Ckan),
378            "socrata" => Ok(Self::Socrata),
379            "dcat" => Ok(Self::Dcat),
380            _ => Err(AppError::ConfigError(format!(
381                "Unknown portal type: '{}'. Valid options: ckan, socrata, dcat",
382                s
383            ))),
384        }
385    }
386}
387
/// Default enabled status when not specified in configuration.
///
/// Referenced by the `#[serde(default = "default_enabled")]` attribute on
/// `PortalEntry::enabled`, so portals are enabled unless they opt out.
fn default_enabled() -> bool {
    true
}
392
/// Root configuration structure for portals.toml.
///
/// This structure represents the entire configuration file containing
/// an array of portal definitions. Each `[[portals]]` table in the file
/// becomes one [`PortalEntry`].
///
/// # Example
///
/// ```toml
/// [[portals]]
/// name = "dati-gov-it"
/// url = "https://dati.gov.it"
/// type = "ckan"
/// description = "Italian national open data portal"
///
/// [[portals]]
/// name = "milano"
/// url = "https://dati.comune.milano.it"
/// enabled = true
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PortalsConfig {
    /// Array of portal configurations, in file order.
    pub portals: Vec<PortalEntry>,
}
417
418impl PortalsConfig {
419    /// Returns only enabled portals.
420    ///
421    /// Portals with `enabled = false` are excluded from batch harvesting.
422    pub fn enabled_portals(&self) -> Vec<&PortalEntry> {
423        self.portals.iter().filter(|p| p.enabled).collect()
424    }
425
426    /// Find a portal by name (case-insensitive).
427    ///
428    /// # Arguments
429    /// * `name` - The portal name to search for.
430    ///
431    /// # Returns
432    /// The matching portal entry, or None if not found.
433    pub fn find_by_name(&self, name: &str) -> Option<&PortalEntry> {
434        self.portals
435            .iter()
436            .find(|p| p.name.eq_ignore_ascii_case(name))
437    }
438}
439
/// A single portal entry in the configuration file.
///
/// Each portal entry defines one open data portal to harvest (CKAN by
/// default; see [`PortalType`] for the other kinds), including its URL,
/// type, and whether it's enabled for batch harvesting.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PortalEntry {
    /// Human-readable portal name.
    ///
    /// Used for `--portal <name>` lookup and logging.
    pub name: String,

    /// Base URL of the portal.
    ///
    /// Example: "<https://dati.comune.milano.it>"
    pub url: String,

    /// Portal type: ckan, socrata, or dcat.
    ///
    /// Read from the `type` key; defaults to `Ckan` if not specified.
    #[serde(rename = "type", default)]
    pub portal_type: PortalType,

    /// Whether this portal is enabled for batch harvesting.
    ///
    /// Defaults to `true` if not specified.
    #[serde(default = "default_enabled")]
    pub enabled: bool,

    /// Optional description of the portal.
    pub description: Option<String>,

    /// Optional URL template for dataset landing pages.
    ///
    /// Supports placeholders:
    /// - `{id}` — dataset UUID from the CKAN API
    /// - `{name}` — dataset slug/name
    ///
    /// If not set, defaults to `{portal_url}/dataset/{name}`.
    pub url_template: Option<String>,

    /// Preferred language for multilingual portals (e.g., `"en"`, `"de"`, `"fr"`).
    ///
    /// Some portals return title and description as language-keyed objects.
    /// This field controls which language is selected when resolving those fields.
    /// Defaults to `"en"` when not specified; the `language()` accessor
    /// applies that fallback.
    pub language: Option<String>,

    /// Optional DCAT profile for sub-dispatch.
    ///
    /// Only meaningful when `type = "dcat"`. Values:
    /// - `"udata_rest"` (default when omitted): udata REST JSON-LD catalog endpoint
    /// - `"sparql"`: SPARQL endpoint returning DCAT-AP metadata
    #[serde(default)]
    pub profile: Option<String>,

    /// Optional custom SPARQL endpoint URL.
    ///
    /// Only meaningful when `profile = "sparql"`. Overrides the default
    /// convention of `{url}/sparql`. Use this when the SPARQL endpoint
    /// lives on a different domain than the portal (e.g., Norway's
    /// `data.norge.no` portal uses `sparql.fellesdatakatalog.digdir.no`).
    #[serde(default)]
    pub sparql_endpoint: Option<String>,
}
504
505impl PortalEntry {
506    /// Returns the preferred language, defaulting to `"en"`.
507    pub fn language(&self) -> &str {
508        self.language.as_deref().unwrap_or("en")
509    }
510
511    /// Returns the DCAT profile, if set.
512    pub fn profile(&self) -> Option<&str> {
513        self.profile.as_deref()
514    }
515
516    /// Returns the custom SPARQL endpoint URL, if set.
517    pub fn sparql_endpoint(&self) -> Option<&str> {
518        self.sparql_endpoint.as_deref()
519    }
520}
521
/// Default configuration file name.
///
/// Joined onto the default configuration directory by `default_config_path`.
pub const CONFIG_FILE_NAME: &str = "portals.toml";
524
525/// Returns the default configuration directory path.
526///
527/// Uses XDG Base Directory specification: `~/.config/ceres/`
528pub fn default_config_dir() -> Option<PathBuf> {
529    dirs::config_dir().map(|p| p.join("ceres"))
530}
531
532/// Returns the default configuration file path.
533///
534/// Path: `~/.config/ceres/portals.toml`
535pub fn default_config_path() -> Option<PathBuf> {
536    default_config_dir().map(|p| p.join(CONFIG_FILE_NAME))
537}
538
/// Default template content for a new portals.toml file.
///
/// Includes pre-configured Italian open data portals so users can
/// immediately run `ceres harvest` without manual configuration.
///
/// `load_portals_config` reads the file back right after writing this
/// template, so the content must stay parseable as a [`PortalsConfig`].
const DEFAULT_CONFIG_TEMPLATE: &str = r#"# Ceres Portal Configuration
#
# Usage:
#   ceres harvest                 # Harvest all enabled portals
#   ceres harvest --portal milano # Harvest specific portal by name
#   ceres harvest https://...     # Harvest single URL (ignores this file)
#
# Set enabled = false to skip a portal during batch harvest.
# Use url_template for portals with non-standard frontends:
#   url_template = "https://example.com/dataset?id={id}"
#   Placeholders: {id} = dataset UUID, {name} = dataset slug

# City of Milan open data
[[portals]]
name = "milano"
url = "https://dati.comune.milano.it"
type = "ckan"
description = "Open data del Comune di Milano"

# Sicily Region open data
[[portals]]
name = "sicilia"
url = "https://dati.regione.sicilia.it"
type = "ckan"
description = "Open data della Regione Siciliana"
"#;
569
570/// Load portal configuration from a TOML file.
571///
572/// # Arguments
573/// * `path` - Optional custom path. If `None`, uses default XDG path.
574///
575/// # Returns
576/// * `Ok(Some(config))` - Configuration loaded successfully
577/// * `Ok(None)` - No configuration file found (not an error for backward compatibility)
578/// * `Err(e)` - Configuration file exists but is invalid
579///
580/// # Behavior
581/// If no configuration file exists at the default path, a template file
582/// is automatically created to help users get started.
583pub fn load_portals_config(path: Option<PathBuf>) -> Result<Option<PortalsConfig>, AppError> {
584    let using_default_path = path.is_none();
585    let config_path = match path {
586        Some(p) => p,
587        None => match default_config_path() {
588            Some(p) => p,
589            None => return Ok(None),
590        },
591    };
592
593    if !config_path.exists() {
594        // Auto-create template if using default path
595        if using_default_path {
596            match create_default_config(&config_path) {
597                Ok(()) => {
598                    // Template created successfully - read it and return the config
599                    // This allows the user to immediately harvest without re-running
600                    tracing::info!(
601                        "Config file created at {}. Starting harvest with default portals...",
602                        config_path.display()
603                    );
604                    // Continue to read the newly created file below
605                }
606                Err(e) => {
607                    // Log warning but don't fail - user might not have write permissions
608                    tracing::warn!("Could not create default config template: {}", e);
609                    return Ok(None);
610                }
611            }
612        } else {
613            // Custom path specified but doesn't exist - that's an error
614            return Err(AppError::ConfigError(format!(
615                "Config file not found: {}",
616                config_path.display()
617            )));
618        }
619    }
620
621    let content = std::fs::read_to_string(&config_path).map_err(|e| {
622        AppError::ConfigError(format!(
623            "Failed to read config file '{}': {}",
624            config_path.display(),
625            e
626        ))
627    })?;
628
629    let config: PortalsConfig = toml::from_str(&content).map_err(|e| {
630        AppError::ConfigError(format!(
631            "Invalid TOML in '{}': {}",
632            config_path.display(),
633            e
634        ))
635    })?;
636
637    Ok(Some(config))
638}
639
640/// Create a default configuration file with a template.
641///
642/// Creates the parent directory if it doesn't exist.
643///
644/// # Arguments
645/// * `path` - The path where the config file should be created.
646fn create_default_config(path: &Path) -> std::io::Result<()> {
647    // Create parent directory if needed
648    if let Some(parent) = path.parent() {
649        std::fs::create_dir_all(parent)?;
650    }
651
652    std::fs::write(path, DEFAULT_CONFIG_TEMPLATE)?;
653    tracing::info!("Created default config template at: {}", path.display());
654
655    Ok(())
656}
657
658#[cfg(test)]
659mod tests {
660    use super::*;
661
662    #[test]
663    fn test_db_config_defaults() {
664        let config = DbConfig::default();
665        assert_eq!(config.max_connections, 5);
666    }
667
668    #[test]
669    fn test_http_config_defaults() {
670        let config = HttpConfig::default();
671        assert_eq!(config.timeout, Duration::from_secs(30));
672        assert_eq!(config.max_retries, 3);
673        assert_eq!(config.retry_base_delay, Duration::from_millis(500));
674    }
675
676    #[test]
677    fn test_sync_config_defaults() {
678        let config = SyncConfig::default();
679        assert_eq!(config.concurrency, 10);
680        assert_eq!(config.upsert_batch_size, 500);
681    }
682
683    #[test]
684    fn test_sync_config_harvest_config_upsert_batch_size() {
685        let config = SyncConfig::default();
686        let harvest = config.harvest_config();
687        assert_eq!(harvest.upsert_batch_size, 500);
688        assert_ne!(harvest.upsert_batch_size, config.embedding_batch_size);
689    }
690
691    // =========================================================================
692    // Portal Configuration Tests
693    // =========================================================================
694
695    #[test]
696    fn test_portals_config_deserialize() {
697        let toml = r#"
698[[portals]]
699name = "test-portal"
700url = "https://example.com"
701type = "ckan"
702"#;
703        let config: PortalsConfig = toml::from_str(toml).unwrap();
704        assert_eq!(config.portals.len(), 1);
705        assert_eq!(config.portals[0].name, "test-portal");
706        assert_eq!(config.portals[0].url, "https://example.com");
707        assert_eq!(config.portals[0].portal_type, PortalType::Ckan);
708        assert!(config.portals[0].enabled); // default
709        assert!(config.portals[0].description.is_none());
710    }
711
712    #[test]
713    fn test_portals_config_defaults() {
714        let toml = r#"
715[[portals]]
716name = "minimal"
717url = "https://example.com"
718"#;
719        let config: PortalsConfig = toml::from_str(toml).unwrap();
720        assert_eq!(config.portals[0].portal_type, PortalType::Ckan); // default type
721        assert!(config.portals[0].enabled); // default enabled
722    }
723
724    #[test]
725    fn test_portals_config_enabled_filter() {
726        let toml = r#"
727[[portals]]
728name = "enabled-portal"
729url = "https://a.com"
730
731[[portals]]
732name = "disabled-portal"
733url = "https://b.com"
734enabled = false
735"#;
736        let config: PortalsConfig = toml::from_str(toml).unwrap();
737        let enabled = config.enabled_portals();
738        assert_eq!(enabled.len(), 1);
739        assert_eq!(enabled[0].name, "enabled-portal");
740    }
741
742    #[test]
743    fn test_portals_config_find_by_name() {
744        let toml = r#"
745[[portals]]
746name = "Milano"
747url = "https://dati.comune.milano.it"
748"#;
749        let config: PortalsConfig = toml::from_str(toml).unwrap();
750
751        // Case-insensitive search
752        assert!(config.find_by_name("milano").is_some());
753        assert!(config.find_by_name("MILANO").is_some());
754        assert!(config.find_by_name("Milano").is_some());
755
756        // Not found
757        assert!(config.find_by_name("roma").is_none());
758    }
759
760    #[test]
761    fn test_portals_config_with_description() {
762        let toml = r#"
763[[portals]]
764name = "test"
765url = "https://example.com"
766description = "A test portal"
767"#;
768        let config: PortalsConfig = toml::from_str(toml).unwrap();
769        assert_eq!(
770            config.portals[0].description,
771            Some("A test portal".to_string())
772        );
773    }
774
775    #[test]
776    fn test_portals_config_multiple_portals() {
777        let toml = r#"
778[[portals]]
779name = "portal-1"
780url = "https://a.com"
781
782[[portals]]
783name = "portal-2"
784url = "https://b.com"
785
786[[portals]]
787name = "portal-3"
788url = "https://c.com"
789enabled = false
790"#;
791        let config: PortalsConfig = toml::from_str(toml).unwrap();
792        assert_eq!(config.portals.len(), 3);
793        assert_eq!(config.enabled_portals().len(), 2);
794    }
795
796    #[test]
797    fn test_default_config_path() {
798        // This test just verifies the function doesn't panic
799        // Actual path depends on the platform
800        let path = default_config_path();
801        if let Some(p) = path {
802            assert!(p.ends_with("portals.toml"));
803        }
804    }
805
806    // =========================================================================
807    // load_portals_config() tests with real files
808    // =========================================================================
809
810    use std::io::Write;
811    use tempfile::NamedTempFile;
812
813    #[test]
814    fn test_load_portals_config_valid_file() {
815        let mut file = NamedTempFile::new().unwrap();
816        writeln!(
817            file,
818            r#"
819[[portals]]
820name = "test"
821url = "https://test.com"
822"#
823        )
824        .unwrap();
825
826        let config = load_portals_config(Some(file.path().to_path_buf()))
827            .unwrap()
828            .unwrap();
829
830        assert_eq!(config.portals.len(), 1);
831        assert_eq!(config.portals[0].name, "test");
832        assert_eq!(config.portals[0].url, "https://test.com");
833    }
834
835    #[test]
836    fn test_load_portals_config_custom_path_not_found() {
837        let result = load_portals_config(Some("/nonexistent/path/to/config.toml".into()));
838        assert!(result.is_err());
839        let err = result.unwrap_err();
840        assert!(matches!(err, AppError::ConfigError(_)));
841    }
842
843    #[test]
844    fn test_load_portals_config_invalid_toml() {
845        let mut file = NamedTempFile::new().unwrap();
846        writeln!(file, "this is not valid toml {{{{").unwrap();
847
848        let result = load_portals_config(Some(file.path().to_path_buf()));
849        assert!(result.is_err());
850        let err = result.unwrap_err();
851        assert!(matches!(err, AppError::ConfigError(_)));
852    }
853
854    #[test]
855    fn test_load_portals_config_multiple_portals_with_enabled_filter() {
856        let mut file = NamedTempFile::new().unwrap();
857        writeln!(
858            file,
859            r#"
860[[portals]]
861name = "enabled-portal"
862url = "https://a.com"
863
864[[portals]]
865name = "disabled-portal"
866url = "https://b.com"
867enabled = false
868
869[[portals]]
870name = "another-enabled"
871url = "https://c.com"
872enabled = true
873"#
874        )
875        .unwrap();
876
877        let config = load_portals_config(Some(file.path().to_path_buf()))
878            .unwrap()
879            .unwrap();
880
881        assert_eq!(config.portals.len(), 3);
882        assert_eq!(config.enabled_portals().len(), 2);
883    }
884
885    #[test]
886    fn test_load_portals_config_with_all_fields() {
887        let mut file = NamedTempFile::new().unwrap();
888        writeln!(
889            file,
890            r#"
891[[portals]]
892name = "full-config"
893url = "https://example.com"
894type = "ckan"
895enabled = true
896description = "A fully configured portal"
897"#
898        )
899        .unwrap();
900
901        let config = load_portals_config(Some(file.path().to_path_buf()))
902            .unwrap()
903            .unwrap();
904
905        let portal = &config.portals[0];
906        assert_eq!(portal.name, "full-config");
907        assert_eq!(portal.url, "https://example.com");
908        assert_eq!(portal.portal_type, PortalType::Ckan);
909        assert!(portal.enabled);
910        assert_eq!(
911            portal.description,
912            Some("A fully configured portal".to_string())
913        );
914    }
915
916    #[test]
917    fn test_portals_config_dcat_profile() {
918        let toml = r#"
919[[portals]]
920name = "eu-sparql"
921url = "https://data.europa.eu"
922type = "dcat"
923profile = "sparql"
924language = "en"
925"#;
926        let config: PortalsConfig = toml::from_str(toml).unwrap();
927        let portal = &config.portals[0];
928        assert_eq!(portal.portal_type, PortalType::Dcat);
929        assert_eq!(portal.profile(), Some("sparql"));
930        assert_eq!(portal.language(), "en");
931    }
932
933    #[test]
934    fn test_portals_config_profile_defaults_none() {
935        let toml = r#"
936[[portals]]
937name = "luxembourg"
938url = "https://data.public.lu"
939type = "dcat"
940"#;
941        let config: PortalsConfig = toml::from_str(toml).unwrap();
942        assert_eq!(config.portals[0].profile(), None);
943    }
944
945    #[test]
946    fn test_load_portals_config_empty_portals_array() {
947        let mut file = NamedTempFile::new().unwrap();
948        writeln!(file, "portals = []").unwrap();
949
950        let config = load_portals_config(Some(file.path().to_path_buf()))
951            .unwrap()
952            .unwrap();
953
954        assert!(config.portals.is_empty());
955        assert!(config.enabled_portals().is_empty());
956    }
957
958    // =========================================================================
959    // Embedding Provider Configuration Tests
960    // =========================================================================
961
962    #[test]
963    fn test_embedding_provider_type_from_str() {
964        assert_eq!(
965            "gemini".parse::<EmbeddingProviderType>().unwrap(),
966            EmbeddingProviderType::Gemini
967        );
968        assert_eq!(
969            "openai".parse::<EmbeddingProviderType>().unwrap(),
970            EmbeddingProviderType::OpenAI
971        );
972        assert_eq!(
973            "GEMINI".parse::<EmbeddingProviderType>().unwrap(),
974            EmbeddingProviderType::Gemini
975        );
976        assert_eq!(
977            "OpenAI".parse::<EmbeddingProviderType>().unwrap(),
978            EmbeddingProviderType::OpenAI
979        );
980        assert_eq!(
981            "ollama".parse::<EmbeddingProviderType>().unwrap(),
982            EmbeddingProviderType::Ollama
983        );
984        assert_eq!(
985            "OLLAMA".parse::<EmbeddingProviderType>().unwrap(),
986            EmbeddingProviderType::Ollama
987        );
988    }
989
990    #[test]
991    fn test_embedding_provider_type_invalid() {
992        let result = "invalid".parse::<EmbeddingProviderType>();
993        assert!(result.is_err());
994    }
995
996    #[test]
997    fn test_embedding_provider_type_display() {
998        assert_eq!(EmbeddingProviderType::Gemini.to_string(), "gemini");
999        assert_eq!(EmbeddingProviderType::OpenAI.to_string(), "openai");
1000        assert_eq!(EmbeddingProviderType::Ollama.to_string(), "ollama");
1001    }
1002
1003    #[test]
1004    fn test_embedding_dimension() {
1005        // Gemini is always 768
1006        assert_eq!(
1007            embedding_dimension(EmbeddingProviderType::Gemini, None),
1008            768
1009        );
1010        assert_eq!(
1011            embedding_dimension(EmbeddingProviderType::Gemini, Some("gemini-embedding-001")),
1012            768
1013        );
1014
1015        // OpenAI defaults to 1536
1016        assert_eq!(
1017            embedding_dimension(EmbeddingProviderType::OpenAI, None),
1018            1536
1019        );
1020        assert_eq!(
1021            embedding_dimension(
1022                EmbeddingProviderType::OpenAI,
1023                Some("text-embedding-3-small")
1024            ),
1025            1536
1026        );
1027        assert_eq!(
1028            embedding_dimension(
1029                EmbeddingProviderType::OpenAI,
1030                Some("text-embedding-3-large")
1031            ),
1032            3072
1033        );
1034    }
1035
1036    #[test]
1037    fn test_gemini_embedding_config_default() {
1038        let config = GeminiEmbeddingConfig::default();
1039        assert_eq!(config.model, "gemini-embedding-001");
1040    }
1041
1042    #[test]
1043    fn test_openai_embedding_config_default() {
1044        let config = OpenAIEmbeddingConfig::default();
1045        assert_eq!(config.model, "text-embedding-3-small");
1046        assert!(config.endpoint.is_none());
1047    }
1048
1049    #[test]
1050    fn test_ollama_embedding_config_default() {
1051        let config = OllamaEmbeddingConfig::default();
1052        assert_eq!(config.model, "nomic-embed-text");
1053        assert_eq!(config.endpoint, "http://localhost:11434");
1054    }
1055
1056    #[test]
1057    fn test_embedding_dimension_ollama() {
1058        assert_eq!(
1059            embedding_dimension(EmbeddingProviderType::Ollama, None),
1060            768
1061        );
1062        assert_eq!(
1063            embedding_dimension(EmbeddingProviderType::Ollama, Some("nomic-embed-text")),
1064            768
1065        );
1066        assert_eq!(
1067            embedding_dimension(EmbeddingProviderType::Ollama, Some("mxbai-embed-large")),
1068            1024
1069        );
1070        assert_eq!(
1071            embedding_dimension(EmbeddingProviderType::Ollama, Some("all-minilm")),
1072            384
1073        );
1074        assert_eq!(
1075            embedding_dimension(EmbeddingProviderType::Ollama, Some("unknown-model")),
1076            768
1077        );
1078        // Tagged model identifiers
1079        assert_eq!(
1080            embedding_dimension(
1081                EmbeddingProviderType::Ollama,
1082                Some("snowflake-arctic-embed:335m")
1083            ),
1084            1024
1085        );
1086        assert_eq!(
1087            embedding_dimension(
1088                EmbeddingProviderType::Ollama,
1089                Some("nomic-embed-text:latest")
1090            ),
1091            768
1092        );
1093    }
1094}