ceres_core/
config.rs

1//! Configuration types for Ceres components.
2//!
3//! # Configuration Improvements
4//!
5//! TODO(config): Make all configuration values environment-configurable
6//! Currently all defaults are hardcoded. Should support:
7//! - `DB_MAX_CONNECTIONS` for database pool size
8//! - `SYNC_CONCURRENCY` for parallel dataset processing
9//! - `HTTP_TIMEOUT` for API request timeout
10//! - `HTTP_MAX_RETRIES` for retry attempts
11//!
12//! Consider using the `config` crate for layered configuration:
13//! defaults -> config file -> environment variables -> CLI args
14
15use serde::{Deserialize, Serialize};
16use std::path::{Path, PathBuf};
17use std::time::Duration;
18
19use crate::error::AppError;
20
/// Database connection pool configuration.
///
/// TODO(config): Support environment variable `DB_MAX_CONNECTIONS`.
/// The default of 5 may be insufficient for high-concurrency scenarios.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DbConfig {
    /// Maximum number of connections the database pool may hold.
    pub max_connections: u32,
}

impl Default for DbConfig {
    /// Returns the built-in default: at most 5 connections.
    fn default() -> Self {
        // TODO(config): Read from DB_MAX_CONNECTIONS env var
        Self { max_connections: 5 }
    }
}
35
/// HTTP client configuration for external API calls.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HttpConfig {
    /// Timeout for an API request.
    pub timeout: Duration,
    /// Number of retry attempts for a request.
    pub max_retries: u32,
    /// Base delay between retry attempts.
    ///
    /// NOTE(review): whether this delay grows per attempt is decided by the
    /// code consuming this config — confirm against the HTTP client.
    pub retry_base_delay: Duration,
}

impl Default for HttpConfig {
    /// Returns the built-in defaults: 30 s timeout, 3 retries, 500 ms base delay.
    fn default() -> Self {
        Self {
            timeout: Duration::from_secs(30),
            max_retries: 3,
            retry_base_delay: Duration::from_millis(500),
        }
    }
}
52
/// Portal synchronization configuration.
///
/// TODO(config): Support CLI arg `--concurrency` and env var `SYNC_CONCURRENCY`.
/// The optimal value depends on portal rate limits and system resources.
/// Consider auto-tuning based on API response times.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SyncConfig {
    /// Number of datasets processed in parallel.
    pub concurrency: usize,
}

impl Default for SyncConfig {
    /// Returns the built-in default: a concurrency of 10.
    fn default() -> Self {
        // TODO(config): Read from SYNC_CONCURRENCY env var
        Self { concurrency: 10 }
    }
}
68
69// =============================================================================
70// Portal Configuration (portals.toml)
71// =============================================================================
72
/// Serde fallback for a portal's `type` key: yields `"ckan"`.
fn default_portal_type() -> String {
    String::from("ckan")
}
77
/// Serde fallback for a portal's `enabled` key: yields `true`.
fn default_enabled() -> bool {
    // A portal entry that omits `enabled` is treated as enabled.
    const ENABLED_WHEN_UNSPECIFIED: bool = true;
    ENABLED_WHEN_UNSPECIFIED
}
82
83/// Root configuration structure for portals.toml.
84///
85/// This structure represents the entire configuration file containing
86/// an array of portal definitions.
87///
88/// # Example
89///
90/// ```toml
91/// [[portals]]
92/// name = "dati-gov-it"
93/// url = "https://dati.gov.it"
94/// type = "ckan"
95/// description = "Italian national open data portal"
96///
97/// [[portals]]
98/// name = "milano"
99/// url = "https://dati.comune.milano.it"
100/// enabled = true
101/// ```
102#[derive(Debug, Clone, Serialize, Deserialize)]
103pub struct PortalsConfig {
104    /// Array of portal configurations.
105    pub portals: Vec<PortalEntry>,
106}
107
108impl PortalsConfig {
109    /// Returns only enabled portals.
110    ///
111    /// Portals with `enabled = false` are excluded from batch harvesting.
112    pub fn enabled_portals(&self) -> Vec<&PortalEntry> {
113        self.portals.iter().filter(|p| p.enabled).collect()
114    }
115
116    /// Find a portal by name (case-insensitive).
117    ///
118    /// # Arguments
119    /// * `name` - The portal name to search for.
120    ///
121    /// # Returns
122    /// The matching portal entry, or None if not found.
123    pub fn find_by_name(&self, name: &str) -> Option<&PortalEntry> {
124        self.portals
125            .iter()
126            .find(|p| p.name.eq_ignore_ascii_case(name))
127    }
128}
129
130/// A single portal entry in the configuration file.
131///
132/// Each portal entry defines a CKAN portal to harvest, including
133/// its URL, type, and whether it's enabled for batch harvesting.
134#[derive(Debug, Clone, Serialize, Deserialize)]
135pub struct PortalEntry {
136    /// Human-readable portal name.
137    ///
138    /// Used for `--portal <name>` lookup and logging.
139    pub name: String,
140
141    /// Base URL of the CKAN portal.
142    ///
143    /// Example: "https://dati.comune.milano.it"
144    pub url: String,
145
146    /// Portal type: "ckan", "socrata", or "dcat".
147    ///
148    /// Defaults to "ckan" if not specified.
149    #[serde(rename = "type", default = "default_portal_type")]
150    pub portal_type: String,
151
152    /// Whether this portal is enabled for batch harvesting.
153    ///
154    /// Defaults to `true` if not specified.
155    #[serde(default = "default_enabled")]
156    pub enabled: bool,
157
158    /// Optional description of the portal.
159    pub description: Option<String>,
160}
161
/// File name of the portal configuration inside the configuration directory.
pub const CONFIG_FILE_NAME: &str = "portals.toml";
164
165/// Returns the default configuration directory path.
166///
167/// Uses XDG Base Directory specification: `~/.config/ceres/`
168pub fn default_config_dir() -> Option<PathBuf> {
169    dirs::config_dir().map(|p| p.join("ceres"))
170}
171
172/// Returns the default configuration file path.
173///
174/// Path: `~/.config/ceres/portals.toml`
175pub fn default_config_path() -> Option<PathBuf> {
176    default_config_dir().map(|p| p.join(CONFIG_FILE_NAME))
177}
178
/// Contents written to a freshly created `portals.toml`.
///
/// Ships with two Italian open data portals (Milan and Sicily) already
/// configured, so `ceres harvest` has something to work on before the
/// user edits the file.
const DEFAULT_CONFIG_TEMPLATE: &str = r#"# Ceres Portal Configuration
#
# Usage:
#   ceres harvest                 # Harvest all enabled portals
#   ceres harvest --portal milano # Harvest specific portal by name
#   ceres harvest https://...     # Harvest single URL (ignores this file)
#
# Set enabled = false to skip a portal during batch harvest.

# City of Milan open data
[[portals]]
name = "milano"
url = "https://dati.comune.milano.it"
type = "ckan"
description = "Open data del Comune di Milano"

# Sicily Region open data
[[portals]]
name = "sicilia"
url = "https://dati.regione.sicilia.it"
type = "ckan"
description = "Open data della Regione Siciliana"
"#;
206
207/// Load portal configuration from a TOML file.
208///
209/// # Arguments
210/// * `path` - Optional custom path. If `None`, uses default XDG path.
211///
212/// # Returns
213/// * `Ok(Some(config))` - Configuration loaded successfully
214/// * `Ok(None)` - No configuration file found (not an error for backward compatibility)
215/// * `Err(e)` - Configuration file exists but is invalid
216///
217/// # Behavior
218/// If no configuration file exists at the default path, a template file
219/// is automatically created to help users get started.
220pub fn load_portals_config(path: Option<PathBuf>) -> Result<Option<PortalsConfig>, AppError> {
221    let using_default_path = path.is_none();
222    let config_path = match path {
223        Some(p) => p,
224        None => match default_config_path() {
225            Some(p) => p,
226            None => return Ok(None),
227        },
228    };
229
230    if !config_path.exists() {
231        // Auto-create template if using default path
232        if using_default_path {
233            match create_default_config(&config_path) {
234                Ok(()) => {
235                    // Template created successfully - read it and return the config
236                    // This allows the user to immediately harvest without re-running
237                    tracing::info!(
238                        "Config file created at {}. Starting harvest with default portals...",
239                        config_path.display()
240                    );
241                    // Continue to read the newly created file below
242                }
243                Err(e) => {
244                    // Log warning but don't fail - user might not have write permissions
245                    tracing::warn!("Could not create default config template: {}", e);
246                    return Ok(None);
247                }
248            }
249        } else {
250            // Custom path specified but doesn't exist - that's an error
251            return Err(AppError::ConfigError(format!(
252                "Config file not found: {}",
253                config_path.display()
254            )));
255        }
256    }
257
258    let content = std::fs::read_to_string(&config_path).map_err(|e| {
259        AppError::ConfigError(format!(
260            "Failed to read config file '{}': {}",
261            config_path.display(),
262            e
263        ))
264    })?;
265
266    let config: PortalsConfig = toml::from_str(&content).map_err(|e| {
267        AppError::ConfigError(format!(
268            "Invalid TOML in '{}': {}",
269            config_path.display(),
270            e
271        ))
272    })?;
273
274    Ok(Some(config))
275}
276
277/// Create a default configuration file with a template.
278///
279/// Creates the parent directory if it doesn't exist.
280///
281/// # Arguments
282/// * `path` - The path where the config file should be created.
283fn create_default_config(path: &Path) -> std::io::Result<()> {
284    // Create parent directory if needed
285    if let Some(parent) = path.parent() {
286        std::fs::create_dir_all(parent)?;
287    }
288
289    std::fs::write(path, DEFAULT_CONFIG_TEMPLATE)?;
290    tracing::info!("Created default config template at: {}", path.display());
291
292    Ok(())
293}
294
#[cfg(test)]
mod tests {
    use super::*;

    use std::io::Write;
    use tempfile::NamedTempFile;

    // =========================================================================
    // Shared helpers
    // =========================================================================

    /// Parses an inline TOML snippet, panicking if it is not a valid portals config.
    fn parse(snippet: &str) -> PortalsConfig {
        toml::from_str(snippet).unwrap()
    }

    /// Writes `contents` followed by a newline into a new temporary file.
    ///
    /// The handle is returned so the caller controls how long the file lives.
    fn temp_config(contents: &str) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        writeln!(file, "{}", contents).unwrap();
        file
    }

    /// Runs `load_portals_config` on `file` and unwraps the loaded config.
    fn load_from(file: &NamedTempFile) -> PortalsConfig {
        load_portals_config(Some(file.path().to_path_buf()))
            .unwrap()
            .unwrap()
    }

    // =========================================================================
    // Component defaults
    // =========================================================================

    #[test]
    fn test_db_config_defaults() {
        assert_eq!(DbConfig::default().max_connections, 5);
    }

    #[test]
    fn test_http_config_defaults() {
        let HttpConfig {
            timeout,
            max_retries,
            retry_base_delay,
        } = HttpConfig::default();

        assert_eq!(timeout, Duration::from_secs(30));
        assert_eq!(max_retries, 3);
        assert_eq!(retry_base_delay, Duration::from_millis(500));
    }

    #[test]
    fn test_sync_config_defaults() {
        assert_eq!(SyncConfig::default().concurrency, 10);
    }

    // =========================================================================
    // Portal configuration: in-memory parsing
    // =========================================================================

    #[test]
    fn test_portals_config_deserialize() {
        let parsed = parse(
            r#"
[[portals]]
name = "test-portal"
url = "https://example.com"
type = "ckan"
"#,
        );

        assert_eq!(parsed.portals.len(), 1);
        let portal = &parsed.portals[0];
        assert_eq!(portal.name, "test-portal");
        assert_eq!(portal.url, "https://example.com");
        assert_eq!(portal.portal_type, "ckan");
        assert!(portal.enabled); // not given, so the default applies
        assert!(portal.description.is_none());
    }

    #[test]
    fn test_portals_config_defaults() {
        let parsed = parse(
            r#"
[[portals]]
name = "minimal"
url = "https://example.com"
"#,
        );

        let portal = &parsed.portals[0];
        assert_eq!(portal.portal_type, "ckan"); // default type
        assert!(portal.enabled); // default enabled
    }

    #[test]
    fn test_portals_config_enabled_filter() {
        let parsed = parse(
            r#"
[[portals]]
name = "enabled-portal"
url = "https://a.com"

[[portals]]
name = "disabled-portal"
url = "https://b.com"
enabled = false
"#,
        );

        let active = parsed.enabled_portals();
        assert_eq!(active.len(), 1);
        assert_eq!(active[0].name, "enabled-portal");
    }

    #[test]
    fn test_portals_config_find_by_name() {
        let parsed = parse(
            r#"
[[portals]]
name = "Milano"
url = "https://dati.comune.milano.it"
"#,
        );

        // Lookup ignores case.
        for spelling in ["milano", "MILANO", "Milano"] {
            assert!(parsed.find_by_name(spelling).is_some());
        }

        // Unknown names yield nothing.
        assert!(parsed.find_by_name("roma").is_none());
    }

    #[test]
    fn test_portals_config_with_description() {
        let parsed = parse(
            r#"
[[portals]]
name = "test"
url = "https://example.com"
description = "A test portal"
"#,
        );

        assert_eq!(
            parsed.portals[0].description,
            Some("A test portal".to_string())
        );
    }

    #[test]
    fn test_portals_config_multiple_portals() {
        let parsed = parse(
            r#"
[[portals]]
name = "portal-1"
url = "https://a.com"

[[portals]]
name = "portal-2"
url = "https://b.com"

[[portals]]
name = "portal-3"
url = "https://c.com"
enabled = false
"#,
        );

        assert_eq!(parsed.portals.len(), 3);
        assert_eq!(parsed.enabled_portals().len(), 2);
    }

    #[test]
    fn test_default_config_path() {
        // Only checks that the call does not panic and, when a path is
        // available, that it ends with the config file name. The directory
        // part depends on the platform.
        if let Some(resolved) = default_config_path() {
            assert!(resolved.ends_with("portals.toml"));
        }
    }

    // =========================================================================
    // load_portals_config() tests with real files
    // =========================================================================

    #[test]
    fn test_load_portals_config_valid_file() {
        let file = temp_config(
            r#"
[[portals]]
name = "test"
url = "https://test.com"
"#,
        );

        let loaded = load_from(&file);

        assert_eq!(loaded.portals.len(), 1);
        assert_eq!(loaded.portals[0].name, "test");
        assert_eq!(loaded.portals[0].url, "https://test.com");
    }

    #[test]
    fn test_load_portals_config_custom_path_not_found() {
        let outcome = load_portals_config(Some("/nonexistent/path/to/config.toml".into()));
        assert!(matches!(outcome, Err(AppError::ConfigError(_))));
    }

    #[test]
    fn test_load_portals_config_invalid_toml() {
        let file = temp_config("this is not valid toml {{");

        let outcome = load_portals_config(Some(file.path().to_path_buf()));
        assert!(matches!(outcome, Err(AppError::ConfigError(_))));
    }

    #[test]
    fn test_load_portals_config_multiple_portals_with_enabled_filter() {
        let file = temp_config(
            r#"
[[portals]]
name = "enabled-portal"
url = "https://a.com"

[[portals]]
name = "disabled-portal"
url = "https://b.com"
enabled = false

[[portals]]
name = "another-enabled"
url = "https://c.com"
enabled = true
"#,
        );

        let loaded = load_from(&file);

        assert_eq!(loaded.portals.len(), 3);
        assert_eq!(loaded.enabled_portals().len(), 2);
    }

    #[test]
    fn test_load_portals_config_with_all_fields() {
        let file = temp_config(
            r#"
[[portals]]
name = "full-config"
url = "https://example.com"
type = "ckan"
enabled = true
description = "A fully configured portal"
"#,
        );

        let loaded = load_from(&file);

        let portal = &loaded.portals[0];
        assert_eq!(portal.name, "full-config");
        assert_eq!(portal.url, "https://example.com");
        assert_eq!(portal.portal_type, "ckan");
        assert!(portal.enabled);
        assert_eq!(
            portal.description,
            Some("A fully configured portal".to_string())
        );
    }

    #[test]
    fn test_load_portals_config_empty_portals_array() {
        let file = temp_config("portals = []");

        let loaded = load_from(&file);

        assert!(loaded.portals.is_empty());
        assert!(loaded.enabled_portals().is_empty());
    }
}
556}