Skip to main content

dataprof_core/
config.rs

1use serde::{Deserialize, Serialize};
2use std::fs;
3use std::path::{Path, PathBuf};
4
5use crate::errors::DataProfilerError;
6
/// ISO 8000/25012 compliant quality thresholds.
///
/// These thresholds are configurable for industry-specific requirements.
/// Percentage-valued fields are expressed on a 0-100 scale (not 0-1);
/// `DataprofConfig::validate` rejects values outside that range.
/// The defaults quoted below are the values produced by `Default::default()`;
/// see also the `strict()` and `lenient()` presets.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IsoQualityConfig {
    /// Maximum acceptable null percentage for a column (default: 50%).
    pub max_null_percentage: f64,
    /// Threshold for reporting null value issues (default: 10%).
    pub null_report_threshold: f64,
    /// Minimum acceptable type consistency percentage (default: 95%).
    pub min_type_consistency: f64,
    /// Threshold for reporting duplicate rows (default: 5%).
    pub duplicate_report_threshold: f64,
    /// High cardinality warning threshold (uniqueness ratio, default: 95%).
    pub high_cardinality_threshold: f64,
    /// IQR multiplier for outlier detection (default: 1.5, ISO standard).
    /// Validation requires a strictly positive value.
    pub outlier_iqr_multiplier: f64,
    /// Minimum samples required for outlier detection (default: 4).
    pub outlier_min_samples: usize,
    /// Maximum age in years for data to be considered fresh (default: 5 years).
    /// Validation requires a non-negative value.
    pub max_data_age_years: f64,
    /// Percentage threshold for reporting stale data (default: 20%).
    pub stale_data_threshold: f64,
}
30
31impl Default for IsoQualityConfig {
32    fn default() -> Self {
33        Self {
34            max_null_percentage: 50.0,
35            null_report_threshold: 10.0,
36            min_type_consistency: 95.0,
37            duplicate_report_threshold: 5.0,
38            high_cardinality_threshold: 95.0,
39            outlier_iqr_multiplier: 1.5,
40            outlier_min_samples: 4,
41            max_data_age_years: 5.0,
42            stale_data_threshold: 20.0,
43        }
44    }
45}
46
47impl IsoQualityConfig {
48    /// Create strict thresholds for high-compliance industries (finance, healthcare).
49    pub fn strict() -> Self {
50        Self {
51            max_null_percentage: 30.0,
52            null_report_threshold: 5.0,
53            min_type_consistency: 98.0,
54            duplicate_report_threshold: 1.0,
55            high_cardinality_threshold: 98.0,
56            outlier_iqr_multiplier: 1.5,
57            outlier_min_samples: 10,
58            max_data_age_years: 2.0,
59            stale_data_threshold: 10.0,
60        }
61    }
62
63    /// Create lenient thresholds for exploratory/marketing data.
64    pub fn lenient() -> Self {
65        Self {
66            max_null_percentage: 70.0,
67            null_report_threshold: 20.0,
68            min_type_consistency: 90.0,
69            duplicate_report_threshold: 10.0,
70            high_cardinality_threshold: 90.0,
71            outlier_iqr_multiplier: 2.0,
72            outlier_min_samples: 4,
73            max_data_age_years: 10.0,
74            stale_data_threshold: 30.0,
75        }
76    }
77}
78
79// ============================================================================
80// Configuration Constants
81// ============================================================================
82// These constants define the default values for various configuration options.
83// Each constant is documented with its rationale to avoid "magic numbers".
84
// Output Configuration Defaults
/// Default output format for report serialization.
/// "text" provides human-readable output suitable for direct display.
/// Must stay one of the formats accepted by `DataprofConfig::validate`
/// ("text", "json", "csv", "plain").
const DEFAULT_OUTPUT_FORMAT: &str = "text";

/// Enable colored output by default.
/// Colors improve readability in modern terminals that support ANSI codes.
/// `DataprofConfig::apply_env_overrides` turns this off when `NO_COLOR` is set.
const DEFAULT_COLORED_OUTPUT: bool = true;

/// Default verbosity level (1 = normal).
/// 0=quiet, 1=normal, 2=verbose, 3=debug
/// Level 1 strikes a balance between informativeness and noise.
/// Values above 3 are rejected by `DataprofConfig::validate`.
const DEFAULT_VERBOSITY: u8 = 1;

/// Show progress bars by default.
/// Progress feedback is important for long-running operations.
const DEFAULT_SHOW_PROGRESS: bool = true;

// Quality Configuration Defaults
/// Enable quality checking by default.
/// Quality analysis is a core feature of DataProf.
/// When enabled, all ISO 8000/25012 quality metrics are calculated.
const DEFAULT_QUALITY_ENABLED: bool = true;

// Engine Configuration Defaults
/// Default engine selection: "auto".
/// Automatic engine selection adapts to file size and format.
/// Must stay one of the engines accepted by `DataprofConfig::validate`
/// ("auto", "streaming", "memory_efficient", "true_streaming").
const DEFAULT_ENGINE: &str = "auto";

/// Enable parallel processing by default.
/// Modern systems benefit from parallel processing for large datasets.
const DEFAULT_PARALLEL_PROCESSING: bool = true;

// Memory Configuration Defaults
/// Maximum memory usage in MB (0 = unlimited).
/// 0 allows the OS to manage memory, suitable for most use cases.
const DEFAULT_MAX_MEMORY_MB: usize = 0;

/// Enable memory monitoring by default.
/// Monitoring helps detect memory leaks and optimize performance.
const DEFAULT_MEMORY_MONITORING: bool = true;

/// Auto-switch to streaming for files larger than 100MB.
/// 100MB is a reasonable threshold where streaming provides clear benefits
/// over loading the entire file into memory.
/// Expressed in megabytes; validation rejects negative values.
const DEFAULT_AUTO_STREAMING_THRESHOLD_MB: f64 = 100.0;
131
// Database Configuration Defaults (when database feature is enabled)
// Each constant below is compiled only with the "database" cargo feature and
// feeds the `Default` impls of `DatabaseSettings` / `DatabaseSamplingConfig`.
#[cfg(feature = "database")]
/// Default connection timeout in seconds.
/// 30 seconds allows for slow networks while avoiding indefinite hangs.
/// Validation requires a value greater than 0.
const DEFAULT_DB_CONNECTION_TIMEOUT_SECS: u64 = 30;

#[cfg(feature = "database")]
/// Default batch size for database queries.
/// 10,000 rows per batch balances memory usage and query performance.
/// Validation requires a value greater than 0.
const DEFAULT_DB_BATCH_SIZE: usize = 10_000;

#[cfg(feature = "database")]
/// Maximum number of database connections in the pool.
/// 10 connections supports moderate concurrency without overwhelming the database.
/// Validation requires a value greater than 0.
const DEFAULT_DB_MAX_CONNECTIONS: usize = 10;

#[cfg(feature = "database")]
/// Enable SSL by default for security.
/// SSL should be the default for production database connections.
const DEFAULT_DB_SSL_ENABLED: bool = true;

#[cfg(feature = "database")]
/// Enable sampling for large database tables by default.
/// Sampling provides faster analysis for exploratory data profiling.
const DEFAULT_DB_SAMPLING_ENABLED: bool = true;

#[cfg(feature = "database")]
/// Default sample size for database tables.
/// 100,000 rows provides statistical significance for most analyses
/// while keeping query times reasonable.
/// Validation requires a value greater than 0.
const DEFAULT_DB_SAMPLE_SIZE: usize = 100_000;

#[cfg(feature = "database")]
/// Threshold for automatic sampling (number of rows).
/// Tables with over 1 million rows benefit significantly from sampling.
const DEFAULT_DB_AUTO_SAMPLE_THRESHOLD: usize = 1_000_000;
168
/// Main configuration structure for dataprof.
///
/// Serialized to and from TOML by `save_to_file` / `load_from_file`.
/// The derived `Default` delegates to each section's own `Default` impl.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct DataprofConfig {
    /// Output configuration
    pub output: OutputConfig,

    /// Quality checking configuration
    pub quality: QualityConfig,

    /// Engine selection and performance tuning
    pub engine: EngineConfig,

    /// Database configuration
    ///
    /// Only present when the "database" feature is compiled in. The derived
    /// `Default` leaves this as `None`; validation is skipped when it is `None`.
    #[cfg(feature = "database")]
    pub database: Option<DatabaseSettings>,
}
185
/// Settings that control how results are presented to the user.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OutputConfig {
    /// Default output format (text, json, csv, plain)
    ///
    /// These are the values accepted by `DataprofConfig::validate`; any other
    /// string (including "yaml") is rejected there.
    // NOTE(review): this comment previously listed "yaml" as well — confirm
    // whether yaml support is planned or the mention was stale.
    pub default_format: String,

    /// Enable colored output by default
    pub colored: bool,

    /// Default verbosity level (0=quiet, 1=normal, 2=verbose, 3=debug)
    ///
    /// Values above 3 fail validation.
    pub verbosity: u8,

    /// Show progress bars by default
    pub show_progress: bool,
}
200
/// Settings for the data-quality analysis stage.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityConfig {
    /// Enable quality checking by default
    pub enabled: bool,

    /// ISO 8000/25012 compliant thresholds
    ///
    /// All quality metrics (null detection, duplicate detection, type consistency,
    /// date format checking, outlier detection, etc.) are controlled through
    /// these ISO-compliant thresholds.
    ///
    /// Use `IsoQualityConfig::strict()` for high-compliance industries,
    /// `IsoQualityConfig::lenient()` for exploratory data, or customize
    /// individual thresholds as needed.
    ///
    /// Defaults to `IsoQualityConfig::default()`.
    pub iso_thresholds: IsoQualityConfig,
}
217
/// Settings for engine selection and performance tuning.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EngineConfig {
    /// Default engine selection.
    ///
    /// `DataprofConfig::validate` accepts "auto", "streaming",
    /// "memory_efficient" and "true_streaming".
    pub default_engine: String,

    /// Default chunk size for streaming operations
    ///
    /// `None` (the default) leaves the size unspecified. When set, validation
    /// requires a value between 1 and 1,000,000 inclusive.
    pub default_chunk_size: Option<usize>,

    /// Enable parallel processing by default
    pub parallel: bool,

    /// Maximum concurrent operations
    ///
    /// Defaults to `num_cpus::get()`; validation requires a value greater than 0.
    pub max_concurrent: usize,

    /// Memory usage limits
    pub memory: MemoryConfig,
}
235
/// Memory limits and monitoring settings for the engine.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryConfig {
    /// Maximum memory usage in MB (0 = unlimited)
    pub max_usage_mb: usize,

    /// Enable memory monitoring
    pub monitor: bool,

    /// Auto-switch to streaming for large files
    ///
    /// File-size threshold in megabytes (default: 100.0). Validation rejects
    /// negative values.
    pub auto_streaming_threshold_mb: f64,
}
247
/// Database connection settings (only compiled with the "database" feature).
#[cfg(feature = "database")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatabaseSettings {
    /// Default connection timeout in seconds
    ///
    /// Default: 30. Validation requires a value greater than 0.
    pub connection_timeout: u64,

    /// Default batch size for database queries
    ///
    /// Default: 10,000. Validation requires a value greater than 0.
    pub batch_size: usize,

    /// Maximum number of database connections
    ///
    /// Default: 10. Validation requires a value greater than 0.
    pub max_connections: usize,

    /// Enable SSL by default
    pub ssl_enabled: bool,

    /// Default sampling configuration
    pub sampling: DatabaseSamplingConfig,
}
266
/// Sampling settings for large database tables (only compiled with the
/// "database" feature).
#[cfg(feature = "database")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatabaseSamplingConfig {
    /// Enable sampling for large tables
    pub enabled: bool,

    /// Default sample size
    ///
    /// Default: 100,000 rows. Validation requires a value greater than 0.
    pub default_sample_size: usize,

    /// Threshold for automatic sampling (number of rows)
    ///
    /// Default: 1,000,000 rows.
    pub auto_sample_threshold: usize,
}
279
280impl Default for OutputConfig {
281    fn default() -> Self {
282        Self {
283            default_format: DEFAULT_OUTPUT_FORMAT.to_string(),
284            colored: DEFAULT_COLORED_OUTPUT,
285            verbosity: DEFAULT_VERBOSITY,
286            show_progress: DEFAULT_SHOW_PROGRESS,
287        }
288    }
289}
290
291impl Default for QualityConfig {
292    fn default() -> Self {
293        Self {
294            enabled: DEFAULT_QUALITY_ENABLED,
295            iso_thresholds: IsoQualityConfig::default(),
296        }
297    }
298}
299
300impl Default for EngineConfig {
301    fn default() -> Self {
302        Self {
303            default_engine: DEFAULT_ENGINE.to_string(),
304            default_chunk_size: None,
305            parallel: DEFAULT_PARALLEL_PROCESSING,
306            max_concurrent: num_cpus::get(),
307            memory: MemoryConfig::default(),
308        }
309    }
310}
311
312impl Default for MemoryConfig {
313    fn default() -> Self {
314        Self {
315            max_usage_mb: DEFAULT_MAX_MEMORY_MB,
316            monitor: DEFAULT_MEMORY_MONITORING,
317            auto_streaming_threshold_mb: DEFAULT_AUTO_STREAMING_THRESHOLD_MB,
318        }
319    }
320}
321
#[cfg(feature = "database")]
impl Default for DatabaseSettings {
    /// Assemble the database settings from the `DEFAULT_DB_*` constants plus
    /// the default sampling configuration.
    fn default() -> Self {
        let connection_timeout = DEFAULT_DB_CONNECTION_TIMEOUT_SECS;
        let batch_size = DEFAULT_DB_BATCH_SIZE;
        let max_connections = DEFAULT_DB_MAX_CONNECTIONS;
        let ssl_enabled = DEFAULT_DB_SSL_ENABLED;
        let sampling = DatabaseSamplingConfig::default();

        Self {
            connection_timeout,
            batch_size,
            max_connections,
            ssl_enabled,
            sampling,
        }
    }
}
334
#[cfg(feature = "database")]
impl Default for DatabaseSamplingConfig {
    /// Assemble the sampling settings from the `DEFAULT_DB_*` constants.
    fn default() -> Self {
        let enabled = DEFAULT_DB_SAMPLING_ENABLED;
        let default_sample_size = DEFAULT_DB_SAMPLE_SIZE;
        let auto_sample_threshold = DEFAULT_DB_AUTO_SAMPLE_THRESHOLD;

        Self {
            enabled,
            default_sample_size,
            auto_sample_threshold,
        }
    }
}
345
346impl DataprofConfig {
347    /// Load configuration from file, with fallback to default
348    pub fn load_from_file<P: AsRef<Path>>(path: P) -> Result<Self, DataProfilerError> {
349        let content = fs::read_to_string(path)?;
350        let config: DataprofConfig = toml::from_str(&content)?;
351        Ok(config)
352    }
353
354    /// Save configuration to file
355    pub fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<(), DataProfilerError> {
356        let content = toml::to_string_pretty(self)?;
357        fs::write(path, content)?;
358        Ok(())
359    }
360
361    /// Load configuration with automatic file discovery.
362    ///
363    /// Searches for configuration files in the following order:
364    /// 1. `.dataprof.toml` in current directory
365    /// 2. `$HOME/.config/dataprof/config.toml` (Linux/macOS)
366    /// 3. `%USERPROFILE%\.config\dataprof\config.toml` (Windows)
367    /// 4. `dataprof.toml` in current directory
368    ///
369    /// If no config file is found, returns default configuration with
370    /// environment variable overrides applied.
371    pub fn load_with_discovery() -> Self {
372        // Build list of config paths to try
373        let mut config_paths = vec![
374            // Current directory - highest priority
375            PathBuf::from(".dataprof.toml"),
376            PathBuf::from("dataprof.toml"),
377        ];
378
379        // User config directory - cross-platform
380        if let Some(home) = std::env::var_os("HOME").or_else(|| std::env::var_os("USERPROFILE")) {
381            let user_config = PathBuf::from(home)
382                .join(".config")
383                .join("dataprof")
384                .join("config.toml");
385            config_paths.push(user_config);
386        }
387
388        // Try loading from each path
389        for path in &config_paths {
390            if path.exists() {
391                match Self::load_from_file(path) {
392                    Ok(mut config) => {
393                        log::info!("Loaded configuration from: {}", path.display());
394                        // Apply environment overrides on top of file config
395                        config.apply_env_overrides();
396                        return config;
397                    }
398                    Err(e) => {
399                        log::warn!(
400                            "Found config file at {} but failed to load: {}",
401                            path.display(),
402                            e
403                        );
404                    }
405                }
406            }
407        }
408
409        // No config file found - use defaults
410        log::debug!("No configuration file found. Using defaults with environment overrides.");
411        let mut config = Self::default();
412        config.apply_env_overrides();
413        config
414    }
415
416    /// Apply environment variable overrides
417    pub fn apply_env_overrides(&mut self) {
418        // Output format override
419        if let Ok(format) = std::env::var("DATAPROF_FORMAT") {
420            self.output.default_format = format;
421        }
422
423        // Verbosity override
424        if let Ok(verbosity) = std::env::var("DATAPROF_VERBOSITY")
425            && let Ok(level) = verbosity.parse::<u8>()
426        {
427            self.output.verbosity = level;
428        }
429
430        // Engine override
431        if let Ok(engine) = std::env::var("DATAPROF_ENGINE") {
432            self.engine.default_engine = engine;
433        }
434
435        // Quality checking override
436        if let Ok(quality) = std::env::var("DATAPROF_QUALITY")
437            && let Ok(enabled) = quality.parse::<bool>()
438        {
439            self.quality.enabled = enabled;
440        }
441
442        // Disable colors if NO_COLOR is set
443        if std::env::var("NO_COLOR").is_ok() {
444            self.output.colored = false;
445        }
446
447        // Progress override
448        if let Ok(progress) = std::env::var("DATAPROF_PROGRESS")
449            && let Ok(enabled) = progress.parse::<bool>()
450        {
451            self.output.show_progress = enabled;
452        }
453    }
454
455    /// Apply explicit runtime overrides to configuration.
456    pub fn apply_overrides(
457        &mut self,
458        format: Option<&str>,
459        quality: Option<bool>,
460        progress: Option<bool>,
461    ) {
462        if let Some(format) = format {
463            self.output.default_format = format.to_string();
464        }
465
466        if let Some(quality) = quality {
467            self.quality.enabled = quality;
468        }
469
470        if let Some(progress) = progress {
471            self.output.show_progress = progress;
472        }
473    }
474
475    /// Deprecated compatibility wrapper for the old CLI-oriented name.
476    #[deprecated(note = "use DataprofConfig::apply_overrides instead")]
477    pub fn merge_with_cli_args(
478        &mut self,
479        format: Option<&str>,
480        quality: Option<bool>,
481        progress: Option<bool>,
482    ) {
483        self.apply_overrides(format, quality, progress);
484    }
485
486    /// Create a sample configuration file for users
487    pub fn create_sample_config<P: AsRef<Path>>(path: P) -> Result<(), DataProfilerError> {
488        let sample_config = Self::default();
489        sample_config.save_to_file(path)?;
490        Ok(())
491    }
492}
493
494/// Configuration validation with comprehensive error messages
495impl DataprofConfig {
496    /// Validate the configuration and return detailed error messages.
497    ///
498    /// This method checks:
499    /// - Output format validity
500    /// - Verbosity level range
501    /// - Quality threshold ranges
502    /// - ISO threshold consistency
503    /// - Engine validity
504    /// - Memory configuration sanity
505    /// - Chunk size validity
506    pub fn validate(&self) -> Result<(), DataProfilerError> {
507        // Validate output format
508        let valid_formats = ["text", "json", "csv", "plain"];
509        if !valid_formats.contains(&self.output.default_format.as_str()) {
510            return Err(DataProfilerError::ConfigValidationError {
511                message: format!(
512                    "Invalid output format '{}'. Valid formats: {}\n\
513                     → Fix: Set output.default_format to one of the valid formats in your config file.",
514                    self.output.default_format,
515                    valid_formats.join(", ")
516                ),
517            });
518        }
519
520        // Validate verbosity level
521        if self.output.verbosity > 3 {
522            return Err(DataProfilerError::ConfigValidationError {
523                message: format!(
524                    "Invalid verbosity level {}. Must be between 0 (quiet) and 3 (debug).\n\
525                     → Fix: Set output.verbosity to 0, 1, 2, or 3 in your config file.",
526                    self.output.verbosity
527                ),
528            });
529        }
530
531        // Validate ISO thresholds
532        let iso = &self.quality.iso_thresholds;
533
534        if iso.outlier_iqr_multiplier <= 0.0 {
535            return Err(DataProfilerError::ConfigValidationError {
536                message: format!(
537                    "IQR multiplier must be positive (standard value: 1.5), got {}.\n\
538                     → Fix: Set quality.iso_thresholds.outlier_iqr_multiplier to a positive value.\n\
539                     → Recommended: Use 1.5 (ISO standard) for normal cases, 3.0 for lenient detection.",
540                    iso.outlier_iqr_multiplier
541                ),
542            });
543        }
544
545        if iso.max_null_percentage < 0.0 || iso.max_null_percentage > 100.0 {
546            return Err(DataProfilerError::ConfigValidationError {
547                message: format!(
548                    "Max null percentage must be between 0 and 100, got {}.\n\
549                     → Fix: Set quality.iso_thresholds.max_null_percentage to a value between 0.0 and 100.0.",
550                    iso.max_null_percentage
551                ),
552            });
553        }
554
555        if iso.null_report_threshold < 0.0 || iso.null_report_threshold > 100.0 {
556            return Err(DataProfilerError::ConfigValidationError {
557                message: format!(
558                    "Null report threshold must be between 0 and 100, got {}.\n\
559                     → Fix: Set quality.iso_thresholds.null_report_threshold to a value between 0.0 and 100.0.",
560                    iso.null_report_threshold
561                ),
562            });
563        }
564
565        if iso.min_type_consistency < 0.0 || iso.min_type_consistency > 100.0 {
566            return Err(DataProfilerError::ConfigValidationError {
567                message: format!(
568                    "Min type consistency must be between 0 and 100, got {}.\n\
569                     → Fix: Set quality.iso_thresholds.min_type_consistency to a value between 0.0 and 100.0.",
570                    iso.min_type_consistency
571                ),
572            });
573        }
574
575        if iso.high_cardinality_threshold < 0.0 || iso.high_cardinality_threshold > 100.0 {
576            return Err(DataProfilerError::ConfigValidationError {
577                message: format!(
578                    "High cardinality threshold must be between 0 and 100, got {}.\n\
579                     → Fix: Set quality.iso_thresholds.high_cardinality_threshold to a value between 0.0 and 100.0.",
580                    iso.high_cardinality_threshold
581                ),
582            });
583        }
584
585        if iso.duplicate_report_threshold < 0.0 || iso.duplicate_report_threshold > 100.0 {
586            return Err(DataProfilerError::ConfigValidationError {
587                message: format!(
588                    "Duplicate report threshold must be between 0 and 100, got {}.\n\
589                     → Fix: Set quality.iso_thresholds.duplicate_report_threshold to a value between 0.0 and 100.0.",
590                    iso.duplicate_report_threshold
591                ),
592            });
593        }
594
595        if iso.max_data_age_years < 0.0 {
596            return Err(DataProfilerError::ConfigValidationError {
597                message: format!(
598                    "Max data age must be non-negative, got {} years.\n\
599                     → Fix: Set quality.iso_thresholds.max_data_age_years to a positive value.",
600                    iso.max_data_age_years
601                ),
602            });
603        }
604
605        if iso.stale_data_threshold < 0.0 || iso.stale_data_threshold > 100.0 {
606            return Err(DataProfilerError::ConfigValidationError {
607                message: format!(
608                    "Stale data threshold must be between 0 and 100, got {}.\n\
609                     → Fix: Set quality.iso_thresholds.stale_data_threshold to a value between 0.0 and 100.0.",
610                    iso.stale_data_threshold
611                ),
612            });
613        }
614
615        // Validate engine
616        let valid_engines = ["auto", "streaming", "memory_efficient", "true_streaming"];
617        if !valid_engines.contains(&self.engine.default_engine.as_str()) {
618            return Err(DataProfilerError::ConfigValidationError {
619                message: format!(
620                    "Invalid engine '{}'. Valid engines: {}\n\
621                     → Fix: Set engine.default_engine to one of the valid engines in your config file.\n\
622                     → Recommended: Use 'auto' for automatic selection based on file size.",
623                    self.engine.default_engine,
624                    valid_engines.join(", ")
625                ),
626            });
627        }
628
629        // Validate memory configuration
630        if self.engine.memory.auto_streaming_threshold_mb < 0.0 {
631            return Err(DataProfilerError::ConfigValidationError {
632                message: format!(
633                    "Auto-streaming threshold must be non-negative, got {} MB.\n\
634                     → Fix: Set engine.memory.auto_streaming_threshold_mb to a positive value.\n\
635                     → Recommended: 100.0 MB is a good default for most systems.",
636                    self.engine.memory.auto_streaming_threshold_mb
637                ),
638            });
639        }
640
641        // Validate chunk size if specified
642        if let Some(chunk_size) = self.engine.default_chunk_size {
643            if chunk_size == 0 {
644                return Err(DataProfilerError::ConfigValidationError {
645                    message: format!(
646                        "Chunk size must be greater than 0, got {}.\n\
647                         → Fix: Set engine.default_chunk_size to a positive value or null for adaptive sizing.\n\
648                         → Recommended: 8192-65536 rows for most CSV files.",
649                        chunk_size
650                    ),
651                });
652            }
653
654            if chunk_size > 1_000_000 {
655                return Err(DataProfilerError::ConfigValidationError {
656                    message: format!(
657                        "Chunk size {} is very large and may cause memory issues.\n\
658                         → Fix: Set engine.default_chunk_size to a smaller value.\n\
659                         → Recommended: 8192-65536 rows for most CSV files.",
660                        chunk_size
661                    ),
662                });
663            }
664        }
665
666        // Validate max concurrent operations
667        if self.engine.max_concurrent == 0 {
668            return Err(DataProfilerError::ConfigValidationError {
669                message: "Max concurrent operations must be greater than 0.\n\
670                     → Fix: Set engine.max_concurrent to a positive value.\n\
671                     → Recommended: Use num_cpus::get() or leave unspecified for automatic detection."
672                    .to_string(),
673            });
674        }
675
676        // Validate database settings if present
677        #[cfg(feature = "database")]
678        if let Some(ref db) = self.database {
679            if db.connection_timeout == 0 {
680                return Err(DataProfilerError::ConfigValidationError {
681                    message: "Database connection timeout must be greater than 0 seconds.\n\
682                         → Fix: Set database.connection_timeout to a positive value.\n\
683                         → Recommended: 30 seconds for most network conditions."
684                        .to_string(),
685                });
686            }
687
688            if db.batch_size == 0 {
689                return Err(DataProfilerError::ConfigValidationError {
690                    message: "Database batch size must be greater than 0.\n\
691                         → Fix: Set database.batch_size to a positive value.\n\
692                         → Recommended: 10000 rows for most databases."
693                        .to_string(),
694                });
695            }
696
697            if db.max_connections == 0 {
698                return Err(DataProfilerError::ConfigValidationError {
699                    message: "Database max connections must be greater than 0.\n\
700                         → Fix: Set database.max_connections to a positive value.\n\
701                         → Recommended: 10 connections for most use cases."
702                        .to_string(),
703                });
704            }
705
706            if db.sampling.default_sample_size == 0 {
707                return Err(DataProfilerError::ConfigValidationError {
708                    message: "Database sample size must be greater than 0.\n\
709                         → Fix: Set database.sampling.default_sample_size to a positive value.\n\
710                         → Recommended: 100000 rows for statistical significance."
711                        .to_string(),
712                });
713            }
714        }
715
716        Ok(())
717    }
718}
719
720// ============================================================================
721// Builder Pattern Implementation
722// ============================================================================
723
/// Builder for constructing DataprofConfig with a fluent API.
///
/// The builder pattern provides:
/// - Clear, self-documenting configuration
/// - Type-safe construction
/// - Validation at build time
/// - Easy preset configurations
///
/// Every setter takes the builder by value and returns it, so calls can be
/// chained as shown below.
///
/// # Examples
///
/// ```
/// use dataprof_core::config::{DataprofConfigBuilder, IsoQualityConfig};
///
/// // Simple configuration with defaults
/// let config = DataprofConfigBuilder::new()
///     .build()
///     .expect("Failed to build config");
///
/// // Configuration with custom settings
/// let config = DataprofConfigBuilder::new()
///     .output_format("json")
///     .verbosity(2)
///     .engine("streaming")
///     .build()
///     .expect("Failed to build config");
///
/// // Strict quality profile for finance/healthcare
/// let config = DataprofConfigBuilder::new()
///     .iso_quality_profile_strict()
///     .build()
///     .expect("Failed to build config");
/// ```
#[derive(Debug, Clone)]
pub struct DataprofConfigBuilder {
    // Pending output section; starts as `OutputConfig::default()`.
    output: OutputConfig,
    // Pending quality section; starts as `QualityConfig::default()`.
    quality: QualityConfig,
    // Pending engine section; starts as `EngineConfig::default()`.
    engine: EngineConfig,
    // Pending database section; `new()` starts it as
    // `Some(DatabaseSettings::default())`.
    #[cfg(feature = "database")]
    database: Option<DatabaseSettings>,
}
764
765impl DataprofConfigBuilder {
766    /// Create a new builder with default values.
767    pub fn new() -> Self {
768        Self {
769            output: OutputConfig::default(),
770            quality: QualityConfig::default(),
771            engine: EngineConfig::default(),
772            #[cfg(feature = "database")]
773            database: Some(DatabaseSettings::default()),
774        }
775    }
776
777    // ========================================================================
778    // Output Configuration Methods
779    // ========================================================================
780
781    /// Set the default output format.
782    ///
783    /// Valid formats: "text", "json", "csv", "plain"
784    pub fn output_format(mut self, format: &str) -> Self {
785        self.output.default_format = format.to_string();
786        self
787    }
788
789    /// Enable or disable colored output.
790    pub fn colored(mut self, enabled: bool) -> Self {
791        self.output.colored = enabled;
792        self
793    }
794
795    /// Set verbosity level (0=quiet, 1=normal, 2=verbose, 3=debug).
796    pub fn verbosity(mut self, level: u8) -> Self {
797        self.output.verbosity = level;
798        self
799    }
800
801    /// Enable or disable progress bars.
802    pub fn show_progress(mut self, enabled: bool) -> Self {
803        self.output.show_progress = enabled;
804        self
805    }
806
807    // ========================================================================
808    // Quality Configuration Methods
809    // ========================================================================
810
811    /// Enable or disable quality checking.
812    ///
813    /// When enabled, all ISO 8000/25012 quality metrics are calculated:
814    /// - Completeness (null detection)
815    /// - Consistency (type consistency, mixed types)
816    /// - Uniqueness (duplicate detection, high cardinality)
817    /// - Accuracy (outlier detection)
818    /// - Timeliness (stale data detection)
819    pub fn quality_enabled(mut self, enabled: bool) -> Self {
820        self.quality.enabled = enabled;
821        self
822    }
823
824    /// Set custom ISO quality thresholds.
825    pub fn iso_quality_thresholds(mut self, thresholds: IsoQualityConfig) -> Self {
826        self.quality.iso_thresholds = thresholds;
827        self
828    }
829
830    /// Use strict ISO quality thresholds (finance, healthcare).
831    ///
832    /// Stricter thresholds for high-compliance industries:
833    /// - Lower null tolerance (30% vs 50%)
834    /// - Higher type consistency requirements (98% vs 95%)
835    /// - Stricter duplicate detection (1% vs 5%)
836    /// - More recent data requirements (2 years vs 5 years)
837    pub fn iso_quality_profile_strict(mut self) -> Self {
838        self.quality.iso_thresholds = IsoQualityConfig::strict();
839        self
840    }
841
842    /// Use lenient ISO quality thresholds (exploratory, marketing).
843    ///
844    /// More relaxed thresholds for exploratory data:
845    /// - Higher null tolerance (70% vs 50%)
846    /// - Lower type consistency requirements (90% vs 95%)
847    /// - More lenient duplicate detection (10% vs 5%)
848    /// - Older data accepted (10 years vs 5 years)
849    pub fn iso_quality_profile_lenient(mut self) -> Self {
850        self.quality.iso_thresholds = IsoQualityConfig::lenient();
851        self
852    }
853
854    // ========================================================================
855    // Engine Configuration Methods
856    // ========================================================================
857
858    /// Set the default engine selection.
859    ///
860    /// Valid engines: "auto", "streaming", "memory_efficient", "true_streaming"
861    pub fn engine(mut self, engine: &str) -> Self {
862        self.engine.default_engine = engine.to_string();
863        self
864    }
865
866    /// Set default chunk size for streaming operations.
867    pub fn chunk_size(mut self, size: usize) -> Self {
868        self.engine.default_chunk_size = Some(size);
869        self
870    }
871
872    /// Enable or disable parallel processing.
873    pub fn parallel(mut self, enabled: bool) -> Self {
874        self.engine.parallel = enabled;
875        self
876    }
877
878    /// Set maximum concurrent operations.
879    pub fn max_concurrent(mut self, max: usize) -> Self {
880        self.engine.max_concurrent = max;
881        self
882    }
883
884    /// Set maximum memory usage in MB (0 = unlimited).
885    pub fn max_memory_mb(mut self, mb: usize) -> Self {
886        self.engine.memory.max_usage_mb = mb;
887        self
888    }
889
890    /// Set auto-streaming threshold in MB.
891    pub fn auto_streaming_threshold_mb(mut self, mb: f64) -> Self {
892        self.engine.memory.auto_streaming_threshold_mb = mb;
893        self
894    }
895
896    // ========================================================================
897    // Database Configuration Methods (optional feature)
898    // ========================================================================
899
900    #[cfg(feature = "database")]
901    /// Set database connection timeout in seconds.
902    pub fn db_connection_timeout(mut self, seconds: u64) -> Self {
903        if let Some(ref mut db) = self.database {
904            db.connection_timeout = seconds;
905        }
906        self
907    }
908
909    #[cfg(feature = "database")]
910    /// Set database batch size for queries.
911    pub fn db_batch_size(mut self, size: usize) -> Self {
912        if let Some(ref mut db) = self.database {
913            db.batch_size = size;
914        }
915        self
916    }
917
918    #[cfg(feature = "database")]
919    /// Enable or disable database sampling.
920    pub fn db_sampling_enabled(mut self, enabled: bool) -> Self {
921        if let Some(ref mut db) = self.database {
922            db.sampling.enabled = enabled;
923        }
924        self
925    }
926
927    // ========================================================================
928    // Preset Configurations
929    // ========================================================================
930
931    /// Create a configuration optimized for CI/CD pipelines.
932    ///
933    /// - No colors (for log compatibility)
934    /// - No progress bars (cleaner output)
935    /// - JSON output format
936    /// - Verbose logging
937    pub fn ci_preset() -> Self {
938        Self::new()
939            .colored(false)
940            .show_progress(false)
941            .output_format("json")
942            .verbosity(2)
943    }
944
945    /// Create a configuration for interactive terminal use.
946    ///
947    /// - Colors enabled
948    /// - Progress bars enabled
949    /// - Text output format
950    /// - Normal verbosity
951    pub fn interactive_preset() -> Self {
952        Self::new()
953            .colored(true)
954            .show_progress(true)
955            .output_format("text")
956            .verbosity(1)
957    }
958
959    /// Create a configuration for production quality checks.
960    ///
961    /// - Strict ISO quality thresholds
962    /// - Quality checking enabled
963    /// - Memory monitoring enabled
964    /// - Conservative memory limits
965    pub fn production_quality_preset() -> Self {
966        Self::new()
967            .iso_quality_profile_strict()
968            .quality_enabled(true)
969            .max_memory_mb(512) // 512MB limit for predictability
970    }
971
972    // ========================================================================
973    // Build and Validation
974    // ========================================================================
975
976    /// Build the configuration and validate it.
977    ///
978    /// Returns an error if the configuration is invalid.
979    pub fn build(self) -> Result<DataprofConfig, DataProfilerError> {
980        let config = DataprofConfig {
981            output: self.output,
982            quality: self.quality,
983            engine: self.engine,
984            #[cfg(feature = "database")]
985            database: self.database,
986        };
987
988        // Validate the configuration before returning
989        config.validate()?;
990
991        Ok(config)
992    }
993
994    /// Build the configuration without validation.
995    ///
996    /// Use this only if you're certain the configuration is valid
997    /// and want to skip validation for performance reasons.
998    pub fn build_unchecked(self) -> DataprofConfig {
999        DataprofConfig {
1000            output: self.output,
1001            quality: self.quality,
1002            engine: self.engine,
1003            #[cfg(feature = "database")]
1004            database: self.database,
1005        }
1006    }
1007}
1008
1009impl Default for DataprofConfigBuilder {
1010    fn default() -> Self {
1011        Self::new()
1012    }
1013}