1use serde::{Deserialize, Serialize};
2use std::fs;
3use std::path::{Path, PathBuf};
4
5use crate::errors::DataProfilerError;
6
7#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct IsoQualityConfig {
11 pub max_null_percentage: f64,
13 pub null_report_threshold: f64,
15 pub min_type_consistency: f64,
17 pub duplicate_report_threshold: f64,
19 pub high_cardinality_threshold: f64,
21 pub outlier_iqr_multiplier: f64,
23 pub outlier_min_samples: usize,
25 pub max_data_age_years: f64,
27 pub stale_data_threshold: f64,
29}
30
31impl Default for IsoQualityConfig {
32 fn default() -> Self {
33 Self {
34 max_null_percentage: 50.0,
35 null_report_threshold: 10.0,
36 min_type_consistency: 95.0,
37 duplicate_report_threshold: 5.0,
38 high_cardinality_threshold: 95.0,
39 outlier_iqr_multiplier: 1.5,
40 outlier_min_samples: 4,
41 max_data_age_years: 5.0,
42 stale_data_threshold: 20.0,
43 }
44 }
45}
46
47impl IsoQualityConfig {
48 pub fn strict() -> Self {
50 Self {
51 max_null_percentage: 30.0,
52 null_report_threshold: 5.0,
53 min_type_consistency: 98.0,
54 duplicate_report_threshold: 1.0,
55 high_cardinality_threshold: 98.0,
56 outlier_iqr_multiplier: 1.5,
57 outlier_min_samples: 10,
58 max_data_age_years: 2.0,
59 stale_data_threshold: 10.0,
60 }
61 }
62
63 pub fn lenient() -> Self {
65 Self {
66 max_null_percentage: 70.0,
67 null_report_threshold: 20.0,
68 min_type_consistency: 90.0,
69 duplicate_report_threshold: 10.0,
70 high_cardinality_threshold: 90.0,
71 outlier_iqr_multiplier: 2.0,
72 outlier_min_samples: 4,
73 max_data_age_years: 10.0,
74 stale_data_threshold: 30.0,
75 }
76 }
77}
78
// ---- Output defaults -------------------------------------------------------

/// Default value for `OutputConfig::default_format`.
const DEFAULT_OUTPUT_FORMAT: &str = "text";

/// Default value for `OutputConfig::colored`.
const DEFAULT_COLORED_OUTPUT: bool = true;

/// Default value for `OutputConfig::verbosity`.
const DEFAULT_VERBOSITY: u8 = 1;

/// Default value for `OutputConfig::show_progress`.
const DEFAULT_SHOW_PROGRESS: bool = true;

// ---- Quality defaults ------------------------------------------------------

/// Default value for `QualityConfig::enabled`.
const DEFAULT_QUALITY_ENABLED: bool = true;

// ---- Engine and memory defaults --------------------------------------------

/// Default value for `EngineConfig::default_engine`.
const DEFAULT_ENGINE: &str = "auto";

/// Default value for `EngineConfig::parallel`.
const DEFAULT_PARALLEL_PROCESSING: bool = true;

/// Default value for `MemoryConfig::max_usage_mb`.
// NOTE(review): 0 presumably means "no explicit limit" — confirm with the consumer.
const DEFAULT_MAX_MEMORY_MB: usize = 0;

/// Default value for `MemoryConfig::monitor`.
const DEFAULT_MEMORY_MONITORING: bool = true;

/// Default value for `MemoryConfig::auto_streaming_threshold_mb`.
const DEFAULT_AUTO_STREAMING_THRESHOLD_MB: f64 = 100.0;

// ---- Database defaults (only with the `database` feature) ------------------

/// Default value for `DatabaseSettings::connection_timeout`, in seconds.
#[cfg(feature = "database")]
const DEFAULT_DB_CONNECTION_TIMEOUT_SECS: u64 = 30;

/// Default value for `DatabaseSettings::batch_size`.
#[cfg(feature = "database")]
const DEFAULT_DB_BATCH_SIZE: usize = 10_000;

/// Default value for `DatabaseSettings::max_connections`.
#[cfg(feature = "database")]
const DEFAULT_DB_MAX_CONNECTIONS: usize = 10;

/// Default value for `DatabaseSettings::ssl_enabled`.
#[cfg(feature = "database")]
const DEFAULT_DB_SSL_ENABLED: bool = true;

/// Default value for `DatabaseSamplingConfig::enabled`.
#[cfg(feature = "database")]
const DEFAULT_DB_SAMPLING_ENABLED: bool = true;

/// Default value for `DatabaseSamplingConfig::default_sample_size`.
#[cfg(feature = "database")]
const DEFAULT_DB_SAMPLE_SIZE: usize = 100_000;

/// Default value for `DatabaseSamplingConfig::auto_sample_threshold`.
#[cfg(feature = "database")]
const DEFAULT_DB_AUTO_SAMPLE_THRESHOLD: usize = 1_000_000;
168
169#[derive(Debug, Clone, Serialize, Deserialize, Default)]
171pub struct DataprofConfig {
172 pub output: OutputConfig,
174
175 pub quality: QualityConfig,
177
178 pub engine: EngineConfig,
180
181 #[cfg(feature = "database")]
183 pub database: Option<DatabaseSettings>,
184}
185
186#[derive(Debug, Clone, Serialize, Deserialize)]
187pub struct OutputConfig {
188 pub default_format: String,
190
191 pub colored: bool,
193
194 pub verbosity: u8,
196
197 pub show_progress: bool,
199}
200
201#[derive(Debug, Clone, Serialize, Deserialize)]
202pub struct QualityConfig {
203 pub enabled: bool,
205
206 pub iso_thresholds: IsoQualityConfig,
216}
217
218#[derive(Debug, Clone, Serialize, Deserialize)]
219pub struct EngineConfig {
220 pub default_engine: String,
222
223 pub default_chunk_size: Option<usize>,
225
226 pub parallel: bool,
228
229 pub max_concurrent: usize,
231
232 pub memory: MemoryConfig,
234}
235
236#[derive(Debug, Clone, Serialize, Deserialize)]
237pub struct MemoryConfig {
238 pub max_usage_mb: usize,
240
241 pub monitor: bool,
243
244 pub auto_streaming_threshold_mb: f64,
246}
247
/// Database connection settings (only with the `database` feature).
///
/// Fields missing from a serialized config fall back to the values from this
/// type's `Default` implementation.
#[cfg(feature = "database")]
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct DatabaseSettings {
    /// Connection timeout in seconds; must be greater than 0.
    pub connection_timeout: u64,

    /// Batch size; must be greater than 0.
    pub batch_size: usize,

    /// Maximum number of connections; must be greater than 0.
    pub max_connections: usize,

    /// Whether SSL is enabled.
    pub ssl_enabled: bool,

    /// Sampling settings.
    pub sampling: DatabaseSamplingConfig,
}
266
/// Database sampling settings (only with the `database` feature).
///
/// Fields missing from a serialized config fall back to the values from this
/// type's `Default` implementation.
#[cfg(feature = "database")]
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct DatabaseSamplingConfig {
    /// Whether sampling is enabled.
    pub enabled: bool,

    /// Sample size; must be greater than 0.
    pub default_sample_size: usize,

    /// Threshold for automatic sampling.
    // NOTE(review): presumably a row count above which sampling kicks in — confirm.
    pub auto_sample_threshold: usize,
}
279
280impl Default for OutputConfig {
281 fn default() -> Self {
282 Self {
283 default_format: DEFAULT_OUTPUT_FORMAT.to_string(),
284 colored: DEFAULT_COLORED_OUTPUT,
285 verbosity: DEFAULT_VERBOSITY,
286 show_progress: DEFAULT_SHOW_PROGRESS,
287 }
288 }
289}
290
291impl Default for QualityConfig {
292 fn default() -> Self {
293 Self {
294 enabled: DEFAULT_QUALITY_ENABLED,
295 iso_thresholds: IsoQualityConfig::default(),
296 }
297 }
298}
299
300impl Default for EngineConfig {
301 fn default() -> Self {
302 Self {
303 default_engine: DEFAULT_ENGINE.to_string(),
304 default_chunk_size: None,
305 parallel: DEFAULT_PARALLEL_PROCESSING,
306 max_concurrent: num_cpus::get(),
307 memory: MemoryConfig::default(),
308 }
309 }
310}
311
312impl Default for MemoryConfig {
313 fn default() -> Self {
314 Self {
315 max_usage_mb: DEFAULT_MAX_MEMORY_MB,
316 monitor: DEFAULT_MEMORY_MONITORING,
317 auto_streaming_threshold_mb: DEFAULT_AUTO_STREAMING_THRESHOLD_MB,
318 }
319 }
320}
321
#[cfg(feature = "database")]
impl Default for DatabaseSettings {
    /// Builds the database settings from the module-level default constants,
    /// with default sampling settings.
    fn default() -> Self {
        let sampling = DatabaseSamplingConfig::default();

        Self {
            connection_timeout: DEFAULT_DB_CONNECTION_TIMEOUT_SECS,
            batch_size: DEFAULT_DB_BATCH_SIZE,
            max_connections: DEFAULT_DB_MAX_CONNECTIONS,
            ssl_enabled: DEFAULT_DB_SSL_ENABLED,
            sampling,
        }
    }
}
334
#[cfg(feature = "database")]
impl Default for DatabaseSamplingConfig {
    /// Builds the sampling settings from the module-level default constants.
    fn default() -> Self {
        let enabled = DEFAULT_DB_SAMPLING_ENABLED;
        let default_sample_size = DEFAULT_DB_SAMPLE_SIZE;
        let auto_sample_threshold = DEFAULT_DB_AUTO_SAMPLE_THRESHOLD;

        Self {
            enabled,
            default_sample_size,
            auto_sample_threshold,
        }
    }
}
345
346impl DataprofConfig {
347 pub fn load_from_file<P: AsRef<Path>>(path: P) -> Result<Self, DataProfilerError> {
349 let content = fs::read_to_string(path)?;
350 let config: DataprofConfig = toml::from_str(&content)?;
351 Ok(config)
352 }
353
354 pub fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<(), DataProfilerError> {
356 let content = toml::to_string_pretty(self)?;
357 fs::write(path, content)?;
358 Ok(())
359 }
360
361 pub fn load_with_discovery() -> Self {
372 let mut config_paths = vec![
374 PathBuf::from(".dataprof.toml"),
376 PathBuf::from("dataprof.toml"),
377 ];
378
379 if let Some(home) = std::env::var_os("HOME").or_else(|| std::env::var_os("USERPROFILE")) {
381 let user_config = PathBuf::from(home)
382 .join(".config")
383 .join("dataprof")
384 .join("config.toml");
385 config_paths.push(user_config);
386 }
387
388 for path in &config_paths {
390 if path.exists() {
391 match Self::load_from_file(path) {
392 Ok(mut config) => {
393 log::info!("Loaded configuration from: {}", path.display());
394 config.apply_env_overrides();
396 return config;
397 }
398 Err(e) => {
399 log::warn!(
400 "Found config file at {} but failed to load: {}",
401 path.display(),
402 e
403 );
404 }
405 }
406 }
407 }
408
409 log::debug!("No configuration file found. Using defaults with environment overrides.");
411 let mut config = Self::default();
412 config.apply_env_overrides();
413 config
414 }
415
416 pub fn apply_env_overrides(&mut self) {
418 if let Ok(format) = std::env::var("DATAPROF_FORMAT") {
420 self.output.default_format = format;
421 }
422
423 if let Ok(verbosity) = std::env::var("DATAPROF_VERBOSITY")
425 && let Ok(level) = verbosity.parse::<u8>()
426 {
427 self.output.verbosity = level;
428 }
429
430 if let Ok(engine) = std::env::var("DATAPROF_ENGINE") {
432 self.engine.default_engine = engine;
433 }
434
435 if let Ok(quality) = std::env::var("DATAPROF_QUALITY")
437 && let Ok(enabled) = quality.parse::<bool>()
438 {
439 self.quality.enabled = enabled;
440 }
441
442 if std::env::var("NO_COLOR").is_ok() {
444 self.output.colored = false;
445 }
446
447 if let Ok(progress) = std::env::var("DATAPROF_PROGRESS")
449 && let Ok(enabled) = progress.parse::<bool>()
450 {
451 self.output.show_progress = enabled;
452 }
453 }
454
455 pub fn apply_overrides(
457 &mut self,
458 format: Option<&str>,
459 quality: Option<bool>,
460 progress: Option<bool>,
461 ) {
462 if let Some(format) = format {
463 self.output.default_format = format.to_string();
464 }
465
466 if let Some(quality) = quality {
467 self.quality.enabled = quality;
468 }
469
470 if let Some(progress) = progress {
471 self.output.show_progress = progress;
472 }
473 }
474
475 #[deprecated(note = "use DataprofConfig::apply_overrides instead")]
477 pub fn merge_with_cli_args(
478 &mut self,
479 format: Option<&str>,
480 quality: Option<bool>,
481 progress: Option<bool>,
482 ) {
483 self.apply_overrides(format, quality, progress);
484 }
485
486 pub fn create_sample_config<P: AsRef<Path>>(path: P) -> Result<(), DataProfilerError> {
488 let sample_config = Self::default();
489 sample_config.save_to_file(path)?;
490 Ok(())
491 }
492}
493
494impl DataprofConfig {
496 pub fn validate(&self) -> Result<(), DataProfilerError> {
507 let valid_formats = ["text", "json", "csv", "plain"];
509 if !valid_formats.contains(&self.output.default_format.as_str()) {
510 return Err(DataProfilerError::ConfigValidationError {
511 message: format!(
512 "Invalid output format '{}'. Valid formats: {}\n\
513 → Fix: Set output.default_format to one of the valid formats in your config file.",
514 self.output.default_format,
515 valid_formats.join(", ")
516 ),
517 });
518 }
519
520 if self.output.verbosity > 3 {
522 return Err(DataProfilerError::ConfigValidationError {
523 message: format!(
524 "Invalid verbosity level {}. Must be between 0 (quiet) and 3 (debug).\n\
525 → Fix: Set output.verbosity to 0, 1, 2, or 3 in your config file.",
526 self.output.verbosity
527 ),
528 });
529 }
530
531 let iso = &self.quality.iso_thresholds;
533
534 if iso.outlier_iqr_multiplier <= 0.0 {
535 return Err(DataProfilerError::ConfigValidationError {
536 message: format!(
537 "IQR multiplier must be positive (standard value: 1.5), got {}.\n\
538 → Fix: Set quality.iso_thresholds.outlier_iqr_multiplier to a positive value.\n\
539 → Recommended: Use 1.5 (ISO standard) for normal cases, 3.0 for lenient detection.",
540 iso.outlier_iqr_multiplier
541 ),
542 });
543 }
544
545 if iso.max_null_percentage < 0.0 || iso.max_null_percentage > 100.0 {
546 return Err(DataProfilerError::ConfigValidationError {
547 message: format!(
548 "Max null percentage must be between 0 and 100, got {}.\n\
549 → Fix: Set quality.iso_thresholds.max_null_percentage to a value between 0.0 and 100.0.",
550 iso.max_null_percentage
551 ),
552 });
553 }
554
555 if iso.null_report_threshold < 0.0 || iso.null_report_threshold > 100.0 {
556 return Err(DataProfilerError::ConfigValidationError {
557 message: format!(
558 "Null report threshold must be between 0 and 100, got {}.\n\
559 → Fix: Set quality.iso_thresholds.null_report_threshold to a value between 0.0 and 100.0.",
560 iso.null_report_threshold
561 ),
562 });
563 }
564
565 if iso.min_type_consistency < 0.0 || iso.min_type_consistency > 100.0 {
566 return Err(DataProfilerError::ConfigValidationError {
567 message: format!(
568 "Min type consistency must be between 0 and 100, got {}.\n\
569 → Fix: Set quality.iso_thresholds.min_type_consistency to a value between 0.0 and 100.0.",
570 iso.min_type_consistency
571 ),
572 });
573 }
574
575 if iso.high_cardinality_threshold < 0.0 || iso.high_cardinality_threshold > 100.0 {
576 return Err(DataProfilerError::ConfigValidationError {
577 message: format!(
578 "High cardinality threshold must be between 0 and 100, got {}.\n\
579 → Fix: Set quality.iso_thresholds.high_cardinality_threshold to a value between 0.0 and 100.0.",
580 iso.high_cardinality_threshold
581 ),
582 });
583 }
584
585 if iso.duplicate_report_threshold < 0.0 || iso.duplicate_report_threshold > 100.0 {
586 return Err(DataProfilerError::ConfigValidationError {
587 message: format!(
588 "Duplicate report threshold must be between 0 and 100, got {}.\n\
589 → Fix: Set quality.iso_thresholds.duplicate_report_threshold to a value between 0.0 and 100.0.",
590 iso.duplicate_report_threshold
591 ),
592 });
593 }
594
595 if iso.max_data_age_years < 0.0 {
596 return Err(DataProfilerError::ConfigValidationError {
597 message: format!(
598 "Max data age must be non-negative, got {} years.\n\
599 → Fix: Set quality.iso_thresholds.max_data_age_years to a positive value.",
600 iso.max_data_age_years
601 ),
602 });
603 }
604
605 if iso.stale_data_threshold < 0.0 || iso.stale_data_threshold > 100.0 {
606 return Err(DataProfilerError::ConfigValidationError {
607 message: format!(
608 "Stale data threshold must be between 0 and 100, got {}.\n\
609 → Fix: Set quality.iso_thresholds.stale_data_threshold to a value between 0.0 and 100.0.",
610 iso.stale_data_threshold
611 ),
612 });
613 }
614
615 let valid_engines = ["auto", "streaming", "memory_efficient", "true_streaming"];
617 if !valid_engines.contains(&self.engine.default_engine.as_str()) {
618 return Err(DataProfilerError::ConfigValidationError {
619 message: format!(
620 "Invalid engine '{}'. Valid engines: {}\n\
621 → Fix: Set engine.default_engine to one of the valid engines in your config file.\n\
622 → Recommended: Use 'auto' for automatic selection based on file size.",
623 self.engine.default_engine,
624 valid_engines.join(", ")
625 ),
626 });
627 }
628
629 if self.engine.memory.auto_streaming_threshold_mb < 0.0 {
631 return Err(DataProfilerError::ConfigValidationError {
632 message: format!(
633 "Auto-streaming threshold must be non-negative, got {} MB.\n\
634 → Fix: Set engine.memory.auto_streaming_threshold_mb to a positive value.\n\
635 → Recommended: 100.0 MB is a good default for most systems.",
636 self.engine.memory.auto_streaming_threshold_mb
637 ),
638 });
639 }
640
641 if let Some(chunk_size) = self.engine.default_chunk_size {
643 if chunk_size == 0 {
644 return Err(DataProfilerError::ConfigValidationError {
645 message: format!(
646 "Chunk size must be greater than 0, got {}.\n\
647 → Fix: Set engine.default_chunk_size to a positive value or null for adaptive sizing.\n\
648 → Recommended: 8192-65536 rows for most CSV files.",
649 chunk_size
650 ),
651 });
652 }
653
654 if chunk_size > 1_000_000 {
655 return Err(DataProfilerError::ConfigValidationError {
656 message: format!(
657 "Chunk size {} is very large and may cause memory issues.\n\
658 → Fix: Set engine.default_chunk_size to a smaller value.\n\
659 → Recommended: 8192-65536 rows for most CSV files.",
660 chunk_size
661 ),
662 });
663 }
664 }
665
666 if self.engine.max_concurrent == 0 {
668 return Err(DataProfilerError::ConfigValidationError {
669 message: "Max concurrent operations must be greater than 0.\n\
670 → Fix: Set engine.max_concurrent to a positive value.\n\
671 → Recommended: Use num_cpus::get() or leave unspecified for automatic detection."
672 .to_string(),
673 });
674 }
675
676 #[cfg(feature = "database")]
678 if let Some(ref db) = self.database {
679 if db.connection_timeout == 0 {
680 return Err(DataProfilerError::ConfigValidationError {
681 message: "Database connection timeout must be greater than 0 seconds.\n\
682 → Fix: Set database.connection_timeout to a positive value.\n\
683 → Recommended: 30 seconds for most network conditions."
684 .to_string(),
685 });
686 }
687
688 if db.batch_size == 0 {
689 return Err(DataProfilerError::ConfigValidationError {
690 message: "Database batch size must be greater than 0.\n\
691 → Fix: Set database.batch_size to a positive value.\n\
692 → Recommended: 10000 rows for most databases."
693 .to_string(),
694 });
695 }
696
697 if db.max_connections == 0 {
698 return Err(DataProfilerError::ConfigValidationError {
699 message: "Database max connections must be greater than 0.\n\
700 → Fix: Set database.max_connections to a positive value.\n\
701 → Recommended: 10 connections for most use cases."
702 .to_string(),
703 });
704 }
705
706 if db.sampling.default_sample_size == 0 {
707 return Err(DataProfilerError::ConfigValidationError {
708 message: "Database sample size must be greater than 0.\n\
709 → Fix: Set database.sampling.default_sample_size to a positive value.\n\
710 → Recommended: 100000 rows for statistical significance."
711 .to_string(),
712 });
713 }
714 }
715
716 Ok(())
717 }
718}
719
720#[derive(Debug, Clone)]
757pub struct DataprofConfigBuilder {
758 output: OutputConfig,
759 quality: QualityConfig,
760 engine: EngineConfig,
761 #[cfg(feature = "database")]
762 database: Option<DatabaseSettings>,
763}
764
765impl DataprofConfigBuilder {
766 pub fn new() -> Self {
768 Self {
769 output: OutputConfig::default(),
770 quality: QualityConfig::default(),
771 engine: EngineConfig::default(),
772 #[cfg(feature = "database")]
773 database: Some(DatabaseSettings::default()),
774 }
775 }
776
777 pub fn output_format(mut self, format: &str) -> Self {
785 self.output.default_format = format.to_string();
786 self
787 }
788
789 pub fn colored(mut self, enabled: bool) -> Self {
791 self.output.colored = enabled;
792 self
793 }
794
795 pub fn verbosity(mut self, level: u8) -> Self {
797 self.output.verbosity = level;
798 self
799 }
800
801 pub fn show_progress(mut self, enabled: bool) -> Self {
803 self.output.show_progress = enabled;
804 self
805 }
806
807 pub fn quality_enabled(mut self, enabled: bool) -> Self {
820 self.quality.enabled = enabled;
821 self
822 }
823
824 pub fn iso_quality_thresholds(mut self, thresholds: IsoQualityConfig) -> Self {
826 self.quality.iso_thresholds = thresholds;
827 self
828 }
829
830 pub fn iso_quality_profile_strict(mut self) -> Self {
838 self.quality.iso_thresholds = IsoQualityConfig::strict();
839 self
840 }
841
842 pub fn iso_quality_profile_lenient(mut self) -> Self {
850 self.quality.iso_thresholds = IsoQualityConfig::lenient();
851 self
852 }
853
854 pub fn engine(mut self, engine: &str) -> Self {
862 self.engine.default_engine = engine.to_string();
863 self
864 }
865
866 pub fn chunk_size(mut self, size: usize) -> Self {
868 self.engine.default_chunk_size = Some(size);
869 self
870 }
871
872 pub fn parallel(mut self, enabled: bool) -> Self {
874 self.engine.parallel = enabled;
875 self
876 }
877
878 pub fn max_concurrent(mut self, max: usize) -> Self {
880 self.engine.max_concurrent = max;
881 self
882 }
883
884 pub fn max_memory_mb(mut self, mb: usize) -> Self {
886 self.engine.memory.max_usage_mb = mb;
887 self
888 }
889
890 pub fn auto_streaming_threshold_mb(mut self, mb: f64) -> Self {
892 self.engine.memory.auto_streaming_threshold_mb = mb;
893 self
894 }
895
896 #[cfg(feature = "database")]
901 pub fn db_connection_timeout(mut self, seconds: u64) -> Self {
903 if let Some(ref mut db) = self.database {
904 db.connection_timeout = seconds;
905 }
906 self
907 }
908
909 #[cfg(feature = "database")]
910 pub fn db_batch_size(mut self, size: usize) -> Self {
912 if let Some(ref mut db) = self.database {
913 db.batch_size = size;
914 }
915 self
916 }
917
918 #[cfg(feature = "database")]
919 pub fn db_sampling_enabled(mut self, enabled: bool) -> Self {
921 if let Some(ref mut db) = self.database {
922 db.sampling.enabled = enabled;
923 }
924 self
925 }
926
927 pub fn ci_preset() -> Self {
938 Self::new()
939 .colored(false)
940 .show_progress(false)
941 .output_format("json")
942 .verbosity(2)
943 }
944
945 pub fn interactive_preset() -> Self {
952 Self::new()
953 .colored(true)
954 .show_progress(true)
955 .output_format("text")
956 .verbosity(1)
957 }
958
959 pub fn production_quality_preset() -> Self {
966 Self::new()
967 .iso_quality_profile_strict()
968 .quality_enabled(true)
969 .max_memory_mb(512) }
971
972 pub fn build(self) -> Result<DataprofConfig, DataProfilerError> {
980 let config = DataprofConfig {
981 output: self.output,
982 quality: self.quality,
983 engine: self.engine,
984 #[cfg(feature = "database")]
985 database: self.database,
986 };
987
988 config.validate()?;
990
991 Ok(config)
992 }
993
994 pub fn build_unchecked(self) -> DataprofConfig {
999 DataprofConfig {
1000 output: self.output,
1001 quality: self.quality,
1002 engine: self.engine,
1003 #[cfg(feature = "database")]
1004 database: self.database,
1005 }
1006 }
1007}
1008
1009impl Default for DataprofConfigBuilder {
1010 fn default() -> Self {
1011 Self::new()
1012 }
1013}