use serde::{Deserialize, Serialize};
use std::fs;
use std::path::{Path, PathBuf};
use crate::core::errors::DataProfilerError;
// Fallback values consumed by the `Default` implementations of the configuration structs
// defined later in this file.
/// Output format used when none is configured (`OutputConfig::default_format`).
const DEFAULT_OUTPUT_FORMAT: &str = "text";
/// Colored output is on by default (`OutputConfig::colored`).
const DEFAULT_COLORED_OUTPUT: bool = true;
/// Default verbosity level; `DataprofConfig::validate` accepts 0 (quiet) through 3 (debug).
const DEFAULT_VERBOSITY: u8 = 1;
/// Progress display is on by default (`OutputConfig::show_progress`).
const DEFAULT_SHOW_PROGRESS: bool = true;
/// Quality checks are enabled by default (`QualityConfig::enabled`).
const DEFAULT_QUALITY_ENABLED: bool = true;
/// Default engine name; "auto" is one of the names accepted by `DataprofConfig::validate`.
const DEFAULT_ENGINE: &str = "auto";
/// Parallel processing is on by default (`EngineConfig::parallel`).
const DEFAULT_PARALLEL_PROCESSING: bool = true;
/// Default for `MemoryConfig::max_usage_mb`.
/// NOTE(review): 0 presumably means "no explicit limit" — confirm against the engine code.
const DEFAULT_MAX_MEMORY_MB: usize = 0;
/// Memory monitoring is on by default (`MemoryConfig::monitor`).
const DEFAULT_MEMORY_MONITORING: bool = true;
/// Default for `MemoryConfig::auto_streaming_threshold_mb`, in megabytes.
const DEFAULT_AUTO_STREAMING_THRESHOLD_MB: f64 = 100.0;
/// Default database connection timeout, in seconds (only with the `database` feature).
#[cfg(feature = "database")]
const DEFAULT_DB_CONNECTION_TIMEOUT_SECS: u64 = 30;
/// Default database batch size (only with the `database` feature).
#[cfg(feature = "database")]
const DEFAULT_DB_BATCH_SIZE: usize = 10_000;
/// Default maximum number of database connections (only with the `database` feature).
#[cfg(feature = "database")]
const DEFAULT_DB_MAX_CONNECTIONS: usize = 10;
/// SSL is enabled for database connections by default (only with the `database` feature).
#[cfg(feature = "database")]
const DEFAULT_DB_SSL_ENABLED: bool = true;
/// Database sampling is enabled by default (only with the `database` feature).
#[cfg(feature = "database")]
const DEFAULT_DB_SAMPLING_ENABLED: bool = true;
/// Default database sample size (only with the `database` feature).
#[cfg(feature = "database")]
const DEFAULT_DB_SAMPLE_SIZE: usize = 100_000;
/// Default for `DatabaseSamplingConfig::auto_sample_threshold` (only with the `database` feature).
/// NOTE(review): presumably the row count above which sampling kicks in — confirm with the caller.
#[cfg(feature = "database")]
const DEFAULT_DB_AUTO_SAMPLE_THRESHOLD: usize = 1_000_000;
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct DataprofConfig {
pub output: OutputConfig,
pub quality: QualityConfig,
pub engine: EngineConfig,
#[cfg(feature = "database")]
pub database: Option<DatabaseSettings>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OutputConfig {
pub default_format: String,
pub colored: bool,
pub verbosity: u8,
pub show_progress: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityConfig {
pub enabled: bool,
pub iso_thresholds: IsoQualityConfig,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IsoQualityConfig {
pub max_null_percentage: f64,
pub null_report_threshold: f64,
pub min_type_consistency: f64,
pub duplicate_report_threshold: f64,
pub high_cardinality_threshold: f64,
pub outlier_iqr_multiplier: f64,
pub outlier_min_samples: usize,
pub max_data_age_years: f64,
pub stale_data_threshold: f64,
}
impl Default for IsoQualityConfig {
fn default() -> Self {
Self {
max_null_percentage: 50.0,
null_report_threshold: 10.0,
min_type_consistency: 95.0,
duplicate_report_threshold: 5.0,
high_cardinality_threshold: 95.0,
outlier_iqr_multiplier: 1.5, outlier_min_samples: 4,
max_data_age_years: 5.0,
stale_data_threshold: 20.0,
}
}
}
impl IsoQualityConfig {
pub fn strict() -> Self {
Self {
max_null_percentage: 30.0,
null_report_threshold: 5.0,
min_type_consistency: 98.0,
duplicate_report_threshold: 1.0,
high_cardinality_threshold: 98.0,
outlier_iqr_multiplier: 1.5,
outlier_min_samples: 10,
max_data_age_years: 2.0, stale_data_threshold: 10.0,
}
}
pub fn lenient() -> Self {
Self {
max_null_percentage: 70.0,
null_report_threshold: 20.0,
min_type_consistency: 90.0,
duplicate_report_threshold: 10.0,
high_cardinality_threshold: 90.0,
outlier_iqr_multiplier: 2.0,
outlier_min_samples: 4,
max_data_age_years: 10.0, stale_data_threshold: 30.0,
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EngineConfig {
pub default_engine: String,
pub default_chunk_size: Option<usize>,
pub parallel: bool,
pub max_concurrent: usize,
pub memory: MemoryConfig,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryConfig {
pub max_usage_mb: usize,
pub monitor: bool,
pub auto_streaming_threshold_mb: f64,
}
/// Database connection settings (only with the `database` feature).
///
/// `#[serde(default)]` fills any key missing from the input with the value
/// from `DatabaseSettings::default()`, so a partial `[database]` table is
/// accepted.
#[cfg(feature = "database")]
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct DatabaseSettings {
    /// Connection timeout in seconds; must be greater than 0.
    pub connection_timeout: u64,
    /// Batch size; must be greater than 0.
    pub batch_size: usize,
    /// Maximum number of connections; must be greater than 0.
    pub max_connections: usize,
    /// Whether SSL is enabled.
    pub ssl_enabled: bool,
    /// Sampling settings.
    pub sampling: DatabaseSamplingConfig,
}
/// Database sampling settings (only with the `database` feature).
///
/// `#[serde(default)]` fills any key missing from the input with the value
/// from `DatabaseSamplingConfig::default()`, so a partial sampling table is
/// accepted.
#[cfg(feature = "database")]
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct DatabaseSamplingConfig {
    /// Whether sampling is enabled.
    pub enabled: bool,
    /// Default sample size; must be greater than 0.
    pub default_sample_size: usize,
    /// NOTE(review): presumably the table size above which sampling is
    /// applied automatically — not validated here; confirm with the caller.
    pub auto_sample_threshold: usize,
}
impl Default for OutputConfig {
fn default() -> Self {
Self {
default_format: DEFAULT_OUTPUT_FORMAT.to_string(),
colored: DEFAULT_COLORED_OUTPUT,
verbosity: DEFAULT_VERBOSITY,
show_progress: DEFAULT_SHOW_PROGRESS,
}
}
}
impl Default for QualityConfig {
fn default() -> Self {
Self {
enabled: DEFAULT_QUALITY_ENABLED,
iso_thresholds: IsoQualityConfig::default(),
}
}
}
impl Default for EngineConfig {
fn default() -> Self {
Self {
default_engine: DEFAULT_ENGINE.to_string(),
default_chunk_size: None,
parallel: DEFAULT_PARALLEL_PROCESSING,
max_concurrent: num_cpus::get(),
memory: MemoryConfig::default(),
}
}
}
impl Default for MemoryConfig {
fn default() -> Self {
Self {
max_usage_mb: DEFAULT_MAX_MEMORY_MB,
monitor: DEFAULT_MEMORY_MONITORING,
auto_streaming_threshold_mb: DEFAULT_AUTO_STREAMING_THRESHOLD_MB,
}
}
}
#[cfg(feature = "database")]
impl Default for DatabaseSettings {
    /// Builds the database settings from the `DEFAULT_DB_*` constants and the
    /// default sampling configuration.
    fn default() -> Self {
        let sampling = DatabaseSamplingConfig::default();
        DatabaseSettings {
            sampling,
            ssl_enabled: DEFAULT_DB_SSL_ENABLED,
            max_connections: DEFAULT_DB_MAX_CONNECTIONS,
            batch_size: DEFAULT_DB_BATCH_SIZE,
            connection_timeout: DEFAULT_DB_CONNECTION_TIMEOUT_SECS,
        }
    }
}
#[cfg(feature = "database")]
impl Default for DatabaseSamplingConfig {
    /// Builds the sampling settings from the `DEFAULT_DB_*` constants.
    fn default() -> Self {
        DatabaseSamplingConfig {
            auto_sample_threshold: DEFAULT_DB_AUTO_SAMPLE_THRESHOLD,
            default_sample_size: DEFAULT_DB_SAMPLE_SIZE,
            enabled: DEFAULT_DB_SAMPLING_ENABLED,
        }
    }
}
impl DataprofConfig {
    /// Reads `path` and parses its contents as TOML.
    ///
    /// The result is returned as parsed: environment overrides are not applied
    /// and `validate` is not called.
    ///
    /// # Errors
    /// Returns the file-read error or the TOML deserialization error, each
    /// converted into `DataProfilerError` by `?`.
    pub fn load_from_file<P: AsRef<Path>>(path: P) -> Result<Self, DataProfilerError> {
        let content = fs::read_to_string(path)?;
        let config: DataprofConfig = toml::from_str(&content)?;
        Ok(config)
    }

    /// Serializes this configuration as pretty-printed TOML and writes it to `path`.
    ///
    /// # Errors
    /// Returns the TOML serialization error or the file-write error, each
    /// converted into `DataProfilerError` by `?`.
    pub fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<(), DataProfilerError> {
        let content = toml::to_string_pretty(self)?;
        fs::write(path, content)?;
        Ok(())
    }

    /// Loads the first usable configuration file, then applies environment overrides.
    ///
    /// Candidates are tried in this order:
    /// 1. `.dataprof.toml` in the current directory,
    /// 2. `dataprof.toml` in the current directory,
    /// 3. `<home>/.config/dataprof/config.toml`, where `<home>` is `HOME` or,
    ///    if that is unset, `USERPROFILE`.
    ///
    /// A file that exists but fails to load is logged as a warning and the
    /// search continues. If no candidate loads, the defaults are used. The
    /// returned configuration is not validated.
    pub fn load_with_discovery() -> Self {
        let mut config_paths = vec![
            PathBuf::from(".dataprof.toml"),
            PathBuf::from("dataprof.toml"),
        ];
        if let Some(home) = std::env::var_os("HOME").or_else(|| std::env::var_os("USERPROFILE")) {
            config_paths.push(
                PathBuf::from(home)
                    .join(".config")
                    .join("dataprof")
                    .join("config.toml"),
            );
        }

        for path in &config_paths {
            if !path.exists() {
                continue;
            }
            match Self::load_from_file(path) {
                Ok(mut config) => {
                    log::info!("Loaded configuration from: {}", path.display());
                    config.apply_env_overrides();
                    return config;
                }
                Err(e) => {
                    log::warn!(
                        "Found config file at {} but failed to load: {}",
                        path.display(),
                        e
                    );
                }
            }
        }

        log::debug!("No configuration file found. Using defaults with environment overrides.");
        let mut config = Self::default();
        config.apply_env_overrides();
        config
    }

    /// Overrides selected settings from environment variables.
    ///
    /// | Variable             | Field                   | Accepted values        |
    /// |----------------------|-------------------------|------------------------|
    /// | `DATAPROF_FORMAT`    | `output.default_format` | any string (unchecked) |
    /// | `DATAPROF_VERBOSITY` | `output.verbosity`      | a `u8`                 |
    /// | `DATAPROF_ENGINE`    | `engine.default_engine` | any string (unchecked) |
    /// | `DATAPROF_QUALITY`   | `quality.enabled`       | `true` / `false`       |
    /// | `DATAPROF_PROGRESS`  | `output.show_progress`  | `true` / `false`       |
    /// | `NO_COLOR`           | `output.colored = false`| any non-empty value    |
    ///
    /// A value that cannot be parsed is ignored with a logged warning. Nothing
    /// is validated here; call `validate` afterwards to reject bad
    /// format or engine names.
    pub fn apply_env_overrides(&mut self) {
        if let Ok(format) = std::env::var("DATAPROF_FORMAT") {
            self.output.default_format = format;
        }
        if let Some(level) = Self::parsed_env::<u8>("DATAPROF_VERBOSITY") {
            self.output.verbosity = level;
        }
        if let Ok(engine) = std::env::var("DATAPROF_ENGINE") {
            self.engine.default_engine = engine;
        }
        if let Some(enabled) = Self::parsed_env::<bool>("DATAPROF_QUALITY") {
            self.quality.enabled = enabled;
        }
        // The NO_COLOR convention only disables color when the variable is
        // present AND non-empty. `var_os` is used so that a non-Unicode value
        // still counts as "set".
        if std::env::var_os("NO_COLOR").is_some_and(|value| !value.is_empty()) {
            self.output.colored = false;
        }
        if let Some(enabled) = Self::parsed_env::<bool>("DATAPROF_PROGRESS") {
            self.output.show_progress = enabled;
        }
    }

    /// Reads environment variable `name` and parses it as `T`.
    ///
    /// Returns `None` when the variable is unset or not valid Unicode, or when
    /// parsing fails; a parse failure is logged so that typos such as
    /// `DATAPROF_QUALITY=yes` do not go unnoticed.
    fn parsed_env<T: std::str::FromStr>(name: &str) -> Option<T> {
        let raw = std::env::var(name).ok()?;
        match raw.parse::<T>() {
            Ok(value) => Some(value),
            Err(_) => {
                log::warn!(
                    "Ignoring environment variable {}: could not parse value '{}'",
                    name,
                    raw
                );
                None
            }
        }
    }

    /// Applies command-line overrides; every argument that is `Some` replaces
    /// the corresponding setting, `None` leaves it untouched.
    pub fn merge_with_cli_args(
        &mut self,
        cli_format: Option<&str>,
        cli_quality: Option<bool>,
        cli_progress: Option<bool>,
    ) {
        if let Some(format) = cli_format {
            self.output.default_format = format.to_string();
        }
        if let Some(quality) = cli_quality {
            self.quality.enabled = quality;
        }
        if let Some(progress) = cli_progress {
            self.output.show_progress = progress;
        }
    }

    /// Writes the default configuration to `path` as a starting point for users.
    ///
    /// # Errors
    /// Returns whatever `save_to_file` returns.
    pub fn create_sample_config<P: AsRef<Path>>(path: P) -> Result<(), DataProfilerError> {
        Self::default().save_to_file(path)
    }
}
impl DataprofConfig {
    /// Checks every setting against its allowed range and returns the first violation.
    ///
    /// Checks run in a fixed order: output format, verbosity, quality
    /// thresholds, engine name, auto-streaming threshold, chunk size,
    /// concurrency and, with the `database` feature, the database settings.
    ///
    /// Floating-point settings are rejected when they are NaN. NaN compares
    /// false against every bound, so plain `<` / `>` checks would let it through.
    ///
    /// # Errors
    /// Returns `DataProfilerError::ConfigValidationError` whose message
    /// describes the offending setting and how to fix it.
    pub fn validate(&self) -> Result<(), DataProfilerError> {
        let valid_formats = ["text", "json", "csv", "plain"];
        if !valid_formats.contains(&self.output.default_format.as_str()) {
            return Err(Self::invalid(format!(
                "Invalid output format '{}'. Valid formats: {}\n\
                 → Fix: Set output.default_format to one of the valid formats in your config file.",
                self.output.default_format,
                valid_formats.join(", ")
            )));
        }
        if self.output.verbosity > 3 {
            return Err(Self::invalid(format!(
                "Invalid verbosity level {}. Must be between 0 (quiet) and 3 (debug).\n\
                 → Fix: Set output.verbosity to 0, 1, 2, or 3 in your config file.",
                self.output.verbosity
            )));
        }

        let iso = &self.quality.iso_thresholds;
        if iso.outlier_iqr_multiplier.is_nan() || iso.outlier_iqr_multiplier <= 0.0 {
            return Err(Self::invalid(format!(
                "IQR multiplier must be positive (standard value: 1.5), got {}.\n\
                 → Fix: Set quality.iso_thresholds.outlier_iqr_multiplier to a positive value.\n\
                 → Recommended: Use 1.5 (ISO standard) for normal cases, 3.0 for lenient detection.",
                iso.outlier_iqr_multiplier
            )));
        }
        Self::check_percentage(
            "Max null percentage",
            "max_null_percentage",
            iso.max_null_percentage,
        )?;
        Self::check_percentage(
            "Null report threshold",
            "null_report_threshold",
            iso.null_report_threshold,
        )?;
        Self::check_percentage(
            "Min type consistency",
            "min_type_consistency",
            iso.min_type_consistency,
        )?;
        Self::check_percentage(
            "High cardinality threshold",
            "high_cardinality_threshold",
            iso.high_cardinality_threshold,
        )?;
        Self::check_percentage(
            "Duplicate report threshold",
            "duplicate_report_threshold",
            iso.duplicate_report_threshold,
        )?;
        if iso.max_data_age_years.is_nan() || iso.max_data_age_years < 0.0 {
            return Err(Self::invalid(format!(
                "Max data age must be non-negative, got {} years.\n\
                 → Fix: Set quality.iso_thresholds.max_data_age_years to a positive value.",
                iso.max_data_age_years
            )));
        }
        Self::check_percentage(
            "Stale data threshold",
            "stale_data_threshold",
            iso.stale_data_threshold,
        )?;

        let valid_engines = ["auto", "streaming", "memory_efficient", "true_streaming"];
        if !valid_engines.contains(&self.engine.default_engine.as_str()) {
            return Err(Self::invalid(format!(
                "Invalid engine '{}'. Valid engines: {}\n\
                 → Fix: Set engine.default_engine to one of the valid engines in your config file.\n\
                 → Recommended: Use 'auto' for automatic selection based on file size.",
                self.engine.default_engine,
                valid_engines.join(", ")
            )));
        }
        let streaming_threshold = self.engine.memory.auto_streaming_threshold_mb;
        if streaming_threshold.is_nan() || streaming_threshold < 0.0 {
            return Err(Self::invalid(format!(
                "Auto-streaming threshold must be non-negative, got {} MB.\n\
                 → Fix: Set engine.memory.auto_streaming_threshold_mb to a positive value.\n\
                 → Recommended: 100.0 MB is a good default for most systems.",
                streaming_threshold
            )));
        }
        if let Some(chunk_size) = self.engine.default_chunk_size {
            if chunk_size == 0 {
                return Err(Self::invalid(format!(
                    "Chunk size must be greater than 0, got {}.\n\
                     → Fix: Set engine.default_chunk_size to a positive value or null for adaptive sizing.\n\
                     → Recommended: 8192-65536 rows for most CSV files.",
                    chunk_size
                )));
            }
            if chunk_size > 1_000_000 {
                return Err(Self::invalid(format!(
                    "Chunk size {} is very large and may cause memory issues.\n\
                     → Fix: Set engine.default_chunk_size to a smaller value.\n\
                     → Recommended: 8192-65536 rows for most CSV files.",
                    chunk_size
                )));
            }
        }
        if self.engine.max_concurrent == 0 {
            return Err(Self::invalid(
                "Max concurrent operations must be greater than 0.\n\
                 → Fix: Set engine.max_concurrent to a positive value.\n\
                 → Recommended: Use num_cpus::get() or leave unspecified for automatic detection."
                    .to_string(),
            ));
        }

        #[cfg(feature = "database")]
        if let Some(ref db) = self.database {
            if db.connection_timeout == 0 {
                return Err(Self::invalid(
                    "Database connection timeout must be greater than 0 seconds.\n\
                     → Fix: Set database.connection_timeout to a positive value.\n\
                     → Recommended: 30 seconds for most network conditions."
                        .to_string(),
                ));
            }
            if db.batch_size == 0 {
                return Err(Self::invalid(
                    "Database batch size must be greater than 0.\n\
                     → Fix: Set database.batch_size to a positive value.\n\
                     → Recommended: 10000 rows for most databases."
                        .to_string(),
                ));
            }
            if db.max_connections == 0 {
                return Err(Self::invalid(
                    "Database max connections must be greater than 0.\n\
                     → Fix: Set database.max_connections to a positive value.\n\
                     → Recommended: 10 connections for most use cases."
                        .to_string(),
                ));
            }
            if db.sampling.default_sample_size == 0 {
                return Err(Self::invalid(
                    "Database sample size must be greater than 0.\n\
                     → Fix: Set database.sampling.default_sample_size to a positive value.\n\
                     → Recommended: 100000 rows for statistical significance."
                        .to_string(),
                ));
            }
        }
        Ok(())
    }

    /// Wraps `message` in the validation error variant.
    fn invalid(message: String) -> DataProfilerError {
        DataProfilerError::ConfigValidationError { message }
    }

    /// Ensures `value` lies in `0.0..=100.0`; NaN is outside the range and is rejected.
    ///
    /// `label` is the human-readable name that starts the message and `field`
    /// is the key under `quality.iso_thresholds` named in the fix hint.
    fn check_percentage(label: &str, field: &str, value: f64) -> Result<(), DataProfilerError> {
        if (0.0..=100.0).contains(&value) {
            return Ok(());
        }
        Err(Self::invalid(format!(
            "{} must be between 0 and 100, got {}.\n\
             → Fix: Set quality.iso_thresholds.{} to a value between 0.0 and 100.0.",
            label, value, field
        )))
    }
}
/// Fluent builder that accumulates the sections of a `DataprofConfig`.
///
/// The fields mirror `DataprofConfig` one-to-one and are moved into it by
/// `build` (validated) or `build_unchecked` (not validated).
#[derive(Debug, Clone)]
pub struct DataprofConfigBuilder {
// Output section under construction.
output: OutputConfig,
// Quality section under construction.
quality: QualityConfig,
// Engine section under construction.
engine: EngineConfig,
// Database section under construction (only with the `database` feature).
#[cfg(feature = "database")]
database: Option<DatabaseSettings>,
}
impl DataprofConfigBuilder {
pub fn new() -> Self {
Self {
output: OutputConfig::default(),
quality: QualityConfig::default(),
engine: EngineConfig::default(),
#[cfg(feature = "database")]
database: Some(DatabaseSettings::default()),
}
}
pub fn output_format(mut self, format: &str) -> Self {
self.output.default_format = format.to_string();
self
}
pub fn colored(mut self, enabled: bool) -> Self {
self.output.colored = enabled;
self
}
pub fn verbosity(mut self, level: u8) -> Self {
self.output.verbosity = level;
self
}
pub fn show_progress(mut self, enabled: bool) -> Self {
self.output.show_progress = enabled;
self
}
pub fn quality_enabled(mut self, enabled: bool) -> Self {
self.quality.enabled = enabled;
self
}
pub fn iso_quality_thresholds(mut self, thresholds: IsoQualityConfig) -> Self {
self.quality.iso_thresholds = thresholds;
self
}
pub fn iso_quality_profile_strict(mut self) -> Self {
self.quality.iso_thresholds = IsoQualityConfig::strict();
self
}
pub fn iso_quality_profile_lenient(mut self) -> Self {
self.quality.iso_thresholds = IsoQualityConfig::lenient();
self
}
pub fn engine(mut self, engine: &str) -> Self {
self.engine.default_engine = engine.to_string();
self
}
pub fn chunk_size(mut self, size: usize) -> Self {
self.engine.default_chunk_size = Some(size);
self
}
pub fn parallel(mut self, enabled: bool) -> Self {
self.engine.parallel = enabled;
self
}
pub fn max_concurrent(mut self, max: usize) -> Self {
self.engine.max_concurrent = max;
self
}
pub fn max_memory_mb(mut self, mb: usize) -> Self {
self.engine.memory.max_usage_mb = mb;
self
}
pub fn auto_streaming_threshold_mb(mut self, mb: f64) -> Self {
self.engine.memory.auto_streaming_threshold_mb = mb;
self
}
#[cfg(feature = "database")]
pub fn db_connection_timeout(mut self, seconds: u64) -> Self {
if let Some(ref mut db) = self.database {
db.connection_timeout = seconds;
}
self
}
#[cfg(feature = "database")]
pub fn db_batch_size(mut self, size: usize) -> Self {
if let Some(ref mut db) = self.database {
db.batch_size = size;
}
self
}
#[cfg(feature = "database")]
pub fn db_sampling_enabled(mut self, enabled: bool) -> Self {
if let Some(ref mut db) = self.database {
db.sampling.enabled = enabled;
}
self
}
pub fn ci_preset() -> Self {
Self::new()
.colored(false)
.show_progress(false)
.output_format("json")
.verbosity(2)
}
pub fn interactive_preset() -> Self {
Self::new()
.colored(true)
.show_progress(true)
.output_format("text")
.verbosity(1)
}
pub fn production_quality_preset() -> Self {
Self::new()
.iso_quality_profile_strict()
.quality_enabled(true)
.max_memory_mb(512) }
pub fn build(self) -> Result<DataprofConfig, DataProfilerError> {
let config = DataprofConfig {
output: self.output,
quality: self.quality,
engine: self.engine,
#[cfg(feature = "database")]
database: self.database,
};
config.validate()?;
Ok(config)
}
pub fn build_unchecked(self) -> DataprofConfig {
DataprofConfig {
output: self.output,
quality: self.quality,
engine: self.engine,
#[cfg(feature = "database")]
database: self.database,
}
}
}
impl Default for DataprofConfigBuilder {
fn default() -> Self {
Self::new()
}
}