mod accuracy;
mod completeness;
mod consistency;
mod timeliness;
mod uniqueness;
mod utils;
pub use utils::{StatisticalValidation, validate_sample_size};
use accuracy::AccuracyCalculator;
use completeness::CompletenessCalculator;
use consistency::ConsistencyCalculator;
use timeliness::TimelinessCalculator;
use uniqueness::UniquenessCalculator;
use crate::core::config::IsoQualityConfig;
use crate::core::errors::DataProfilerError;
use crate::types::{
AccuracyMetrics, ColumnProfile, CompletenessMetrics, ConsistencyMetrics, QualityDimension,
QualityMetrics, TimelinessMetrics, UniquenessMetrics,
};
use std::collections::HashMap;
/// Computes data-quality metrics for a dataset, producing one optional section
/// per quality dimension: completeness, consistency, uniqueness, accuracy and
/// timeliness.
pub struct MetricsCalculator {
/// Threshold configuration handed by reference to the per-dimension
/// calculators that are constructed with one (completeness, uniqueness,
/// accuracy and timeliness).
pub thresholds: IsoQualityConfig,
}
impl Default for MetricsCalculator {
fn default() -> Self {
Self::new()
}
}
impl MetricsCalculator {
pub fn new() -> Self {
Self {
thresholds: IsoQualityConfig::default(),
}
}
pub fn with_thresholds(thresholds: IsoQualityConfig) -> Self {
Self { thresholds }
}
pub fn strict() -> Self {
Self {
thresholds: IsoQualityConfig::strict(),
}
}
pub fn lenient() -> Self {
Self {
thresholds: IsoQualityConfig::lenient(),
}
}
/// Validates `sample_size` for the given `metric_type`.
///
/// Thin associated-function wrapper: both arguments are forwarded unchanged to
/// `utils::validate_sample_size`, and its [`StatisticalValidation`] result is
/// returned as-is.
pub fn validate_sample_size(sample_size: usize, metric_type: &str) -> StatisticalValidation {
utils::validate_sample_size(sample_size, metric_type)
}
/// Reports whether the dimension `dim` should be computed.
///
/// `None` means no filter was supplied, so every dimension counts as
/// requested; with `Some(dims)` the dimension must appear in the slice.
fn is_requested(requested: &Option<&[QualityDimension]>, dim: QualityDimension) -> bool {
    requested.map_or(true, |dims| dims.contains(&dim))
}
pub fn calculate_comprehensive_metrics(
&self,
data: &HashMap<String, Vec<String>>,
column_profiles: &[ColumnProfile],
requested_dimensions: Option<&[QualityDimension]>,
) -> Result<QualityMetrics, DataProfilerError> {
if data.is_empty() {
return Ok(Self::default_metrics_for_empty_dataset(
&requested_dimensions,
));
}
let sample_size = Self::calculate_sample_size(data)?;
let requested = &requested_dimensions;
let validation = Self::validate_sample_size(sample_size, "general");
if !validation.sufficient_sample {
eprintln!(
"Warning: Sample size ({}) is below recommended minimum ({}) for reliable statistics",
validation.actual_sample_size, validation.min_sample_size
);
}
let completeness = if Self::is_requested(requested, QualityDimension::Completeness) {
let c = CompletenessCalculator::new(&self.thresholds).calculate(
data,
column_profiles,
sample_size,
)?;
Some(CompletenessMetrics {
missing_values_ratio: c.missing_values_ratio,
complete_records_ratio: c.complete_records_ratio,
null_columns: c.null_columns,
})
} else {
None
};
let consistency = if Self::is_requested(requested, QualityDimension::Consistency) {
let c = ConsistencyCalculator::calculate(data, column_profiles)?;
Some(ConsistencyMetrics {
data_type_consistency: c.data_type_consistency,
format_violations: c.format_violations,
encoding_issues: c.encoding_issues,
})
} else {
None
};
let uniqueness = if Self::is_requested(requested, QualityDimension::Uniqueness) {
let u = UniquenessCalculator::new(&self.thresholds).calculate(
data,
column_profiles,
sample_size,
)?;
Some(UniquenessMetrics {
duplicate_rows: u.duplicate_rows,
key_uniqueness: u.key_uniqueness,
high_cardinality_warning: u.high_cardinality_warning,
})
} else {
None
};
let accuracy = if Self::is_requested(requested, QualityDimension::Accuracy) {
let a = AccuracyCalculator::new(&self.thresholds).calculate(data, column_profiles)?;
Some(AccuracyMetrics {
outlier_ratio: a.outlier_ratio,
range_violations: a.range_violations,
negative_values_in_positive: a.negative_values_in_positive,
})
} else {
None
};
let timeliness = if Self::is_requested(requested, QualityDimension::Timeliness) {
let t = TimelinessCalculator::new(&self.thresholds).calculate(data, column_profiles)?;
Some(TimelinessMetrics {
future_dates_count: t.future_dates_count,
stale_data_ratio: t.stale_data_ratio,
temporal_violations: t.temporal_violations,
})
} else {
None
};
Ok(QualityMetrics {
completeness,
consistency,
uniqueness,
accuracy,
timeliness,
})
}
fn default_metrics_for_empty_dataset(
requested: &Option<&[QualityDimension]>,
) -> QualityMetrics {
let is_req = |d| match requested {
None => true,
Some(dims) => dims.contains(&d),
};
QualityMetrics {
completeness: if is_req(QualityDimension::Completeness) {
Some(CompletenessMetrics {
missing_values_ratio: 0.0,
complete_records_ratio: 100.0,
null_columns: vec![],
})
} else {
None
},
consistency: if is_req(QualityDimension::Consistency) {
Some(ConsistencyMetrics {
data_type_consistency: 100.0,
format_violations: 0,
encoding_issues: 0,
})
} else {
None
},
uniqueness: if is_req(QualityDimension::Uniqueness) {
Some(UniquenessMetrics {
duplicate_rows: 0,
key_uniqueness: 100.0,
high_cardinality_warning: false,
})
} else {
None
},
accuracy: if is_req(QualityDimension::Accuracy) {
Some(AccuracyMetrics {
outlier_ratio: 0.0,
range_violations: 0,
negative_values_in_positive: 0,
})
} else {
None
},
timeliness: if is_req(QualityDimension::Timeliness) {
Some(TimelinessMetrics {
future_dates_count: 0,
stale_data_ratio: 0.0,
temporal_violations: 0,
})
} else {
None
},
}
}
pub fn calculate_bifurcated_metrics(
&self,
data: &HashMap<String, Vec<String>>,
column_profiles: &[ColumnProfile],
requested_dimensions: Option<&[QualityDimension]>,
) -> Result<BifurcatedResult, DataProfilerError> {
if data.is_empty() && column_profiles.is_empty() {
return Ok(BifurcatedResult {
metrics: Self::default_metrics_for_empty_dataset(&requested_dimensions),
exact_dimensions: vec![],
sampled_dimensions: vec![],
sample_size: 0,
});
}
let total_rows = column_profiles.first().map(|p| p.total_count).unwrap_or(0);
let sample_rows = Self::calculate_sample_size(data).unwrap_or(0);
let requested = &requested_dimensions;
let mut exact_dimensions = Vec::new();
let mut sampled_dimensions = Vec::new();
let completeness = if Self::is_requested(requested, QualityDimension::Completeness) {
let c = CompletenessCalculator::new(&self.thresholds)
.calculate_from_profiles(column_profiles)?;
exact_dimensions.push("completeness".to_string());
Some(CompletenessMetrics {
missing_values_ratio: c.missing_values_ratio,
complete_records_ratio: c.complete_records_ratio,
null_columns: c.null_columns,
})
} else {
None
};
let consistency = if Self::is_requested(requested, QualityDimension::Consistency) {
let c = if !data.is_empty() {
ConsistencyCalculator::calculate(data, column_profiles)?
} else {
consistency::ConsistencyMetrics {
data_type_consistency: 100.0,
format_violations: 0,
encoding_issues: 0,
}
};
sampled_dimensions.push("consistency".to_string());
Some(ConsistencyMetrics {
data_type_consistency: c.data_type_consistency,
format_violations: c.format_violations,
encoding_issues: c.encoding_issues,
})
} else {
None
};
let uniqueness = if Self::is_requested(requested, QualityDimension::Uniqueness) {
let u = UniquenessCalculator::new(&self.thresholds).calculate(
data,
column_profiles,
total_rows,
)?;
exact_dimensions.push("key_uniqueness".to_string());
sampled_dimensions.push("duplicate_rows".to_string());
Some(UniquenessMetrics {
duplicate_rows: u.duplicate_rows,
key_uniqueness: u.key_uniqueness,
high_cardinality_warning: u.high_cardinality_warning,
})
} else {
None
};
let accuracy = if Self::is_requested(requested, QualityDimension::Accuracy) {
let a = if !data.is_empty() {
AccuracyCalculator::new(&self.thresholds).calculate(data, column_profiles)?
} else {
accuracy::AccuracyMetrics {
outlier_ratio: 0.0,
range_violations: 0,
negative_values_in_positive: 0,
}
};
sampled_dimensions.push("accuracy".to_string());
Some(AccuracyMetrics {
outlier_ratio: a.outlier_ratio,
range_violations: a.range_violations,
negative_values_in_positive: a.negative_values_in_positive,
})
} else {
None
};
let timeliness = if Self::is_requested(requested, QualityDimension::Timeliness) {
let t = if !data.is_empty() {
TimelinessCalculator::new(&self.thresholds).calculate(data, column_profiles)?
} else {
timeliness::TimelinessMetrics {
future_dates_count: 0,
stale_data_ratio: 0.0,
temporal_violations: 0,
}
};
sampled_dimensions.push("timeliness".to_string());
Some(TimelinessMetrics {
future_dates_count: t.future_dates_count,
stale_data_ratio: t.stale_data_ratio,
temporal_violations: t.temporal_violations,
})
} else {
None
};
let metrics = QualityMetrics {
completeness,
consistency,
uniqueness,
accuracy,
timeliness,
};
Ok(BifurcatedResult {
metrics,
exact_dimensions,
sampled_dimensions,
sample_size: sample_rows,
})
}
/// Returns the number of values in the longest column of `data`.
///
/// # Errors
/// Returns `DataProfilerError::MetricsCalculationError` when `data` contains
/// no columns at all.
fn calculate_sample_size(
    data: &HashMap<String, Vec<String>>,
) -> Result<usize, DataProfilerError> {
    let longest_column = data.values().map(Vec::len).max();

    match longest_column {
        Some(rows) => Ok(rows),
        None => Err(DataProfilerError::MetricsCalculationError {
            message: String::from("No data columns found"),
        }),
    }
}
}
/// Result of `MetricsCalculator::calculate_bifurcated_metrics`: the computed
/// metrics plus labels saying which of them were recorded as exact and which
/// as sampled.
pub struct BifurcatedResult {
/// The computed quality metrics; dimensions that were not requested are `None`.
pub metrics: QualityMetrics,
/// Labels recorded as exact. The calculator pushes `"completeness"` and
/// `"key_uniqueness"` here (presumably because they derive from full-dataset
/// column profiles — confirm with callers).
pub exact_dimensions: Vec<String>,
/// Labels recorded as sampled. The calculator pushes `"consistency"`,
/// `"duplicate_rows"`, `"accuracy"` and `"timeliness"` here.
pub sampled_dimensions: Vec<String>,
/// Length of the longest column in the data map passed to the calculator,
/// or `0` when that map had no columns.
pub sample_size: usize,
}