use super::utils::{DATE_FORMAT_REGEXES, is_likely_date_column, is_valid_date_format};
use crate::core::errors::DataProfilerError;
use crate::types::{ColumnProfile, DataType};
use std::collections::HashMap;
/// Consistency measurements for one tabular dataset, as produced by
/// `ConsistencyCalculator::calculate`.
#[derive(Debug)]
pub(crate) struct ConsistencyMetrics {
/// Percentage (0.0 to 100.0) of non-empty values that conform to the data
/// type recorded in their column's profile.
pub data_type_consistency: f64,
/// Number of format violations counted across all columns (mixed date
/// formats plus decimal-separator inconsistencies).
pub format_violations: usize,
/// Number of encoding problems counted across all values (replacement
/// characters plus values flagged by the artifact check).
pub encoding_issues: usize,
}
/// Stateless namespace for the consistency computations; all functionality is
/// exposed through associated functions, so the type is never instantiated here.
pub(crate) struct ConsistencyCalculator;
impl ConsistencyCalculator {
/// Runs every consistency pass over `data` and bundles the results.
///
/// * `data` - column name mapped to that column's raw string values.
/// * `column_profiles` - per-column profiles supplying the expected data type.
///
/// # Errors
///
/// Forwards the first error returned by any of the three passes.
pub fn calculate(
    data: &HashMap<String, Vec<String>>,
    column_profiles: &[ColumnProfile],
) -> Result<ConsistencyMetrics, DataProfilerError> {
    // Struct-literal fields are evaluated in the order written, so the passes
    // run as: type consistency, format violations, encoding issues.
    Ok(ConsistencyMetrics {
        data_type_consistency: Self::calculate_type_consistency(data, column_profiles)?,
        format_violations: Self::count_format_violations(data)?,
        encoding_issues: Self::detect_encoding_issues(data)?,
    })
}
/// Computes the percentage of non-empty values that conform to the data type
/// recorded in their column's profile.
///
/// Profiles whose column name is missing from `data` are skipped, and empty
/// strings are not counted at all. A value conforms when:
///
/// * `Integer` - it parses as `i64`;
/// * `Float` - it parses as `f64`;
/// * `Date` - `is_valid_date_format` accepts it;
/// * `Boolean` - it is exactly `True`, `False`, `true` or `false`;
/// * `String` - always.
///
/// Returns a value between `0.0` and `100.0`; when there are no non-empty
/// values to inspect the result is `100.0`.
fn calculate_type_consistency(
    data: &HashMap<String, Vec<String>>,
    column_profiles: &[ColumnProfile],
) -> Result<f64, DataProfilerError> {
    // Explicit `usize`: an unannotated `0` that is only incremented and cast
    // falls back to `i32`, which could overflow on very large datasets.
    let mut total_values: usize = 0;
    let mut consistent_values: usize = 0;

    for profile in column_profiles {
        if let Some(column_data) = data.get(&profile.name) {
            for value in column_data.iter().filter(|v| !v.is_empty()) {
                total_values += 1;

                let is_consistent = match profile.data_type {
                    DataType::Integer => value.parse::<i64>().is_ok(),
                    DataType::Float => value.parse::<f64>().is_ok(),
                    DataType::Date => is_valid_date_format(value),
                    DataType::Boolean => {
                        matches!(value.as_str(), "True" | "False" | "true" | "false")
                    }
                    DataType::String => true,
                };

                if is_consistent {
                    consistent_values += 1;
                }
            }
        }
    }

    // Nothing to judge means nothing inconsistent.
    if total_values == 0 {
        return Ok(100.0);
    }

    Ok((consistent_values as f64 / total_values as f64) * 100.0)
}
/// Sums the format violations of every column in `data`.
///
/// Each column contributes its mixed-date-format count (which takes the
/// column name into account) plus its decimal-separator violation count.
/// This implementation always returns `Ok`.
fn count_format_violations(
    data: &HashMap<String, Vec<String>>,
) -> Result<usize, DataProfilerError> {
    let total: usize = data
        .iter()
        .map(|(name, column)| {
            Self::count_mixed_date_formats(name, column)
                + Self::count_other_format_violations(column)
        })
        .sum();

    Ok(total)
}
/// Counts sampled values whose date format differs from the column's most
/// common date format.
///
/// Returns `0` immediately unless `is_likely_date_column` accepts
/// `column_name`. Otherwise the first 50 values that are non-empty after
/// trimming are each classified by the first entry of `DATE_FORMAT_REGEXES`
/// that matches the trimmed text; values matching no entry are ignored. The
/// result is the number of classified values outside the dominant format,
/// which is `0` when at most one format was seen.
fn count_mixed_date_formats(column_name: &str, values: &[String]) -> usize {
    const SAMPLE_SIZE: usize = 50;

    if !is_likely_date_column(column_name) {
        return 0;
    }

    let mut format_counts: HashMap<_, usize> = HashMap::new();

    // Lazily pick the sample instead of materialising every non-empty value
    // of the column just to look at the first few.
    let sample = values
        .iter()
        .map(|raw| raw.trim())
        .filter(|trimmed| !trimmed.is_empty())
        .take(SAMPLE_SIZE);

    for trimmed in sample {
        for (format_name, regex) in DATE_FORMAT_REGEXES.iter() {
            if regex.is_match(trimmed) {
                *format_counts.entry(*format_name).or_insert(0) += 1;
                // First matching format wins; do not double-count a value.
                break;
            }
        }
    }

    // With zero or one distinct format the total equals the maximum, so the
    // difference is already 0 and needs no special case.
    let classified: usize = format_counts.values().sum();
    let dominant = format_counts.values().copied().max().unwrap_or(0);
    classified - dominant
}
/// Counts decimal-separator inconsistencies within one column.
///
/// Empty strings are skipped. The heuristic looks only at the presence of
/// `.` and `,` and does not check that a value is numeric:
///
/// * a value containing both `.` and `,` is one violation on its own;
/// * otherwise, values with exactly one `.` and values with exactly one `,`
///   are tallied separately, and the smaller tally is added as violations
///   (the minority separator style in the column).
fn count_other_format_violations(values: &[String]) -> usize {
    let mut both_separators: usize = 0;
    let mut single_dot: usize = 0;
    let mut single_comma: usize = 0;

    for entry in values.iter().filter(|v| !v.is_empty()) {
        match (entry.contains('.'), entry.contains(',')) {
            (true, true) => both_separators += 1,
            (true, false) => {
                if entry.matches('.').count() == 1 {
                    single_dot += 1;
                }
            }
            (false, true) => {
                if entry.matches(',').count() == 1 {
                    single_comma += 1;
                }
            }
            (false, false) => {}
        }
    }

    // `min` is 0 whenever one style is absent, so no explicit guard is needed.
    both_separators + single_dot.min(single_comma)
}
/// Counts encoding problems across every value of every column.
///
/// A value adds one issue if it contains the Unicode replacement character
/// (U+FFFD) and one more if `has_encoding_artifacts` flags it, so a single
/// value can contribute up to two. This implementation always returns `Ok`.
fn detect_encoding_issues(
    data: &HashMap<String, Vec<String>>,
) -> Result<usize, DataProfilerError> {
    let issues: usize = data
        .values()
        .flatten()
        .map(|cell| {
            let has_replacement_char = cell.contains('\u{FFFD}');
            let has_artifacts = Self::has_encoding_artifacts(cell);
            usize::from(has_replacement_char) + usize::from(has_artifacts)
        })
        .sum();

    Ok(issues)
}
/// Reports whether `text` contains a common mojibake sequence: the two
/// characters that appear when the UTF-8 bytes of an accented Latin letter
/// are decoded as Latin-1 / Windows-1252.
///
/// Only the garbled two-character forms are matched. Correctly encoded
/// letters such as U+00E9 (e-acute) are valid text and must not be reported.
/// The sequences are written as escapes so the source stays unambiguous
/// however the file itself is re-encoded.
fn has_encoding_artifacts(text: &str) -> bool {
    const MOJIBAKE_SEQUENCES: [&str; 7] = [
        "\u{C3}\u{A1}", // garbled U+00E1 (a-acute)
        "\u{C3}\u{A9}", // garbled U+00E9 (e-acute)
        "\u{C3}\u{AD}", // garbled U+00ED (i-acute); second char is a soft hyphen
        "\u{C3}\u{B3}", // garbled U+00F3 (o-acute)
        "\u{C3}\u{BA}", // garbled U+00FA (u-acute)
        "\u{C3}\u{B1}", // garbled U+00F1 (n-tilde)
        "\u{C3}\u{A7}", // garbled U+00E7 (c-cedilla)
    ];

    MOJIBAKE_SEQUENCES
        .iter()
        .any(|sequence| text.contains(*sequence))
}
}