use crate::analysis::patterns::looks_like_date;
use crate::core::streaming_stats::{StreamingColumnCollection, StreamingStatistics};
use crate::types::{ColumnProfile, ColumnStats, DataType, TextStats};
use std::collections::HashMap;
/// Everything `build_column_profile` needs to assemble the profile of one column.
///
/// The lifetime `'a` ties the borrowed sample slice and locale string to their owner.
pub struct ColumnProfileInput<'a> {
/// Column name, moved as-is into the resulting profile.
pub name: String,
/// Type of the column; selects which statistics are computed and is copied into the profile.
pub data_type: DataType,
/// Total count for the column, copied into the profile unchanged.
pub total_count: usize,
/// Null count for the column, copied into the profile unchanged.
pub null_count: usize,
/// Unique-value count, if known; copied into the profile unchanged.
pub unique_count: Option<usize>,
/// Sampled values handed to the statistics calculators and to pattern detection.
pub sample_values: &'a [String],
/// Precomputed length aggregates. For `String` columns they take precedence over the
/// samples; for `Date` columns they are only consulted when `sample_values` is empty.
pub text_lengths: Option<TextLengths>,
/// Precomputed `(true_count, false_count)` pair for `Boolean` columns. When `None`, the
/// counts are taken from `sample_values` instead.
pub boolean_counts: Option<(usize, usize)>,
/// When `true`, no statistics are computed and the profile carries `ColumnStats::None`.
pub skip_statistics: bool,
/// When `true`, pattern detection is not run and the profile's pattern list is empty.
pub skip_patterns: bool,
/// Optional locale forwarded to `crate::detect_patterns`.
pub locale: Option<&'a str>,
}
/// Precomputed length aggregates for the values of a text column.
///
/// The struct is plain data (two `usize` and one `f64`), so it derives `Debug`, `Clone`,
/// `Copy` and `PartialEq` to be printable in diagnostics, cheap to pass by value and
/// comparable in tests. `Eq` is intentionally not derived because of the `f64` field.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct TextLengths {
    /// Smallest observed value length.
    pub min_length: usize,
    /// Largest observed value length.
    pub max_length: usize,
    /// Average observed value length.
    pub avg_length: f64,
}
pub fn build_column_profile(input: ColumnProfileInput<'_>) -> ColumnProfile {
let stats = if input.skip_statistics {
ColumnStats::None
} else {
match input.data_type {
DataType::Integer | DataType::Float => {
crate::stats::numeric::calculate_numeric_stats(input.sample_values)
}
DataType::Date => {
if !input.sample_values.is_empty() {
crate::stats::datetime::calculate_datetime_stats(input.sample_values)
} else if let Some(tl) = &input.text_lengths {
ColumnStats::Text(TextStats::from_lengths(
tl.min_length,
tl.max_length,
tl.avg_length,
))
} else {
ColumnStats::DateTime(crate::types::DateTimeStats::empty())
}
}
DataType::Boolean => {
let (true_count, false_count) = input.boolean_counts.unwrap_or_else(|| {
let tc = input
.sample_values
.iter()
.filter(|v| {
let t = v.trim();
t.eq_ignore_ascii_case("true") || t.eq_ignore_ascii_case("yes")
})
.count();
let fc = input
.sample_values
.iter()
.filter(|v| {
let t = v.trim();
t.eq_ignore_ascii_case("false") || t.eq_ignore_ascii_case("no")
})
.count();
(tc, fc)
});
let total = true_count + false_count;
let true_ratio = if total > 0 {
true_count as f64 / total as f64
} else {
0.0
};
ColumnStats::Boolean(crate::types::BooleanStats {
true_count,
false_count,
true_ratio,
})
}
DataType::String => {
if let Some(tl) = &input.text_lengths {
ColumnStats::Text(TextStats::from_lengths(
tl.min_length,
tl.max_length,
tl.avg_length,
))
} else {
crate::stats::text::calculate_text_stats(input.sample_values)
}
}
}
};
let patterns = if input.skip_patterns {
Vec::new()
} else {
crate::detect_patterns(input.sample_values, input.locale)
};
ColumnProfile {
name: input.name,
data_type: input.data_type,
null_count: input.null_count,
total_count: input.total_count,
unique_count: input.unique_count,
stats,
patterns,
}
}
/// Builds one [`ColumnProfile`] per column tracked by `column_stats`.
///
/// Columns are visited in the order reported by `column_names()`. A name for which
/// `get_column_stats` returns `None` is skipped. The skip flags and locale are passed
/// through to [`profile_from_stats`] for every column.
pub fn profiles_from_streaming(
    column_stats: &StreamingColumnCollection,
    skip_statistics: bool,
    skip_patterns: bool,
    locale: Option<&str>,
) -> Vec<ColumnProfile> {
    column_stats
        .column_names()
        .into_iter()
        .filter_map(|column_name| {
            let stats = column_stats.get_column_stats(&column_name)?;
            Some(profile_from_stats(
                &column_name,
                stats,
                skip_statistics,
                skip_patterns,
                locale,
            ))
        })
        .collect()
}
/// Turns the streaming statistics of one column into a [`ColumnProfile`].
///
/// The column type comes from [`infer_data_type_streaming`]. Counts, the unique count and
/// the samples are read from `stats`. Text lengths are always supplied from
/// `stats.text_length_stats()`. Boolean counts are left unset, so
/// [`build_column_profile`] derives them from the samples when the column is boolean.
pub fn profile_from_stats(
    name: &str,
    stats: &StreamingStatistics,
    skip_statistics: bool,
    skip_patterns: bool,
    locale: Option<&str>,
) -> ColumnProfile {
    let inferred_type = infer_data_type_streaming(stats);

    let lengths = stats.text_length_stats();
    let text_lengths = TextLengths {
        min_length: lengths.min_length,
        max_length: lengths.max_length,
        avg_length: lengths.avg_length,
    };

    let input = ColumnProfileInput {
        name: String::from(name),
        data_type: inferred_type,
        total_count: stats.count,
        null_count: stats.null_count,
        unique_count: Some(stats.unique_count()),
        sample_values: stats.sample_values(),
        text_lengths: Some(text_lengths),
        boolean_counts: None,
        skip_statistics,
        skip_patterns,
        locale,
    };
    build_column_profile(input)
}
/// Infers a column's [`DataType`] from its streaming statistics and sampled values.
///
/// Checks run in this order, and the first match wins:
///
/// 1. Numeric, only when both `stats.min` and `stats.max` are finite. Among samples that
///    are not the empty string: `Integer` if every one parses as `i64`, else `Float` if
///    more than 80% parse as `f64`.
/// 2. `Date`, when more than 70% of the first 100 non-blank samples satisfy
///    `looks_like_date`.
/// 3. `Boolean`, when at least 90% of the non-blank samples, once trimmed, are exactly one
///    of the accepted true/false/yes/no spellings (lower, capitalised or upper case).
/// 4. `String` otherwise, including when there are no non-blank samples.
///
/// Note that the numeric pass only drops empty strings, while the date and boolean passes
/// drop whitespace-only strings as well.
pub fn infer_data_type_streaming(stats: &StreamingStatistics) -> DataType {
    // Exact spellings accepted as boolean values; other casings do not match.
    const BOOLEAN_LITERALS: [&str; 12] = [
        "true", "false", "True", "False", "TRUE", "FALSE", "yes", "no", "Yes", "No", "YES", "NO",
    ];
    // Upper bound on how many samples are probed for date-likeness.
    const DATE_PROBE_LIMIT: usize = 100;

    let samples = stats.sample_values();

    if stats.min.is_finite() && stats.max.is_finite() {
        // Re-runnable view over the samples that are not the empty string.
        let present = || samples.iter().filter(|value| !value.is_empty());
        let present_total = present().count();
        if present_total > 0 {
            if present().all(|value| value.parse::<i64>().is_ok()) {
                return DataType::Integer;
            }
            let parseable = present()
                .filter(|value| value.parse::<f64>().is_ok())
                .count();
            if parseable as f64 / present_total as f64 > 0.8 {
                return DataType::Float;
            }
        }
    }

    // Re-runnable view over the samples that contain something besides whitespace.
    let filled = || samples.iter().filter(|value| !value.trim().is_empty());
    let filled_total = filled().count();
    if filled_total == 0 {
        return DataType::String;
    }

    let probed = filled_total.min(DATE_PROBE_LIMIT);
    let date_like = filled()
        .take(DATE_PROBE_LIMIT)
        .filter(|value| looks_like_date(value))
        .count();
    if date_like as f64 / probed as f64 > 0.7 {
        return DataType::Date;
    }

    let boolean_like = filled()
        .filter(|value| {
            let text = value.trim();
            BOOLEAN_LITERALS.iter().any(|literal| *literal == text)
        })
        .count();
    if boolean_like as f64 / filled_total as f64 >= 0.9 {
        return DataType::Boolean;
    }

    DataType::String
}
/// Collects an owned copy of every column's sampled values, keyed by column name.
///
/// A name reported by `column_names()` for which `get_column_stats` returns `None` gets
/// no entry in the map.
pub fn quality_check_samples(
    column_stats: &StreamingColumnCollection,
) -> HashMap<String, Vec<String>> {
    column_stats
        .column_names()
        .into_iter()
        .filter_map(|column_name| {
            let stats = column_stats.get_column_stats(&column_name)?;
            // Copy the samples before the name is moved into the map entry.
            let values = stats.sample_values().to_vec();
            Some((column_name, values))
        })
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::streaming_stats::StreamingColumnCollection;

    /// Owned `String` copies of `values`, in the same order.
    fn owned(values: &[&str]) -> Vec<String> {
        values.iter().map(|value| value.to_string()).collect()
    }

    /// Baseline input: every sample counted, no nulls, no precomputed lengths or boolean
    /// counts, nothing skipped and no locale. Tests override single fields as needed.
    fn plain_input<'a>(
        name: &str,
        data_type: DataType,
        samples: &'a [String],
        unique: usize,
    ) -> ColumnProfileInput<'a> {
        ColumnProfileInput {
            name: name.to_string(),
            data_type,
            total_count: samples.len(),
            null_count: 0,
            unique_count: Some(unique),
            sample_values: samples,
            text_lengths: None,
            boolean_counts: None,
            skip_statistics: false,
            skip_patterns: false,
            locale: None,
        }
    }

    /// Asserts that `profile` carries boolean statistics with two trues and one false.
    fn assert_two_true_one_false(profile: &ColumnProfile) {
        match &profile.stats {
            ColumnStats::Boolean(b) => {
                assert_eq!(b.true_count, 2);
                assert_eq!(b.false_count, 1);
                assert!((b.true_ratio - 2.0 / 3.0).abs() < 0.001);
            }
            other => panic!("expected Boolean stats, got {:?}", other),
        }
    }

    #[test]
    fn test_profiles_from_streaming() {
        let mut collection = StreamingColumnCollection::new();
        let headers = owned(&["name", "age"]);
        for row in [["Alice", "30"], ["Bob", "25"], ["Charlie", "35"]] {
            collection.process_record(&headers, owned(&row));
        }

        let profiles = profiles_from_streaming(&collection, false, false, None);
        assert_eq!(profiles.len(), 2);

        let age = profiles.iter().find(|p| p.name == "age").unwrap();
        assert_eq!(age.data_type, DataType::Integer);
        assert_eq!(age.total_count, 3);
    }

    #[test]
    fn test_quality_check_samples() {
        let mut collection = StreamingColumnCollection::new();
        let headers = owned(&["col"]);
        for value in ["val1", "val2"] {
            collection.process_record(&headers, owned(&[value]));
        }

        let samples = quality_check_samples(&collection);
        assert!(samples.contains_key("col"));
        assert_eq!(samples["col"].len(), 2);
    }

    #[test]
    fn test_boolean_stats_with_counts() {
        let samples = owned(&["True", "False", "True"]);
        let profile = build_column_profile(ColumnProfileInput {
            boolean_counts: Some((2, 1)),
            ..plain_input("flag", DataType::Boolean, &samples, 2)
        });
        assert_two_true_one_false(&profile);
    }

    #[test]
    fn test_boolean_stats_fallback_case_insensitive() {
        // No precomputed counts: mixed case and padded values must still be counted.
        let samples = owned(&["true", "FALSE", " True "]);
        let profile = build_column_profile(plain_input("flag", DataType::Boolean, &samples, 2));
        assert_two_true_one_false(&profile);
    }

    #[test]
    fn test_skip_statistics() {
        let samples = owned(&["10", "20", "30"]);
        let profile = build_column_profile(ColumnProfileInput {
            skip_statistics: true,
            ..plain_input("num", DataType::Integer, &samples, 3)
        });
        assert!(matches!(profile.stats, ColumnStats::None));
        assert_eq!(profile.data_type, DataType::Integer);
    }

    #[test]
    fn test_skip_patterns() {
        let samples = owned(&["hello", "world"]);
        let profile = build_column_profile(ColumnProfileInput {
            skip_patterns: true,
            ..plain_input("text", DataType::String, &samples, 2)
        });
        assert!(profile.patterns.is_empty());
        assert!(matches!(profile.stats, ColumnStats::Text(_)));
    }

    #[test]
    fn test_all_packs_default() {
        let samples = owned(&["42", "99"]);
        let profile = build_column_profile(plain_input("val", DataType::Integer, &samples, 2));
        assert!(matches!(profile.stats, ColumnStats::Numeric(_)));
        assert_eq!(profile.data_type, DataType::Integer);
    }
}