use crate::types::{ColumnProfile, ColumnStats, DataType};
use crate::analysis::inference::infer_type;
use crate::analysis::patterns::detect_patterns;
use crate::stats::{calculate_datetime_stats, calculate_numeric_stats, calculate_text_stats};
/// Profiles a single column with every analysis step enabled.
///
/// Forwards to `analyze_column_with_options` with fast mode switched off, so the
/// returned profile carries detected patterns and a distinct-value count.
pub fn analyze_column(name: &str, data: &[String]) -> ColumnProfile {
    let fast_mode = false;
    analyze_column_with_options(name, data, fast_mode)
}
/// Profiles a single column in fast mode.
///
/// Forwards to `analyze_column_with_options` with fast mode switched on, which
/// leaves `patterns` empty and `unique_count` as `None`.
pub fn analyze_column_fast(name: &str, data: &[String]) -> ColumnProfile {
    let fast_mode = true;
    analyze_column_with_options(name, data, fast_mode)
}
fn analyze_column_with_options(name: &str, data: &[String], fast_mode: bool) -> ColumnProfile {
let total_count = data.len();
let null_count = data.iter().filter(|s| s.trim().is_empty()).count();
let data_type = infer_type(data);
let stats = match data_type {
DataType::Integer | DataType::Float => calculate_numeric_stats(data),
DataType::Date => calculate_datetime_stats(data),
DataType::Boolean => {
let non_null: Vec<&String> = data.iter().filter(|s| !s.trim().is_empty()).collect();
let tc = non_null
.iter()
.filter(|v| {
let t = v.trim();
t.eq_ignore_ascii_case("true") || t.eq_ignore_ascii_case("yes")
})
.count();
let fc = non_null
.iter()
.filter(|v| {
let t = v.trim();
t.eq_ignore_ascii_case("false") || t.eq_ignore_ascii_case("no")
})
.count();
let total = tc + fc;
let true_ratio = if total > 0 {
tc as f64 / total as f64
} else {
0.0
};
ColumnStats::Boolean(crate::types::BooleanStats {
true_count: tc,
false_count: fc,
true_ratio,
})
}
DataType::String => calculate_text_stats(data),
};
let patterns = if fast_mode {
Vec::new()
} else {
detect_patterns(data, None)
};
let unique_count = if fast_mode {
None } else {
Some(
data.iter()
.filter(|s| !s.trim().is_empty())
.collect::<std::collections::HashSet<_>>()
.len(),
)
};
ColumnProfile {
name: name.to_string(),
data_type,
null_count,
total_count,
unique_count,
stats,
patterns,
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Turns a list of string literals into the owned cells the analyzer expects.
    fn column(values: &[&str]) -> Vec<String> {
        values.iter().map(|value| value.to_string()).collect()
    }

    #[test]
    fn test_analyze_column_basic() {
        let cells = column(&["1", "2", "3"]);
        let profile = analyze_column("test_col", &cells);

        assert_eq!(profile.name, "test_col");
        assert!(matches!(profile.data_type, DataType::Integer));
        assert_eq!(profile.total_count, 3);
        assert_eq!(profile.null_count, 0);
        assert_eq!(profile.unique_count, Some(3));
    }

    #[test]
    fn test_analyze_column_with_nulls() {
        let cells = column(&["1", "", "3", ""]);
        let profile = analyze_column("test_col", &cells);

        assert_eq!(profile.total_count, 4);
        assert_eq!(profile.null_count, 2);
        // Empty cells are excluded from the distinct-value count.
        assert_eq!(profile.unique_count, Some(2));
    }

    #[test]
    fn test_analyze_column_whitespace_as_null() {
        let cells = column(&["1", " ", "3", "\t"]);
        let profile = analyze_column("test_col", &cells);

        assert_eq!(profile.total_count, 4);
        // Whitespace-only cells are counted as null and are not distinct values.
        assert_eq!(profile.null_count, 2);
        assert_eq!(profile.unique_count, Some(2));
        assert!(matches!(profile.data_type, DataType::Integer));
    }

    #[test]
    fn test_analyze_column_with_whitespace_values() {
        let cells = column(&[" 1 ", " 2", "3 "]);
        let profile = analyze_column("test_col", &cells);

        // Padded values are real values, not nulls.
        assert_eq!(profile.null_count, 0);
        assert!(matches!(profile.data_type, DataType::Integer));
    }

    #[test]
    fn test_analyze_column_fast_mode() {
        let cells = column(&["user@example.com", "admin@test.org", "contact@company.com"]);
        let profile = analyze_column_fast("test_col", &cells);

        // Fast mode skips both pattern detection and the unique count.
        assert_eq!(profile.patterns.len(), 0);
        assert_eq!(profile.unique_count, None);
    }

    #[test]
    fn test_analyze_column_normal_mode() {
        let cells = column(&["user@example.com", "admin@test.org", "contact@company.com"]);
        let profile = analyze_column("test_col", &cells);

        // Normal mode runs pattern detection and counts distinct values.
        assert!(!profile.patterns.is_empty());
        assert_eq!(profile.unique_count, Some(3));
    }

    #[test]
    fn test_analyze_column_empty_data() {
        let cells: Vec<String> = Vec::new();
        let profile = analyze_column("test_col", &cells);

        assert_eq!(profile.total_count, 0);
        assert_eq!(profile.null_count, 0);
        assert_eq!(profile.unique_count, Some(0));
    }

    #[test]
    fn test_analyze_column_all_null() {
        let cells = column(&["", " ", "\t"]);
        let profile = analyze_column("test_col", &cells);

        assert_eq!(profile.total_count, 3);
        assert_eq!(profile.null_count, 3);
        assert_eq!(profile.unique_count, Some(0));
        assert!(matches!(profile.data_type, DataType::String));
    }

    #[test]
    fn test_analyze_column_float_detection() {
        let cells = column(&["1.5", "2.3", "3.7"]);
        let profile = analyze_column("test_col", &cells);

        assert!(matches!(profile.data_type, DataType::Float));
        assert_eq!(profile.null_count, 0);
    }

    #[test]
    fn test_analyze_column_date_detection() {
        let cells = column(&["2023-01-15", "2023-02-20", "2023-03-25"]);
        let profile = analyze_column("test_col", &cells);

        assert!(matches!(profile.data_type, DataType::Date));
    }

    #[test]
    fn test_analyze_column_unique_count_consistency() {
        let cells = column(&["value1", "value2", " ", "value1", "\t"]);
        let profile = analyze_column("test_col", &cells);

        // Two whitespace-only cells are null; the duplicate "value1" counts once.
        assert_eq!(profile.null_count, 2);
        assert_eq!(profile.unique_count, Some(2));
    }
}