use crate::data::type_inference::{InferredType, TypeInference};
use serde_json::Value;
use std::collections::HashMap;
/// Computes per-column statistics and display widths, caching the most recent results.
pub struct DataAnalyzer {
/// Statistics from `calculate_column_statistics`, keyed by column name.
column_stats: HashMap<String, ColumnStatistics>,
/// Widths produced by the latest `calculate_optimal_column_widths` call.
column_widths: Vec<usize>,
}
/// Summary statistics for a single column of string cells.
///
/// Empty strings are counted as nulls; every other cell is a value.
#[derive(Debug, Clone)]
pub struct ColumnStatistics {
/// Name of the column these statistics describe.
pub column_name: String,
/// Type detected for the column as a whole.
pub data_type: ColumnType,
/// Number of cells examined, nulls included.
pub total_values: usize,
/// Number of non-empty cells.
pub non_null_values: usize,
/// Number of empty cells.
pub null_values: usize,
/// Number of distinct non-empty cells.
pub unique_values: usize,
/// Smallest value: numeric minimum for `Integer`/`Float` columns, otherwise
/// the lexicographically smallest non-empty cell.
pub min_value: Option<String>,
/// Largest value, using the same ordering as `min_value`.
pub max_value: Option<String>,
/// Arithmetic mean; only set for `Integer`/`Float` columns.
pub avg_value: Option<f64>,
/// Sum of the numeric cells; only set for `Integer`/`Float` columns.
pub sum_value: Option<f64>,
/// Median of the numeric cells; only set for `Integer`/`Float` columns.
pub median_value: Option<f64>,
/// Length (`str::len`, i.e. bytes) of the shortest non-empty cell.
pub min_length: Option<usize>,
/// Length (`str::len`, i.e. bytes) of the longest non-empty cell.
pub max_length: Option<usize>,
/// Occurrence counts for the most frequent distinct values (capped by the analyzer).
pub frequency_map: Option<std::collections::BTreeMap<String, usize>>,
}
/// Type assigned to a cell or to a whole column by the analyzer.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ColumnType {
/// Free-form text.
String,
/// Whole numbers.
Integer,
/// Numbers with a fractional part.
Float,
/// Boolean literals.
Boolean,
/// Values inferred as date/time.
Date,
/// Several types were seen and none exceeded the dominance threshold.
Mixed,
/// Nothing could be inferred (no non-empty cells, or a null-like cell).
Unknown,
}
impl Default for DataAnalyzer {
fn default() -> Self {
Self::new()
}
}
impl DataAnalyzer {
/// Creates an analyzer whose statistics and width caches are both empty.
#[must_use]
pub fn new() -> Self {
    let column_stats = HashMap::new();
    let column_widths = Vec::new();
    Self {
        column_stats,
        column_widths,
    }
}
pub fn calculate_column_statistics(
&mut self,
column_name: &str,
values: &[&str],
) -> ColumnStatistics {
let mut stats = ColumnStatistics {
column_name: column_name.to_string(),
data_type: ColumnType::Unknown,
total_values: values.len(),
non_null_values: 0,
null_values: 0,
unique_values: 0,
min_value: None,
max_value: None,
avg_value: None,
sum_value: None,
median_value: None,
min_length: None,
max_length: None,
frequency_map: None,
};
if values.is_empty() {
return stats;
}
let mut unique = std::collections::HashSet::new();
let mut numeric_values = Vec::new();
let mut min_str: Option<&str> = None;
let mut max_str: Option<&str> = None;
let mut lengths = Vec::new();
for value in values {
if value.is_empty() {
stats.null_values += 1;
} else {
stats.non_null_values += 1;
unique.insert(*value);
lengths.push(value.len());
match min_str {
None => min_str = Some(value),
Some(min) if value < &min => min_str = Some(value),
_ => {}
}
match max_str {
None => max_str = Some(value),
Some(max) if value > &max => max_str = Some(value),
_ => {}
}
if let Ok(num) = value.parse::<f64>() {
numeric_values.push(num);
}
}
}
stats.unique_values = unique.len();
stats.data_type = self.detect_column_type(values);
match stats.data_type {
ColumnType::Integer | ColumnType::Float => {
if !numeric_values.is_empty() {
let sum: f64 = numeric_values.iter().sum();
stats.sum_value = Some(sum);
stats.avg_value = Some(sum / numeric_values.len() as f64);
numeric_values
.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let mid = numeric_values.len() / 2;
stats.median_value = if numeric_values.len() % 2 == 0 {
Some(f64::midpoint(numeric_values[mid - 1], numeric_values[mid]))
} else {
Some(numeric_values[mid])
};
let min = numeric_values.iter().copied().fold(f64::INFINITY, f64::min);
let max = numeric_values
.iter()
.copied()
.fold(f64::NEG_INFINITY, f64::max);
stats.min_value = Some(min.to_string());
stats.max_value = Some(max.to_string());
}
}
_ => {
stats.min_value = min_str.map(std::string::ToString::to_string);
stats.max_value = max_str.map(std::string::ToString::to_string);
}
}
const MAX_VALUES_TO_SHOW: usize = 40;
if stats.unique_values > 0 {
let mut freq_map = std::collections::HashMap::new();
for value in values {
if !value.is_empty() {
*freq_map.entry((*value).to_string()).or_insert(0) += 1;
}
}
let mut freq_vec: Vec<(String, usize)> = freq_map.into_iter().collect();
freq_vec.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
let mut display_map = std::collections::BTreeMap::new();
for (value, count) in freq_vec.into_iter().take(MAX_VALUES_TO_SHOW) {
display_map.insert(value, count);
}
stats.frequency_map = Some(display_map);
}
if !lengths.is_empty() {
stats.min_length = lengths.iter().min().copied();
stats.max_length = lengths.iter().max().copied();
}
self.column_stats
.insert(column_name.to_string(), stats.clone());
stats
}
/// Infers the dominant type of a column from its raw string cells.
///
/// Empty cells are ignored. If only one type is seen, that type is returned.
/// If several types are seen, the one that accounts for more than 90% of the
/// inspected cells wins; otherwise the column is `Mixed`. A column with no
/// non-empty cells is `Unknown`.
///
/// Scanning stops early once more than 100 non-empty cells have been
/// inspected, more than one type has been seen, and the leading cells were
/// not all of the same type. In that case the result is based on the cells
/// inspected so far.
#[must_use]
pub fn detect_column_type(&self, values: &[&str]) -> ColumnType {
    /// Number of leading non-empty cells compared against the first one.
    const HOMOGENEITY_PROBE: usize = 10;
    /// Index after which a visibly heterogeneous column stops being scanned.
    const EARLY_EXIT_INDEX: usize = 100;
    /// Share of cells a single type must exceed to win over `Mixed`.
    const DOMINANCE_THRESHOLD: f64 = 0.9;

    let mut type_counts: HashMap<ColumnType, usize> = HashMap::new();
    // The baseline is the first NON-EMPTY cell. Taking it from `values[0]`
    // would make a leading null always disable the homogeneity check.
    let mut first_type: Option<ColumnType> = None;
    let mut all_same = true;

    for (i, value) in values.iter().filter(|v| !v.is_empty()).enumerate() {
        let detected_type = self.detect_single_value_type(value);
        let baseline = first_type.get_or_insert_with(|| detected_type.clone());
        if i < HOMOGENEITY_PROBE && detected_type != *baseline {
            all_same = false;
        }
        *type_counts.entry(detected_type).or_insert(0) += 1;
        if i > EARLY_EXIT_INDEX && type_counts.len() > 1 && !all_same {
            break;
        }
    }

    let total: usize = type_counts.values().sum();
    match type_counts.len() {
        0 => ColumnType::Unknown,
        1 => type_counts
            .into_keys()
            .next()
            .unwrap_or(ColumnType::Unknown),
        // At most one type can exceed the threshold, so map iteration order
        // does not affect the result.
        _ => type_counts
            .into_iter()
            .find(|(_, count)| *count as f64 / total as f64 > DOMINANCE_THRESHOLD)
            .map_or(ColumnType::Mixed, |(col_type, _)| col_type),
    }
}
/// Maps the type inferred for one cell onto the analyzer's `ColumnType`.
///
/// A null inference becomes `Unknown` and a date-time inference becomes `Date`.
fn detect_single_value_type(&self, value: &str) -> ColumnType {
    let inferred = TypeInference::infer_from_string(value);
    match inferred {
        InferredType::String => ColumnType::String,
        InferredType::Integer => ColumnType::Integer,
        InferredType::Float => ColumnType::Float,
        InferredType::Boolean => ColumnType::Boolean,
        InferredType::DateTime => ColumnType::Date,
        InferredType::Null => ColumnType::Unknown,
    }
}
/// Reports whether `candidate` looks like a date/time value.
///
/// Thin wrapper over `TypeInference::looks_like_datetime`.
// NOTE(review): no caller is visible in this file, and the body is identical
// to `looks_like_date` — confirm whether both helpers are still needed.
fn looks_like_date_fast(&self, candidate: &str) -> bool {
    TypeInference::looks_like_datetime(candidate)
}
fn looks_like_date(&self, value: &str) -> bool {
TypeInference::looks_like_datetime(value)
}
pub fn calculate_optimal_column_widths(
&mut self,
data: &[Value],
max_sample_rows: usize,
) -> Vec<usize> {
if data.is_empty() {
return Vec::new();
}
let headers: Vec<String> = if let Some(first_row) = data.first() {
if let Some(obj) = first_row.as_object() {
obj.keys().map(std::string::ToString::to_string).collect()
} else {
return Vec::new();
}
} else {
return Vec::new();
};
let mut widths = vec![0; headers.len()];
for (i, header) in headers.iter().enumerate() {
widths[i] = header.len();
}
let total_rows = data.len();
let rows_to_check: Vec<usize> = if total_rows <= max_sample_rows {
(0..total_rows).collect()
} else {
let step = total_rows / max_sample_rows;
(0..max_sample_rows)
.map(|i| (i * step).min(total_rows - 1))
.collect()
};
for &row_idx in &rows_to_check {
if let Some(row) = data.get(row_idx) {
if let Some(obj) = row.as_object() {
for (i, header) in headers.iter().enumerate() {
if let Some(value) = obj.get(header) {
let display_len = self.get_value_display_length(value);
widths[i] = widths[i].max(display_len);
}
}
}
}
}
for width in &mut widths {
*width = (*width).min(50).max(3); }
self.column_widths = widths.clone();
widths
}
/// Returns the length used for `value` when sizing a column.
///
/// Strings and numbers use the byte length of their text (`String::len`).
/// Arrays and objects use the length of a short summary such as
/// `[3 items]` or `{2 fields}`.
fn get_value_display_length(&self, value: &Value) -> usize {
    match value {
        // Presumably the length of the literal `null`.
        Value::Null => 4,
        // Same lengths as the rendered "true" / "false".
        Value::Bool(flag) => {
            if *flag {
                4
            } else {
                5
            }
        }
        Value::Number(number) => number.to_string().len(),
        Value::String(text) => text.len(),
        Value::Array(items) => {
            let summary = format!("[{} items]", items.len());
            summary.len()
        }
        Value::Object(fields) => {
            let summary = format!("{{{} fields}}", fields.len());
            summary.len()
        }
    }
}
/// Looks up the cached statistics for `column_name`.
///
/// Returns `None` when no statistics are cached under that name.
#[must_use]
pub fn get_column_statistics(&self, column_name: &str) -> Option<&ColumnStatistics> {
    let cache = &self.column_stats;
    cache.get(column_name)
}
/// Returns the cached column widths as a slice; empty if none are cached.
#[must_use]
pub fn get_column_widths(&self) -> &[usize] {
    self.column_widths.as_slice()
}
/// Drops all cached column statistics and column widths.
pub fn clear_cache(&mut self) {
    self.column_widths.clear();
    self.column_stats.clear();
}
}
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
/// Homogeneous columns are detected as their single element type.
#[test]
fn test_column_type_detection() {
    let analyzer = DataAnalyzer::new();
    let expect_type = |cells: &[&str], expected: ColumnType| {
        assert_eq!(analyzer.detect_column_type(cells), expected);
    };

    expect_type(&["1", "2", "3", "4", "5"], ColumnType::Integer);
    expect_type(&["1.5", "2.7", "3.14", "4.0", "5.5"], ColumnType::Float);
    expect_type(&["alice", "bob", "charlie", "david"], ColumnType::String);
    expect_type(&["true", "false", "TRUE", "FALSE"], ColumnType::Boolean);
}
/// Statistics for an integer column that contains one empty (null) cell.
#[test]
fn test_column_statistics() {
    let cells = ["10", "20", "30", "40", "50", ""];
    let mut analyzer = DataAnalyzer::new();
    let summary = analyzer.calculate_column_statistics("test_column", &cells);

    // Cell accounting: the empty string is the only null.
    assert_eq!(summary.total_values, 6);
    assert_eq!(summary.non_null_values, 5);
    assert_eq!(summary.null_values, 1);
    assert_eq!(summary.unique_values, 5);

    // Numeric aggregates.
    assert_eq!(summary.data_type, ColumnType::Integer);
    assert_eq!(summary.avg_value, Some(30.0));
    assert_eq!(summary.sum_value, Some(150.0));
    assert_eq!(summary.median_value, Some(30.0));
    assert_eq!(summary.min_value.as_deref(), Some("10"));
    assert_eq!(summary.max_value.as_deref(), Some("50"));

    assert!(summary.frequency_map.is_some());
}
/// Every column is at least as wide as its header and its longest cell.
#[test]
fn test_optimal_column_widths() {
    let mut analyzer = DataAnalyzer::new();
    let data = vec![
        json!({"name": "Alice", "age": 30, "city": "New York"}),
        json!({"name": "Bob", "age": 25, "city": "Los Angeles"}),
        json!({"name": "Charlie", "age": 35, "city": "SF"}),
    ];
    let widths = analyzer.calculate_optimal_column_widths(&data, 100);
    assert_eq!(widths.len(), 3);

    // Widths follow the key order of the first row, which depends on how
    // `serde_json` is configured (sorted vs. insertion order). Columns are
    // looked up by header name so the test does not rely on that order.
    let headers: Vec<&str> = data[0]
        .as_object()
        .expect("first row is a JSON object")
        .keys()
        .map(String::as_str)
        .collect();
    let width_of = |column: &str| {
        let index = headers
            .iter()
            .position(|header| *header == column)
            .expect("column is present in the first row");
        widths[index]
    };

    assert!(width_of("name") >= 7); // "Charlie"
    assert!(width_of("age") >= 3); // header "age"
    assert!(width_of("city") >= 11); // "Los Angeles"
}
}