1use crate::data::type_inference::{InferredType, TypeInference};
2use serde_json::Value;
3use std::collections::HashMap;
4
5pub struct DataAnalyzer {
8 column_stats: HashMap<String, ColumnStatistics>,
10
11 column_widths: Vec<usize>,
13}
14
15#[derive(Debug, Clone)]
17pub struct ColumnStatistics {
18 pub column_name: String,
19 pub data_type: ColumnType,
20 pub total_values: usize,
21 pub non_null_values: usize,
22 pub null_values: usize,
23 pub unique_values: usize,
24 pub min_value: Option<String>,
25 pub max_value: Option<String>,
26 pub avg_value: Option<f64>,
27 pub sum_value: Option<f64>,
28 pub median_value: Option<f64>,
29 pub min_length: Option<usize>,
30 pub max_length: Option<usize>,
31 pub frequency_map: Option<std::collections::BTreeMap<String, usize>>,
32}
33
34#[derive(Debug, Clone, PartialEq, Eq, Hash)]
36pub enum ColumnType {
37 String,
38 Integer,
39 Float,
40 Boolean,
41 Date,
42 Mixed,
43 Unknown,
44}
45
46impl DataAnalyzer {
47 pub fn new() -> Self {
48 Self {
49 column_stats: HashMap::new(),
50 column_widths: Vec::new(),
51 }
52 }
53
54 pub fn calculate_column_statistics(
56 &mut self,
57 column_name: &str,
58 values: &[&str],
59 ) -> ColumnStatistics {
60 let mut stats = ColumnStatistics {
61 column_name: column_name.to_string(),
62 data_type: ColumnType::Unknown,
63 total_values: values.len(),
64 non_null_values: 0,
65 null_values: 0,
66 unique_values: 0,
67 min_value: None,
68 max_value: None,
69 avg_value: None,
70 sum_value: None,
71 median_value: None,
72 min_length: None,
73 max_length: None,
74 frequency_map: None,
75 };
76
77 if values.is_empty() {
78 return stats;
79 }
80
81 let mut unique = std::collections::HashSet::new();
83 let mut numeric_values = Vec::new();
84 let mut min_str: Option<&str> = None;
85 let mut max_str: Option<&str> = None;
86 let mut lengths = Vec::new();
87
88 for value in values {
89 if value.is_empty() {
90 stats.null_values += 1;
91 } else {
92 stats.non_null_values += 1;
93
94 unique.insert(*value);
96 lengths.push(value.len());
97
98 match min_str {
100 None => min_str = Some(value),
101 Some(min) if value < &min => min_str = Some(value),
102 _ => {}
103 }
104 match max_str {
105 None => max_str = Some(value),
106 Some(max) if value > &max => max_str = Some(value),
107 _ => {}
108 }
109
110 if let Ok(num) = value.parse::<f64>() {
112 numeric_values.push(num);
113 }
114 }
115 }
116
117 stats.unique_values = unique.len();
118
119 stats.data_type = self.detect_column_type(values);
121
122 match stats.data_type {
124 ColumnType::Integer | ColumnType::Float => {
125 if !numeric_values.is_empty() {
126 let sum: f64 = numeric_values.iter().sum();
127 stats.sum_value = Some(sum);
128 stats.avg_value = Some(sum / numeric_values.len() as f64);
129
130 numeric_values
132 .sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
133 let mid = numeric_values.len() / 2;
134 stats.median_value = if numeric_values.len() % 2 == 0 {
135 Some((numeric_values[mid - 1] + numeric_values[mid]) / 2.0)
136 } else {
137 Some(numeric_values[mid])
138 };
139
140 let min = numeric_values.iter().cloned().fold(f64::INFINITY, f64::min);
141 let max = numeric_values
142 .iter()
143 .cloned()
144 .fold(f64::NEG_INFINITY, f64::max);
145 stats.min_value = Some(min.to_string());
146 stats.max_value = Some(max.to_string());
147 }
148 }
149 _ => {
150 stats.min_value = min_str.map(|s| s.to_string());
152 stats.max_value = max_str.map(|s| s.to_string());
153 }
154 }
155
156 const MAX_VALUES_TO_SHOW: usize = 40;
158 if stats.unique_values > 0 {
159 let mut freq_map = std::collections::HashMap::new();
161 for value in values {
162 if !value.is_empty() {
163 *freq_map.entry(value.to_string()).or_insert(0) += 1;
164 }
165 }
166
167 let mut freq_vec: Vec<(String, usize)> = freq_map.into_iter().collect();
169 freq_vec.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
170
171 let mut display_map = std::collections::BTreeMap::new();
173 for (value, count) in freq_vec.into_iter().take(MAX_VALUES_TO_SHOW) {
174 display_map.insert(value, count);
175 }
176
177 stats.frequency_map = Some(display_map);
178 }
179
180 if !lengths.is_empty() {
182 stats.min_length = lengths.iter().min().copied();
183 stats.max_length = lengths.iter().max().copied();
184 }
185
186 self.column_stats
188 .insert(column_name.to_string(), stats.clone());
189
190 stats
191 }
192
193 pub fn detect_column_type(&self, values: &[&str]) -> ColumnType {
195 if values.is_empty() {
196 return ColumnType::Unknown;
197 }
198
199 let mut type_counts = HashMap::new();
200
201 let first_type = self.detect_single_value_type(values[0]);
204 let mut all_same = true;
205
206 for (i, value) in values.iter().filter(|v| !v.is_empty()).enumerate() {
207 let detected_type = self.detect_single_value_type(value);
208
209 if i < 10 && detected_type != first_type {
210 all_same = false;
211 }
212
213 *type_counts.entry(detected_type).or_insert(0) += 1;
214
215 if i > 100 && type_counts.len() > 1 && !all_same {
217 break;
218 }
219 }
220
221 if type_counts.len() > 1 {
223 let total: usize = type_counts.values().sum();
225 for (col_type, count) in type_counts.iter() {
226 if *count as f64 / total as f64 > 0.9 {
227 return col_type.clone();
228 }
229 }
230 ColumnType::Mixed
231 } else if let Some((col_type, _)) = type_counts.into_iter().next() {
232 col_type
233 } else {
234 ColumnType::Unknown
235 }
236 }
237
238 fn detect_single_value_type(&self, value: &str) -> ColumnType {
240 match TypeInference::infer_from_string(value) {
242 InferredType::Null => ColumnType::Unknown,
243 InferredType::Boolean => ColumnType::Boolean,
244 InferredType::Integer => ColumnType::Integer,
245 InferredType::Float => ColumnType::Float,
246 InferredType::DateTime => ColumnType::Date,
247 InferredType::String => ColumnType::String,
248 }
249 }
250
251 fn looks_like_date_fast(&self, value: &str) -> bool {
253 TypeInference::looks_like_datetime(value)
254 }
255
256 fn looks_like_date(&self, value: &str) -> bool {
258 TypeInference::looks_like_datetime(value)
259 }
260
261 pub fn calculate_optimal_column_widths(
263 &mut self,
264 data: &[Value],
265 max_sample_rows: usize,
266 ) -> Vec<usize> {
267 if data.is_empty() {
268 return Vec::new();
269 }
270
271 let headers: Vec<String> = if let Some(first_row) = data.first() {
273 if let Some(obj) = first_row.as_object() {
274 obj.keys().map(|k| k.to_string()).collect()
275 } else {
276 return Vec::new();
277 }
278 } else {
279 return Vec::new();
280 };
281
282 let mut widths = vec![0; headers.len()];
283
284 for (i, header) in headers.iter().enumerate() {
286 widths[i] = header.len();
287 }
288
289 let total_rows = data.len();
291 let rows_to_check: Vec<usize> = if total_rows <= max_sample_rows {
292 (0..total_rows).collect()
293 } else {
294 let step = total_rows / max_sample_rows;
296 (0..max_sample_rows)
297 .map(|i| (i * step).min(total_rows - 1))
298 .collect()
299 };
300
301 for &row_idx in &rows_to_check {
303 if let Some(row) = data.get(row_idx) {
304 if let Some(obj) = row.as_object() {
305 for (i, header) in headers.iter().enumerate() {
306 if let Some(value) = obj.get(header) {
307 let display_len = self.get_value_display_length(value);
308 widths[i] = widths[i].max(display_len);
309 }
310 }
311 }
312 }
313 }
314
315 for width in &mut widths {
317 *width = (*width).min(50).max(3); }
319
320 self.column_widths = widths.clone();
321 widths
322 }
323
324 fn get_value_display_length(&self, value: &Value) -> usize {
326 match value {
327 Value::String(s) => s.len(),
328 Value::Number(n) => n.to_string().len(),
329 Value::Bool(b) => b.to_string().len(),
330 Value::Null => 4, Value::Array(a) => format!("[{} items]", a.len()).len(),
332 Value::Object(o) => format!("{{{} fields}}", o.len()).len(),
333 }
334 }
335
336 pub fn get_column_statistics(&self, column_name: &str) -> Option<&ColumnStatistics> {
338 self.column_stats.get(column_name)
339 }
340
341 pub fn get_column_widths(&self) -> &[usize] {
343 &self.column_widths
344 }
345
346 pub fn clear_cache(&mut self) {
348 self.column_stats.clear();
349 self.column_widths.clear();
350 }
351}
352
353#[cfg(test)]
354mod tests {
355 use super::*;
356 use serde_json::json;
357
358 #[test]
359 fn test_column_type_detection() {
360 let analyzer = DataAnalyzer::new();
361
362 let int_values = vec!["1", "2", "3", "4", "5"];
364 assert_eq!(
365 analyzer.detect_column_type(&int_values),
366 ColumnType::Integer
367 );
368
369 let float_values = vec!["1.5", "2.7", "3.14", "4.0", "5.5"];
371 assert_eq!(
372 analyzer.detect_column_type(&float_values),
373 ColumnType::Float
374 );
375
376 let string_values = vec!["alice", "bob", "charlie", "david"];
378 assert_eq!(
379 analyzer.detect_column_type(&string_values),
380 ColumnType::String
381 );
382
383 let bool_values = vec!["true", "false", "TRUE", "FALSE"];
385 assert_eq!(
386 analyzer.detect_column_type(&bool_values),
387 ColumnType::Boolean
388 );
389 }
390
391 #[test]
392 fn test_column_statistics() {
393 let mut analyzer = DataAnalyzer::new();
394
395 let values = vec!["10", "20", "30", "40", "50", ""];
396
397 let stats = analyzer.calculate_column_statistics("test_column", &values);
398
399 assert_eq!(stats.total_values, 6);
400 assert_eq!(stats.non_null_values, 5);
401 assert_eq!(stats.null_values, 1);
402 assert_eq!(stats.unique_values, 5);
403 assert_eq!(stats.data_type, ColumnType::Integer);
404 assert_eq!(stats.avg_value, Some(30.0));
405 assert_eq!(stats.sum_value, Some(150.0));
406 assert_eq!(stats.median_value, Some(30.0));
407 assert_eq!(stats.min_value, Some("10".to_string()));
408 assert_eq!(stats.max_value, Some("50".to_string()));
409 assert!(stats.frequency_map.is_some());
410 }
411
412 #[test]
413 fn test_optimal_column_widths() {
414 let mut analyzer = DataAnalyzer::new();
415
416 let data = vec![
417 json!({"name": "Alice", "age": 30, "city": "New York"}),
418 json!({"name": "Bob", "age": 25, "city": "Los Angeles"}),
419 json!({"name": "Charlie", "age": 35, "city": "SF"}),
420 ];
421
422 let widths = analyzer.calculate_optimal_column_widths(&data, 100);
423
424 assert_eq!(widths.len(), 3);
425 assert!(widths[2] >= 7); assert!(widths[0] >= 3); assert!(widths[1] >= 11); }
431}