1use crate::data::type_inference::{InferredType, TypeInference};
2use serde_json::Value;
3use std::collections::HashMap;
4
5pub struct DataAnalyzer {
8 column_stats: HashMap<String, ColumnStatistics>,
10
11 column_widths: Vec<usize>,
13}
14
15#[derive(Debug, Clone)]
17pub struct ColumnStatistics {
18 pub column_name: String,
19 pub data_type: ColumnType,
20 pub total_values: usize,
21 pub non_null_values: usize,
22 pub null_values: usize,
23 pub unique_values: usize,
24 pub min_value: Option<String>,
25 pub max_value: Option<String>,
26 pub avg_value: Option<f64>,
27 pub sum_value: Option<f64>,
28 pub median_value: Option<f64>,
29 pub min_length: Option<usize>,
30 pub max_length: Option<usize>,
31 pub frequency_map: Option<std::collections::BTreeMap<String, usize>>,
32}
33
34#[derive(Debug, Clone, PartialEq, Eq, Hash)]
36pub enum ColumnType {
37 String,
38 Integer,
39 Float,
40 Boolean,
41 Date,
42 Mixed,
43 Unknown,
44}
45
46impl Default for DataAnalyzer {
47 fn default() -> Self {
48 Self::new()
49 }
50}
51
52impl DataAnalyzer {
53 #[must_use]
54 pub fn new() -> Self {
55 Self {
56 column_stats: HashMap::new(),
57 column_widths: Vec::new(),
58 }
59 }
60
61 pub fn calculate_column_statistics(
63 &mut self,
64 column_name: &str,
65 values: &[&str],
66 ) -> ColumnStatistics {
67 let mut stats = ColumnStatistics {
68 column_name: column_name.to_string(),
69 data_type: ColumnType::Unknown,
70 total_values: values.len(),
71 non_null_values: 0,
72 null_values: 0,
73 unique_values: 0,
74 min_value: None,
75 max_value: None,
76 avg_value: None,
77 sum_value: None,
78 median_value: None,
79 min_length: None,
80 max_length: None,
81 frequency_map: None,
82 };
83
84 if values.is_empty() {
85 return stats;
86 }
87
88 let mut unique = std::collections::HashSet::new();
90 let mut numeric_values = Vec::new();
91 let mut min_str: Option<&str> = None;
92 let mut max_str: Option<&str> = None;
93 let mut lengths = Vec::new();
94
95 for value in values {
96 if value.is_empty() {
97 stats.null_values += 1;
98 } else {
99 stats.non_null_values += 1;
100
101 unique.insert(*value);
103 lengths.push(value.len());
104
105 match min_str {
107 None => min_str = Some(value),
108 Some(min) if value < &min => min_str = Some(value),
109 _ => {}
110 }
111 match max_str {
112 None => max_str = Some(value),
113 Some(max) if value > &max => max_str = Some(value),
114 _ => {}
115 }
116
117 if let Ok(num) = value.parse::<f64>() {
119 numeric_values.push(num);
120 }
121 }
122 }
123
124 stats.unique_values = unique.len();
125
126 stats.data_type = self.detect_column_type(values);
128
129 match stats.data_type {
131 ColumnType::Integer | ColumnType::Float => {
132 if !numeric_values.is_empty() {
133 let sum: f64 = numeric_values.iter().sum();
134 stats.sum_value = Some(sum);
135 stats.avg_value = Some(sum / numeric_values.len() as f64);
136
137 numeric_values
139 .sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
140 let mid = numeric_values.len() / 2;
141 stats.median_value = if numeric_values.len() % 2 == 0 {
142 Some(f64::midpoint(numeric_values[mid - 1], numeric_values[mid]))
143 } else {
144 Some(numeric_values[mid])
145 };
146
147 let min = numeric_values.iter().copied().fold(f64::INFINITY, f64::min);
148 let max = numeric_values
149 .iter()
150 .copied()
151 .fold(f64::NEG_INFINITY, f64::max);
152 stats.min_value = Some(min.to_string());
153 stats.max_value = Some(max.to_string());
154 }
155 }
156 _ => {
157 stats.min_value = min_str.map(std::string::ToString::to_string);
159 stats.max_value = max_str.map(std::string::ToString::to_string);
160 }
161 }
162
163 const MAX_VALUES_TO_SHOW: usize = 40;
165 if stats.unique_values > 0 {
166 let mut freq_map = std::collections::HashMap::new();
168 for value in values {
169 if !value.is_empty() {
170 *freq_map.entry((*value).to_string()).or_insert(0) += 1;
171 }
172 }
173
174 let mut freq_vec: Vec<(String, usize)> = freq_map.into_iter().collect();
176 freq_vec.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
177
178 let mut display_map = std::collections::BTreeMap::new();
180 for (value, count) in freq_vec.into_iter().take(MAX_VALUES_TO_SHOW) {
181 display_map.insert(value, count);
182 }
183
184 stats.frequency_map = Some(display_map);
185 }
186
187 if !lengths.is_empty() {
189 stats.min_length = lengths.iter().min().copied();
190 stats.max_length = lengths.iter().max().copied();
191 }
192
193 self.column_stats
195 .insert(column_name.to_string(), stats.clone());
196
197 stats
198 }
199
200 #[must_use]
202 pub fn detect_column_type(&self, values: &[&str]) -> ColumnType {
203 if values.is_empty() {
204 return ColumnType::Unknown;
205 }
206
207 let mut type_counts = HashMap::new();
208
209 let first_type = self.detect_single_value_type(values[0]);
212 let mut all_same = true;
213
214 for (i, value) in values.iter().filter(|v| !v.is_empty()).enumerate() {
215 let detected_type = self.detect_single_value_type(value);
216
217 if i < 10 && detected_type != first_type {
218 all_same = false;
219 }
220
221 *type_counts.entry(detected_type).or_insert(0) += 1;
222
223 if i > 100 && type_counts.len() > 1 && !all_same {
225 break;
226 }
227 }
228
229 if type_counts.len() > 1 {
231 let total: usize = type_counts.values().sum();
233 for (col_type, count) in &type_counts {
234 if *count as f64 / total as f64 > 0.9 {
235 return col_type.clone();
236 }
237 }
238 ColumnType::Mixed
239 } else if let Some((col_type, _)) = type_counts.into_iter().next() {
240 col_type
241 } else {
242 ColumnType::Unknown
243 }
244 }
245
246 fn detect_single_value_type(&self, value: &str) -> ColumnType {
248 match TypeInference::infer_from_string(value) {
250 InferredType::Null => ColumnType::Unknown,
251 InferredType::Boolean => ColumnType::Boolean,
252 InferredType::Integer => ColumnType::Integer,
253 InferredType::Float => ColumnType::Float,
254 InferredType::DateTime => ColumnType::Date,
255 InferredType::String => ColumnType::String,
256 }
257 }
258
259 fn looks_like_date_fast(&self, value: &str) -> bool {
261 TypeInference::looks_like_datetime(value)
262 }
263
264 fn looks_like_date(&self, value: &str) -> bool {
266 TypeInference::looks_like_datetime(value)
267 }
268
269 pub fn calculate_optimal_column_widths(
271 &mut self,
272 data: &[Value],
273 max_sample_rows: usize,
274 ) -> Vec<usize> {
275 if data.is_empty() {
276 return Vec::new();
277 }
278
279 let headers: Vec<String> = if let Some(first_row) = data.first() {
281 if let Some(obj) = first_row.as_object() {
282 obj.keys().map(std::string::ToString::to_string).collect()
283 } else {
284 return Vec::new();
285 }
286 } else {
287 return Vec::new();
288 };
289
290 let mut widths = vec![0; headers.len()];
291
292 for (i, header) in headers.iter().enumerate() {
294 widths[i] = header.len();
295 }
296
297 let total_rows = data.len();
299 let rows_to_check: Vec<usize> = if total_rows <= max_sample_rows {
300 (0..total_rows).collect()
301 } else {
302 let step = total_rows / max_sample_rows;
304 (0..max_sample_rows)
305 .map(|i| (i * step).min(total_rows - 1))
306 .collect()
307 };
308
309 for &row_idx in &rows_to_check {
311 if let Some(row) = data.get(row_idx) {
312 if let Some(obj) = row.as_object() {
313 for (i, header) in headers.iter().enumerate() {
314 if let Some(value) = obj.get(header) {
315 let display_len = self.get_value_display_length(value);
316 widths[i] = widths[i].max(display_len);
317 }
318 }
319 }
320 }
321 }
322
323 for width in &mut widths {
325 *width = (*width).min(50).max(3); }
327
328 self.column_widths = widths.clone();
329 widths
330 }
331
332 fn get_value_display_length(&self, value: &Value) -> usize {
334 match value {
335 Value::String(s) => s.len(),
336 Value::Number(n) => n.to_string().len(),
337 Value::Bool(b) => b.to_string().len(),
338 Value::Null => 4, Value::Array(a) => format!("[{} items]", a.len()).len(),
340 Value::Object(o) => format!("{{{} fields}}", o.len()).len(),
341 }
342 }
343
344 #[must_use]
346 pub fn get_column_statistics(&self, column_name: &str) -> Option<&ColumnStatistics> {
347 self.column_stats.get(column_name)
348 }
349
350 #[must_use]
352 pub fn get_column_widths(&self) -> &[usize] {
353 &self.column_widths
354 }
355
356 pub fn clear_cache(&mut self) {
358 self.column_stats.clear();
359 self.column_widths.clear();
360 }
361}
362
363#[cfg(test)]
364mod tests {
365 use super::*;
366 use serde_json::json;
367
368 #[test]
369 fn test_column_type_detection() {
370 let analyzer = DataAnalyzer::new();
371
372 let int_values = vec!["1", "2", "3", "4", "5"];
374 assert_eq!(
375 analyzer.detect_column_type(&int_values),
376 ColumnType::Integer
377 );
378
379 let float_values = vec!["1.5", "2.7", "3.14", "4.0", "5.5"];
381 assert_eq!(
382 analyzer.detect_column_type(&float_values),
383 ColumnType::Float
384 );
385
386 let string_values = vec!["alice", "bob", "charlie", "david"];
388 assert_eq!(
389 analyzer.detect_column_type(&string_values),
390 ColumnType::String
391 );
392
393 let bool_values = vec!["true", "false", "TRUE", "FALSE"];
395 assert_eq!(
396 analyzer.detect_column_type(&bool_values),
397 ColumnType::Boolean
398 );
399 }
400
401 #[test]
402 fn test_column_statistics() {
403 let mut analyzer = DataAnalyzer::new();
404
405 let values = vec!["10", "20", "30", "40", "50", ""];
406
407 let stats = analyzer.calculate_column_statistics("test_column", &values);
408
409 assert_eq!(stats.total_values, 6);
410 assert_eq!(stats.non_null_values, 5);
411 assert_eq!(stats.null_values, 1);
412 assert_eq!(stats.unique_values, 5);
413 assert_eq!(stats.data_type, ColumnType::Integer);
414 assert_eq!(stats.avg_value, Some(30.0));
415 assert_eq!(stats.sum_value, Some(150.0));
416 assert_eq!(stats.median_value, Some(30.0));
417 assert_eq!(stats.min_value, Some("10".to_string()));
418 assert_eq!(stats.max_value, Some("50".to_string()));
419 assert!(stats.frequency_map.is_some());
420 }
421
422 #[test]
423 fn test_optimal_column_widths() {
424 let mut analyzer = DataAnalyzer::new();
425
426 let data = vec![
427 json!({"name": "Alice", "age": 30, "city": "New York"}),
428 json!({"name": "Bob", "age": 25, "city": "Los Angeles"}),
429 json!({"name": "Charlie", "age": 35, "city": "SF"}),
430 ];
431
432 let widths = analyzer.calculate_optimal_column_widths(&data, 100);
433
434 assert_eq!(widths.len(), 3);
435 assert!(widths[2] >= 7); assert!(widths[0] >= 3); assert!(widths[1] >= 11); }
441}