sql_cli/data/
data_analyzer.rs

1use crate::data::type_inference::{InferredType, TypeInference};
2use serde_json::Value;
3use std::collections::HashMap;
4
5/// Analyzes data for statistics, column widths, and other metrics
6/// Extracted from the monolithic enhanced_tui.rs
7pub struct DataAnalyzer {
8    /// Cached column statistics
9    column_stats: HashMap<String, ColumnStatistics>,
10
11    /// Cached optimal column widths
12    column_widths: Vec<usize>,
13}
14
15/// Statistics for a single column
16#[derive(Debug, Clone)]
17pub struct ColumnStatistics {
18    pub column_name: String,
19    pub data_type: ColumnType,
20    pub total_values: usize,
21    pub non_null_values: usize,
22    pub null_values: usize,
23    pub unique_values: usize,
24    pub min_value: Option<String>,
25    pub max_value: Option<String>,
26    pub avg_value: Option<f64>,
27    pub sum_value: Option<f64>,
28    pub median_value: Option<f64>,
29    pub min_length: Option<usize>,
30    pub max_length: Option<usize>,
31    pub frequency_map: Option<std::collections::BTreeMap<String, usize>>,
32}
33
34/// Detected column type
35#[derive(Debug, Clone, PartialEq, Eq, Hash)]
36pub enum ColumnType {
37    String,
38    Integer,
39    Float,
40    Boolean,
41    Date,
42    Mixed,
43    Unknown,
44}
45
46impl DataAnalyzer {
47    pub fn new() -> Self {
48        Self {
49            column_stats: HashMap::new(),
50            column_widths: Vec::new(),
51        }
52    }
53
54    /// Calculate statistics for a specific column
55    pub fn calculate_column_statistics(
56        &mut self,
57        column_name: &str,
58        values: &[&str],
59    ) -> ColumnStatistics {
60        let mut stats = ColumnStatistics {
61            column_name: column_name.to_string(),
62            data_type: ColumnType::Unknown,
63            total_values: values.len(),
64            non_null_values: 0,
65            null_values: 0,
66            unique_values: 0,
67            min_value: None,
68            max_value: None,
69            avg_value: None,
70            sum_value: None,
71            median_value: None,
72            min_length: None,
73            max_length: None,
74            frequency_map: None,
75        };
76
77        if values.is_empty() {
78            return stats;
79        }
80
81        // Collect unique values more efficiently - use references when possible
82        let mut unique = std::collections::HashSet::new();
83        let mut numeric_values = Vec::new();
84        let mut min_str: Option<&str> = None;
85        let mut max_str: Option<&str> = None;
86        let mut lengths = Vec::new();
87
88        for value in values {
89            if value.is_empty() {
90                stats.null_values += 1;
91            } else {
92                stats.non_null_values += 1;
93
94                // Use string reference for unique count
95                unique.insert(*value);
96                lengths.push(value.len());
97
98                // Track min/max strings without cloning
99                match min_str {
100                    None => min_str = Some(value),
101                    Some(min) if value < &min => min_str = Some(value),
102                    _ => {}
103                }
104                match max_str {
105                    None => max_str = Some(value),
106                    Some(max) if value > &max => max_str = Some(value),
107                    _ => {}
108                }
109
110                // Try to parse as number
111                if let Ok(num) = value.parse::<f64>() {
112                    numeric_values.push(num);
113                }
114            }
115        }
116
117        stats.unique_values = unique.len();
118
119        // Determine data type
120        stats.data_type = self.detect_column_type(values);
121
122        // Calculate type-specific statistics
123        match stats.data_type {
124            ColumnType::Integer | ColumnType::Float => {
125                if !numeric_values.is_empty() {
126                    let sum: f64 = numeric_values.iter().sum();
127                    stats.sum_value = Some(sum);
128                    stats.avg_value = Some(sum / numeric_values.len() as f64);
129
130                    // Calculate median
131                    numeric_values
132                        .sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
133                    let mid = numeric_values.len() / 2;
134                    stats.median_value = if numeric_values.len() % 2 == 0 {
135                        Some((numeric_values[mid - 1] + numeric_values[mid]) / 2.0)
136                    } else {
137                        Some(numeric_values[mid])
138                    };
139
140                    let min = numeric_values.iter().cloned().fold(f64::INFINITY, f64::min);
141                    let max = numeric_values
142                        .iter()
143                        .cloned()
144                        .fold(f64::NEG_INFINITY, f64::max);
145                    stats.min_value = Some(min.to_string());
146                    stats.max_value = Some(max.to_string());
147                }
148            }
149            _ => {
150                // String statistics - use the min/max we already found without cloning
151                stats.min_value = min_str.map(|s| s.to_string());
152                stats.max_value = max_str.map(|s| s.to_string());
153            }
154        }
155
156        // Build frequency map - always show top 40 values even for high cardinality columns
157        const MAX_VALUES_TO_SHOW: usize = 40;
158        if stats.unique_values > 0 {
159            // Count all values first
160            let mut freq_map = std::collections::HashMap::new();
161            for value in values {
162                if !value.is_empty() {
163                    *freq_map.entry(value.to_string()).or_insert(0) += 1;
164                }
165            }
166
167            // Sort by frequency (descending) and take top N
168            let mut freq_vec: Vec<(String, usize)> = freq_map.into_iter().collect();
169            freq_vec.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
170
171            // Take top N values and put them in a BTreeMap for sorted display
172            let mut display_map = std::collections::BTreeMap::new();
173            for (value, count) in freq_vec.into_iter().take(MAX_VALUES_TO_SHOW) {
174                display_map.insert(value, count);
175            }
176
177            stats.frequency_map = Some(display_map);
178        }
179
180        // Length statistics
181        if !lengths.is_empty() {
182            stats.min_length = lengths.iter().min().copied();
183            stats.max_length = lengths.iter().max().copied();
184        }
185
186        // Cache the statistics
187        self.column_stats
188            .insert(column_name.to_string(), stats.clone());
189
190        stats
191    }
192
193    /// Detect the type of a column based on its values
194    pub fn detect_column_type(&self, values: &[&str]) -> ColumnType {
195        if values.is_empty() {
196            return ColumnType::Unknown;
197        }
198
199        let mut type_counts = HashMap::new();
200
201        // Early exit optimization: check first few values
202        // If they're all the same type, check if the rest matches
203        let first_type = self.detect_single_value_type(values[0]);
204        let mut all_same = true;
205
206        for (i, value) in values.iter().filter(|v| !v.is_empty()).enumerate() {
207            let detected_type = self.detect_single_value_type(value);
208
209            if i < 10 && detected_type != first_type {
210                all_same = false;
211            }
212
213            *type_counts.entry(detected_type).or_insert(0) += 1;
214
215            // Early exit if we have enough samples and they're mixed
216            if i > 100 && type_counts.len() > 1 && !all_same {
217                break;
218            }
219        }
220
221        // If we have multiple types, it's mixed
222        if type_counts.len() > 1 {
223            // But if >90% are one type, use that type
224            let total: usize = type_counts.values().sum();
225            for (col_type, count) in type_counts.iter() {
226                if *count as f64 / total as f64 > 0.9 {
227                    return col_type.clone();
228                }
229            }
230            ColumnType::Mixed
231        } else if let Some((col_type, _)) = type_counts.into_iter().next() {
232            col_type
233        } else {
234            ColumnType::Unknown
235        }
236    }
237
238    /// Detect type of a single value
239    fn detect_single_value_type(&self, value: &str) -> ColumnType {
240        // Use the shared type inference logic
241        match TypeInference::infer_from_string(value) {
242            InferredType::Null => ColumnType::Unknown,
243            InferredType::Boolean => ColumnType::Boolean,
244            InferredType::Integer => ColumnType::Integer,
245            InferredType::Float => ColumnType::Float,
246            InferredType::DateTime => ColumnType::Date,
247            InferredType::String => ColumnType::String,
248        }
249    }
250
251    /// Check if a string looks like a date
252    fn looks_like_date_fast(&self, value: &str) -> bool {
253        TypeInference::looks_like_datetime(value)
254    }
255
256    /// Check if a string looks like a date
257    fn looks_like_date(&self, value: &str) -> bool {
258        TypeInference::looks_like_datetime(value)
259    }
260
261    /// Calculate optimal column widths for display
262    pub fn calculate_optimal_column_widths(
263        &mut self,
264        data: &[Value],
265        max_sample_rows: usize,
266    ) -> Vec<usize> {
267        if data.is_empty() {
268            return Vec::new();
269        }
270
271        // Get headers from first row
272        let headers: Vec<String> = if let Some(first_row) = data.first() {
273            if let Some(obj) = first_row.as_object() {
274                obj.keys().map(|k| k.to_string()).collect()
275            } else {
276                return Vec::new();
277            }
278        } else {
279            return Vec::new();
280        };
281
282        let mut widths = vec![0; headers.len()];
283
284        // Start with header widths
285        for (i, header) in headers.iter().enumerate() {
286            widths[i] = header.len();
287        }
288
289        // Sample rows for width calculation
290        let total_rows = data.len();
291        let rows_to_check: Vec<usize> = if total_rows <= max_sample_rows {
292            (0..total_rows).collect()
293        } else {
294            // Sample evenly distributed rows
295            let step = total_rows / max_sample_rows;
296            (0..max_sample_rows)
297                .map(|i| (i * step).min(total_rows - 1))
298                .collect()
299        };
300
301        // Check sampled rows
302        for &row_idx in &rows_to_check {
303            if let Some(row) = data.get(row_idx) {
304                if let Some(obj) = row.as_object() {
305                    for (i, header) in headers.iter().enumerate() {
306                        if let Some(value) = obj.get(header) {
307                            let display_len = self.get_value_display_length(value);
308                            widths[i] = widths[i].max(display_len);
309                        }
310                    }
311                }
312            }
313        }
314
315        // Apply constraints
316        for width in &mut widths {
317            *width = (*width).min(50).max(3); // Min 3, max 50 characters
318        }
319
320        self.column_widths = widths.clone();
321        widths
322    }
323
324    /// Get display length of a JSON value
325    fn get_value_display_length(&self, value: &Value) -> usize {
326        match value {
327            Value::String(s) => s.len(),
328            Value::Number(n) => n.to_string().len(),
329            Value::Bool(b) => b.to_string().len(),
330            Value::Null => 4, // "null"
331            Value::Array(a) => format!("[{} items]", a.len()).len(),
332            Value::Object(o) => format!("{{{} fields}}", o.len()).len(),
333        }
334    }
335
336    /// Get cached column statistics
337    pub fn get_column_statistics(&self, column_name: &str) -> Option<&ColumnStatistics> {
338        self.column_stats.get(column_name)
339    }
340
341    /// Get cached column widths
342    pub fn get_column_widths(&self) -> &[usize] {
343        &self.column_widths
344    }
345
346    /// Clear all cached data
347    pub fn clear_cache(&mut self) {
348        self.column_stats.clear();
349        self.column_widths.clear();
350    }
351}
352
353#[cfg(test)]
354mod tests {
355    use super::*;
356    use serde_json::json;
357
358    #[test]
359    fn test_column_type_detection() {
360        let analyzer = DataAnalyzer::new();
361
362        // Integer column
363        let int_values = vec!["1", "2", "3", "4", "5"];
364        assert_eq!(
365            analyzer.detect_column_type(&int_values),
366            ColumnType::Integer
367        );
368
369        // Float column
370        let float_values = vec!["1.5", "2.7", "3.14", "4.0", "5.5"];
371        assert_eq!(
372            analyzer.detect_column_type(&float_values),
373            ColumnType::Float
374        );
375
376        // String column
377        let string_values = vec!["alice", "bob", "charlie", "david"];
378        assert_eq!(
379            analyzer.detect_column_type(&string_values),
380            ColumnType::String
381        );
382
383        // Boolean column
384        let bool_values = vec!["true", "false", "TRUE", "FALSE"];
385        assert_eq!(
386            analyzer.detect_column_type(&bool_values),
387            ColumnType::Boolean
388        );
389    }
390
391    #[test]
392    fn test_column_statistics() {
393        let mut analyzer = DataAnalyzer::new();
394
395        let values = vec!["10", "20", "30", "40", "50", ""];
396
397        let stats = analyzer.calculate_column_statistics("test_column", &values);
398
399        assert_eq!(stats.total_values, 6);
400        assert_eq!(stats.non_null_values, 5);
401        assert_eq!(stats.null_values, 1);
402        assert_eq!(stats.unique_values, 5);
403        assert_eq!(stats.data_type, ColumnType::Integer);
404        assert_eq!(stats.avg_value, Some(30.0));
405        assert_eq!(stats.sum_value, Some(150.0));
406        assert_eq!(stats.median_value, Some(30.0));
407        assert_eq!(stats.min_value, Some("10".to_string()));
408        assert_eq!(stats.max_value, Some("50".to_string()));
409        assert!(stats.frequency_map.is_some());
410    }
411
412    #[test]
413    fn test_optimal_column_widths() {
414        let mut analyzer = DataAnalyzer::new();
415
416        let data = vec![
417            json!({"name": "Alice", "age": 30, "city": "New York"}),
418            json!({"name": "Bob", "age": 25, "city": "Los Angeles"}),
419            json!({"name": "Charlie", "age": 35, "city": "SF"}),
420        ];
421
422        let widths = analyzer.calculate_optimal_column_widths(&data, 100);
423
424        assert_eq!(widths.len(), 3);
425        // The keys are sorted alphabetically: age, city, name
426        // So widths[0] is for "age", widths[1] is for "city", widths[2] is for "name"
427        assert!(widths[2] >= 7); // "Charlie" is 7 chars (name column)
428        assert!(widths[0] >= 3); // "age" header is 3 chars
429        assert!(widths[1] >= 11); // "Los Angeles" is 11 chars (city column)
430    }
431}