sql_cli/data/
data_analyzer.rs

1use crate::data::type_inference::{InferredType, TypeInference};
2use serde_json::Value;
3use std::collections::HashMap;
4
5/// Analyzes data for statistics, column widths, and other metrics
6/// Extracted from the monolithic `enhanced_tui.rs`
7pub struct DataAnalyzer {
8    /// Cached column statistics
9    column_stats: HashMap<String, ColumnStatistics>,
10
11    /// Cached optimal column widths
12    column_widths: Vec<usize>,
13}
14
15/// Statistics for a single column
16#[derive(Debug, Clone)]
17pub struct ColumnStatistics {
18    pub column_name: String,
19    pub data_type: ColumnType,
20    pub total_values: usize,
21    pub non_null_values: usize,
22    pub null_values: usize,
23    pub unique_values: usize,
24    pub min_value: Option<String>,
25    pub max_value: Option<String>,
26    pub avg_value: Option<f64>,
27    pub sum_value: Option<f64>,
28    pub median_value: Option<f64>,
29    pub min_length: Option<usize>,
30    pub max_length: Option<usize>,
31    pub frequency_map: Option<std::collections::BTreeMap<String, usize>>,
32}
33
34/// Detected column type
35#[derive(Debug, Clone, PartialEq, Eq, Hash)]
36pub enum ColumnType {
37    String,
38    Integer,
39    Float,
40    Boolean,
41    Date,
42    Mixed,
43    Unknown,
44}
45
46impl Default for DataAnalyzer {
47    fn default() -> Self {
48        Self::new()
49    }
50}
51
52impl DataAnalyzer {
53    #[must_use]
54    pub fn new() -> Self {
55        Self {
56            column_stats: HashMap::new(),
57            column_widths: Vec::new(),
58        }
59    }
60
61    /// Calculate statistics for a specific column
62    pub fn calculate_column_statistics(
63        &mut self,
64        column_name: &str,
65        values: &[&str],
66    ) -> ColumnStatistics {
67        let mut stats = ColumnStatistics {
68            column_name: column_name.to_string(),
69            data_type: ColumnType::Unknown,
70            total_values: values.len(),
71            non_null_values: 0,
72            null_values: 0,
73            unique_values: 0,
74            min_value: None,
75            max_value: None,
76            avg_value: None,
77            sum_value: None,
78            median_value: None,
79            min_length: None,
80            max_length: None,
81            frequency_map: None,
82        };
83
84        if values.is_empty() {
85            return stats;
86        }
87
88        // Collect unique values more efficiently - use references when possible
89        let mut unique = std::collections::HashSet::new();
90        let mut numeric_values = Vec::new();
91        let mut min_str: Option<&str> = None;
92        let mut max_str: Option<&str> = None;
93        let mut lengths = Vec::new();
94
95        for value in values {
96            if value.is_empty() {
97                stats.null_values += 1;
98            } else {
99                stats.non_null_values += 1;
100
101                // Use string reference for unique count
102                unique.insert(*value);
103                lengths.push(value.len());
104
105                // Track min/max strings without cloning
106                match min_str {
107                    None => min_str = Some(value),
108                    Some(min) if value < &min => min_str = Some(value),
109                    _ => {}
110                }
111                match max_str {
112                    None => max_str = Some(value),
113                    Some(max) if value > &max => max_str = Some(value),
114                    _ => {}
115                }
116
117                // Try to parse as number
118                if let Ok(num) = value.parse::<f64>() {
119                    numeric_values.push(num);
120                }
121            }
122        }
123
124        stats.unique_values = unique.len();
125
126        // Determine data type
127        stats.data_type = self.detect_column_type(values);
128
129        // Calculate type-specific statistics
130        match stats.data_type {
131            ColumnType::Integer | ColumnType::Float => {
132                if !numeric_values.is_empty() {
133                    let sum: f64 = numeric_values.iter().sum();
134                    stats.sum_value = Some(sum);
135                    stats.avg_value = Some(sum / numeric_values.len() as f64);
136
137                    // Calculate median
138                    numeric_values
139                        .sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
140                    let mid = numeric_values.len() / 2;
141                    stats.median_value = if numeric_values.len() % 2 == 0 {
142                        Some(f64::midpoint(numeric_values[mid - 1], numeric_values[mid]))
143                    } else {
144                        Some(numeric_values[mid])
145                    };
146
147                    let min = numeric_values.iter().copied().fold(f64::INFINITY, f64::min);
148                    let max = numeric_values
149                        .iter()
150                        .copied()
151                        .fold(f64::NEG_INFINITY, f64::max);
152                    stats.min_value = Some(min.to_string());
153                    stats.max_value = Some(max.to_string());
154                }
155            }
156            _ => {
157                // String statistics - use the min/max we already found without cloning
158                stats.min_value = min_str.map(std::string::ToString::to_string);
159                stats.max_value = max_str.map(std::string::ToString::to_string);
160            }
161        }
162
163        // Build frequency map - always show top 40 values even for high cardinality columns
164        const MAX_VALUES_TO_SHOW: usize = 40;
165        if stats.unique_values > 0 {
166            // Count all values first
167            let mut freq_map = std::collections::HashMap::new();
168            for value in values {
169                if !value.is_empty() {
170                    *freq_map.entry((*value).to_string()).or_insert(0) += 1;
171                }
172            }
173
174            // Sort by frequency (descending) and take top N
175            let mut freq_vec: Vec<(String, usize)> = freq_map.into_iter().collect();
176            freq_vec.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
177
178            // Take top N values and put them in a BTreeMap for sorted display
179            let mut display_map = std::collections::BTreeMap::new();
180            for (value, count) in freq_vec.into_iter().take(MAX_VALUES_TO_SHOW) {
181                display_map.insert(value, count);
182            }
183
184            stats.frequency_map = Some(display_map);
185        }
186
187        // Length statistics
188        if !lengths.is_empty() {
189            stats.min_length = lengths.iter().min().copied();
190            stats.max_length = lengths.iter().max().copied();
191        }
192
193        // Cache the statistics
194        self.column_stats
195            .insert(column_name.to_string(), stats.clone());
196
197        stats
198    }
199
200    /// Detect the type of a column based on its values
201    #[must_use]
202    pub fn detect_column_type(&self, values: &[&str]) -> ColumnType {
203        if values.is_empty() {
204            return ColumnType::Unknown;
205        }
206
207        let mut type_counts = HashMap::new();
208
209        // Early exit optimization: check first few values
210        // If they're all the same type, check if the rest matches
211        let first_type = self.detect_single_value_type(values[0]);
212        let mut all_same = true;
213
214        for (i, value) in values.iter().filter(|v| !v.is_empty()).enumerate() {
215            let detected_type = self.detect_single_value_type(value);
216
217            if i < 10 && detected_type != first_type {
218                all_same = false;
219            }
220
221            *type_counts.entry(detected_type).or_insert(0) += 1;
222
223            // Early exit if we have enough samples and they're mixed
224            if i > 100 && type_counts.len() > 1 && !all_same {
225                break;
226            }
227        }
228
229        // If we have multiple types, it's mixed
230        if type_counts.len() > 1 {
231            // But if >90% are one type, use that type
232            let total: usize = type_counts.values().sum();
233            for (col_type, count) in &type_counts {
234                if *count as f64 / total as f64 > 0.9 {
235                    return col_type.clone();
236                }
237            }
238            ColumnType::Mixed
239        } else if let Some((col_type, _)) = type_counts.into_iter().next() {
240            col_type
241        } else {
242            ColumnType::Unknown
243        }
244    }
245
246    /// Detect type of a single value
247    fn detect_single_value_type(&self, value: &str) -> ColumnType {
248        // Use the shared type inference logic
249        match TypeInference::infer_from_string(value) {
250            InferredType::Null => ColumnType::Unknown,
251            InferredType::Boolean => ColumnType::Boolean,
252            InferredType::Integer => ColumnType::Integer,
253            InferredType::Float => ColumnType::Float,
254            InferredType::DateTime => ColumnType::Date,
255            InferredType::String => ColumnType::String,
256        }
257    }
258
259    /// Check if a string looks like a date
260    fn looks_like_date_fast(&self, value: &str) -> bool {
261        TypeInference::looks_like_datetime(value)
262    }
263
264    /// Check if a string looks like a date
265    fn looks_like_date(&self, value: &str) -> bool {
266        TypeInference::looks_like_datetime(value)
267    }
268
269    /// Calculate optimal column widths for display
270    pub fn calculate_optimal_column_widths(
271        &mut self,
272        data: &[Value],
273        max_sample_rows: usize,
274    ) -> Vec<usize> {
275        if data.is_empty() {
276            return Vec::new();
277        }
278
279        // Get headers from first row
280        let headers: Vec<String> = if let Some(first_row) = data.first() {
281            if let Some(obj) = first_row.as_object() {
282                obj.keys().map(std::string::ToString::to_string).collect()
283            } else {
284                return Vec::new();
285            }
286        } else {
287            return Vec::new();
288        };
289
290        let mut widths = vec![0; headers.len()];
291
292        // Start with header widths
293        for (i, header) in headers.iter().enumerate() {
294            widths[i] = header.len();
295        }
296
297        // Sample rows for width calculation
298        let total_rows = data.len();
299        let rows_to_check: Vec<usize> = if total_rows <= max_sample_rows {
300            (0..total_rows).collect()
301        } else {
302            // Sample evenly distributed rows
303            let step = total_rows / max_sample_rows;
304            (0..max_sample_rows)
305                .map(|i| (i * step).min(total_rows - 1))
306                .collect()
307        };
308
309        // Check sampled rows
310        for &row_idx in &rows_to_check {
311            if let Some(row) = data.get(row_idx) {
312                if let Some(obj) = row.as_object() {
313                    for (i, header) in headers.iter().enumerate() {
314                        if let Some(value) = obj.get(header) {
315                            let display_len = self.get_value_display_length(value);
316                            widths[i] = widths[i].max(display_len);
317                        }
318                    }
319                }
320            }
321        }
322
323        // Apply constraints
324        for width in &mut widths {
325            *width = (*width).min(50).max(3); // Min 3, max 50 characters
326        }
327
328        self.column_widths = widths.clone();
329        widths
330    }
331
332    /// Get display length of a JSON value
333    fn get_value_display_length(&self, value: &Value) -> usize {
334        match value {
335            Value::String(s) => s.len(),
336            Value::Number(n) => n.to_string().len(),
337            Value::Bool(b) => b.to_string().len(),
338            Value::Null => 4, // "null"
339            Value::Array(a) => format!("[{} items]", a.len()).len(),
340            Value::Object(o) => format!("{{{} fields}}", o.len()).len(),
341        }
342    }
343
344    /// Get cached column statistics
345    #[must_use]
346    pub fn get_column_statistics(&self, column_name: &str) -> Option<&ColumnStatistics> {
347        self.column_stats.get(column_name)
348    }
349
350    /// Get cached column widths
351    #[must_use]
352    pub fn get_column_widths(&self) -> &[usize] {
353        &self.column_widths
354    }
355
356    /// Clear all cached data
357    pub fn clear_cache(&mut self) {
358        self.column_stats.clear();
359        self.column_widths.clear();
360    }
361}
362
363#[cfg(test)]
364mod tests {
365    use super::*;
366    use serde_json::json;
367
368    #[test]
369    fn test_column_type_detection() {
370        let analyzer = DataAnalyzer::new();
371
372        // Integer column
373        let int_values = vec!["1", "2", "3", "4", "5"];
374        assert_eq!(
375            analyzer.detect_column_type(&int_values),
376            ColumnType::Integer
377        );
378
379        // Float column
380        let float_values = vec!["1.5", "2.7", "3.14", "4.0", "5.5"];
381        assert_eq!(
382            analyzer.detect_column_type(&float_values),
383            ColumnType::Float
384        );
385
386        // String column
387        let string_values = vec!["alice", "bob", "charlie", "david"];
388        assert_eq!(
389            analyzer.detect_column_type(&string_values),
390            ColumnType::String
391        );
392
393        // Boolean column
394        let bool_values = vec!["true", "false", "TRUE", "FALSE"];
395        assert_eq!(
396            analyzer.detect_column_type(&bool_values),
397            ColumnType::Boolean
398        );
399    }
400
401    #[test]
402    fn test_column_statistics() {
403        let mut analyzer = DataAnalyzer::new();
404
405        let values = vec!["10", "20", "30", "40", "50", ""];
406
407        let stats = analyzer.calculate_column_statistics("test_column", &values);
408
409        assert_eq!(stats.total_values, 6);
410        assert_eq!(stats.non_null_values, 5);
411        assert_eq!(stats.null_values, 1);
412        assert_eq!(stats.unique_values, 5);
413        assert_eq!(stats.data_type, ColumnType::Integer);
414        assert_eq!(stats.avg_value, Some(30.0));
415        assert_eq!(stats.sum_value, Some(150.0));
416        assert_eq!(stats.median_value, Some(30.0));
417        assert_eq!(stats.min_value, Some("10".to_string()));
418        assert_eq!(stats.max_value, Some("50".to_string()));
419        assert!(stats.frequency_map.is_some());
420    }
421
422    #[test]
423    fn test_optimal_column_widths() {
424        let mut analyzer = DataAnalyzer::new();
425
426        let data = vec![
427            json!({"name": "Alice", "age": 30, "city": "New York"}),
428            json!({"name": "Bob", "age": 25, "city": "Los Angeles"}),
429            json!({"name": "Charlie", "age": 35, "city": "SF"}),
430        ];
431
432        let widths = analyzer.calculate_optimal_column_widths(&data, 100);
433
434        assert_eq!(widths.len(), 3);
435        // The keys are sorted alphabetically: age, city, name
436        // So widths[0] is for "age", widths[1] is for "city", widths[2] is for "name"
437        assert!(widths[2] >= 7); // "Charlie" is 7 chars (name column)
438        assert!(widths[0] >= 3); // "age" header is 3 chars
439        assert!(widths[1] >= 11); // "Los Angeles" is 11 chars (city column)
440    }
441}