Skip to main content

xls_rs/profiling/
analysis.rs

1//! Statistical analysis methods for data profiling
2
3use crate::common::string;
4use chrono::{Datelike, NaiveDate};
5use std::collections::HashMap;
6
7use super::types::*;
8
9impl super::profiler::DataProfiler {
10    /// Infer data type from sample values
11    pub fn infer_data_type(&self, data: &[String]) -> DataType {
12        let non_null_values: Vec<&str> = data
13            .iter()
14            .filter(|v| !string::is_empty_or_whitespace(v))
15            .map(|v| v.as_str())
16            .collect();
17
18        if non_null_values.is_empty() {
19            return DataType::Unknown;
20        }
21
22        let sample_size = non_null_values.len().min(100);
23        let sample = &non_null_values[..sample_size];
24
25        // Check for boolean
26        let boolean_count = sample
27            .iter()
28            .filter(|v| {
29                matches!(
30                    v.to_lowercase().as_str(),
31                    "true" | "false" | "1" | "0" | "yes" | "no"
32                )
33            })
34            .count();
35
36        if boolean_count as f64 / sample_size as f64 > 0.8 {
37            return DataType::Boolean;
38        }
39
40        // Check for email
41        let email_regex =
42            regex::Regex::new(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$").unwrap();
43        let email_count = sample.iter().filter(|v| email_regex.is_match(v)).count();
44
45        if email_count as f64 / sample_size as f64 > 0.8 {
46            return DataType::Email;
47        }
48
49        // Check for URL
50        let url_regex = regex::Regex::new(r"^https?://[^\s/$.?#].[^\s]*$").unwrap();
51        let url_count = sample.iter().filter(|v| url_regex.is_match(v)).count();
52
53        if url_count as f64 / sample_size as f64 > 0.8 {
54            return DataType::Url;
55        }
56
57        // Check for phone
58        let phone_regex = regex::Regex::new(r"^\+?[\d\s\-\(\)]{10,}$").unwrap();
59        let phone_count = sample.iter().filter(|v| phone_regex.is_match(v)).count();
60
61        if phone_count as f64 / sample_size as f64 > 0.8 {
62            return DataType::Phone;
63        }
64
65        // Check for date/datetime
66        let date_formats = vec![
67            "%Y-%m-%d",
68            "%d/%m/%Y",
69            "%m/%d/%Y",
70            "%Y-%m-%d %H:%M:%S",
71            "%d/%m/%Y %H:%M:%S",
72        ];
73
74        for format in &date_formats {
75            let date_count = sample
76                .iter()
77                .filter(|v| {
78                    chrono::NaiveDate::parse_from_str(v, format).is_ok()
79                        || chrono::NaiveDateTime::parse_from_str(v, format).is_ok()
80                })
81                .count();
82
83            if date_count as f64 / sample_size as f64 > 0.8 {
84                return if format.contains("%H") {
85                    DataType::DateTime
86                } else {
87                    DataType::Date
88                };
89            }
90        }
91
92        // Check for numeric
93        let numeric_count = sample.iter().filter(|v| string::is_numeric(v)).count();
94
95        if numeric_count as f64 / sample_size as f64 > 0.8 {
96            // Check if all are integers
97            let int_count = sample.iter().filter(|v| v.parse::<i64>().is_ok()).count();
98
99            return if int_count as f64 / numeric_count as f64 > 0.8 {
100                DataType::Integer
101            } else {
102                DataType::Float
103            };
104        }
105
106        DataType::String
107    }
108
109    /// Get value frequencies
110    pub fn get_value_frequencies(&self, data: &[String]) -> Vec<ValueFrequency> {
111        let mut frequency_map: HashMap<String, usize> = HashMap::new();
112        let total = data.len();
113
114        for value in data {
115            if !string::is_empty_or_whitespace(value) {
116                *frequency_map.entry(value.clone()).or_insert(0) += 1;
117            }
118        }
119
120        let mut frequencies: Vec<ValueFrequency> = frequency_map
121            .into_iter()
122            .map(|(value, count)| ValueFrequency {
123                value,
124                count,
125                percentage: (count as f64 / total as f64) * 100.0,
126            })
127            .collect();
128
129        // Sort by count (descending), then by value (ascending) for deterministic ordering
130        frequencies.sort_by(|a, b| match b.count.cmp(&a.count) {
131            std::cmp::Ordering::Equal => a.value.cmp(&b.value),
132            other => other,
133        });
134        frequencies.truncate(10); // Top 10 values
135        frequencies
136    }
137
138    /// Calculate length statistics
139    pub fn calculate_length_stats(&self, data: &[String]) -> LengthStats {
140        let lengths: Vec<usize> = data
141            .iter()
142            .filter(|v| !string::is_empty_or_whitespace(v))
143            .map(|v| v.len())
144            .collect();
145
146        if lengths.is_empty() {
147            return LengthStats {
148                min_length: 0,
149                max_length: 0,
150                avg_length: 0.0,
151                median_length: 0,
152                std_dev_length: 0.0,
153            };
154        }
155
156        let min_length = *lengths.iter().min().unwrap();
157        let max_length = *lengths.iter().max().unwrap();
158        let avg_length = lengths.iter().sum::<usize>() as f64 / lengths.len() as f64;
159
160        let mut sorted_lengths = lengths.clone();
161        sorted_lengths.sort_unstable();
162        let median_length = if sorted_lengths.len() % 2 == 0 {
163            let mid = sorted_lengths.len() / 2;
164            (sorted_lengths[mid - 1] + sorted_lengths[mid]) / 2
165        } else {
166            sorted_lengths[sorted_lengths.len() / 2]
167        };
168
169        let variance = lengths
170            .iter()
171            .map(|&len| (len as f64 - avg_length).powi(2))
172            .sum::<f64>()
173            / lengths.len() as f64;
174        let std_dev_length = variance.sqrt();
175
176        LengthStats {
177            min_length,
178            max_length,
179            avg_length,
180            median_length,
181            std_dev_length,
182        }
183    }
184
185    /// Calculate numeric statistics
186    pub fn calculate_numeric_stats(&self, data: &[String]) -> Option<NumericStats> {
187        let numbers: Vec<f64> = data
188            .iter()
189            .filter(|v| !string::is_empty_or_whitespace(v))
190            .filter_map(|v| string::to_number(v))
191            .collect();
192
193        if numbers.is_empty() {
194            return None;
195        }
196
197        let min = numbers.iter().fold(f64::INFINITY, |a, &b| a.min(b));
198        let max = numbers.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
199        let mean = numbers.iter().sum::<f64>() / numbers.len() as f64;
200
201        let mut sorted_numbers = numbers.clone();
202        sorted_numbers.sort_by(|a, b| a.partial_cmp(b).unwrap());
203        let median = if sorted_numbers.len() % 2 == 0 {
204            let mid = sorted_numbers.len() / 2;
205            (sorted_numbers[mid - 1] + sorted_numbers[mid]) / 2.0
206        } else {
207            sorted_numbers[sorted_numbers.len() / 2]
208        };
209
210        // Calculate mode
211        let mut frequency_map: HashMap<i64, usize> = HashMap::new();
212        for &num in &numbers {
213            let rounded = num.round() as i64;
214            *frequency_map.entry(rounded).or_insert(0) += 1;
215        }
216
217        let max_freq = frequency_map.values().max().unwrap();
218        let mode: Vec<String> = frequency_map
219            .iter()
220            .filter(|&(_, &freq)| freq == *max_freq)
221            .map(|(val, _)| val.to_string())
222            .collect();
223
224        let variance =
225            numbers.iter().map(|&num| (num - mean).powi(2)).sum::<f64>() / numbers.len() as f64;
226        let std_dev = variance.sqrt();
227
228        // Calculate quartiles
229        let q1_idx = (sorted_numbers.len() as f64 * 0.25) as usize;
230        let q3_idx = (sorted_numbers.len() as f64 * 0.75) as usize;
231        let q1 = sorted_numbers[q1_idx];
232        let q3 = sorted_numbers[q3_idx];
233        let iqr = q3 - q1;
234
235        // Calculate skewness and kurtosis
236        let skewness = if std_dev > 0.0 {
237            numbers
238                .iter()
239                .map(|&num| ((num - mean) / std_dev).powi(3))
240                .sum::<f64>()
241                / numbers.len() as f64
242        } else {
243            0.0
244        };
245
246        let kurtosis = if std_dev > 0.0 {
247            numbers
248                .iter()
249                .map(|&num| ((num - mean) / std_dev).powi(4))
250                .sum::<f64>()
251                / numbers.len() as f64
252                - 3.0 // Excess kurtosis
253        } else {
254            0.0
255        };
256
257        Some(NumericStats {
258            min,
259            max,
260            mean,
261            median,
262            mode,
263            std_dev,
264            variance,
265            q1,
266            q3,
267            iqr,
268            skewness,
269            kurtosis,
270        })
271    }
272
273    /// Calculate date statistics
274    pub fn calculate_date_stats(&self, data: &[String]) -> Option<DateStats> {
275        let dates: Vec<NaiveDate> = data
276            .iter()
277            .filter(|v| !string::is_empty_or_whitespace(v))
278            .filter_map(|v| {
279                // Try different date formats
280                if let Ok(date) = NaiveDate::parse_from_str(v, "%Y-%m-%d") {
281                    Some(date)
282                } else if let Ok(date) = NaiveDate::parse_from_str(v, "%d/%m/%Y") {
283                    Some(date)
284                } else if let Ok(date) = NaiveDate::parse_from_str(v, "%m/%d/%Y") {
285                    Some(date)
286                } else {
287                    None
288                }
289            })
290            .collect();
291
292        if dates.is_empty() {
293            return None;
294        }
295
296        let min_date = dates.iter().min().unwrap();
297        let max_date = dates.iter().max().unwrap();
298        let date_range_days = (max_date.signed_duration_since(*min_date)).num_days();
299
300        // Most common year, month, and day of week
301        let mut year_count: HashMap<u32, usize> = HashMap::new();
302        let mut month_count: HashMap<u32, usize> = HashMap::new();
303        let mut dow_count: HashMap<String, usize> = HashMap::new();
304
305        for date in &dates {
306            *year_count.entry(date.year() as u32).or_insert(0) += 1;
307            *month_count.entry(date.month()).or_insert(0) += 1;
308
309            let dow = match date.weekday() {
310                chrono::Weekday::Mon => "Monday",
311                chrono::Weekday::Tue => "Tuesday",
312                chrono::Weekday::Wed => "Wednesday",
313                chrono::Weekday::Thu => "Thursday",
314                chrono::Weekday::Fri => "Friday",
315                chrono::Weekday::Sat => "Saturday",
316                chrono::Weekday::Sun => "Sunday",
317            };
318            *dow_count.entry(dow.to_string()).or_insert(0) += 1;
319        }
320
321        let most_common_year = year_count
322            .iter()
323            .max_by_key(|&(_, &count)| count)
324            .map(|(&year, _)| year)
325            .unwrap_or(0);
326
327        let most_common_month = month_count
328            .iter()
329            .max_by_key(|&(_, &count)| count)
330            .map(|(&month, _)| month)
331            .unwrap_or(0);
332
333        let most_common_day_of_week = dow_count
334            .iter()
335            .max_by_key(|&(_, &count)| count)
336            .map(|(dow, _)| dow.clone())
337            .unwrap_or_else(|| "Unknown".to_string());
338
339        Some(DateStats {
340            min_date: min_date.format("%Y-%m-%d").to_string(),
341            max_date: max_date.format("%Y-%m-%d").to_string(),
342            date_range_days,
343            most_common_year,
344            most_common_month,
345            most_common_day_of_week,
346        })
347    }
348
349    /// Calculate text statistics
350    pub fn calculate_text_stats(&self, data: &[String]) -> TextStats {
351        let non_empty: Vec<&str> = data
352            .iter()
353            .filter(|v| !string::is_empty_or_whitespace(v))
354            .map(|v| v.as_str())
355            .collect();
356
357        if non_empty.is_empty() {
358            return TextStats {
359                avg_word_count: 0.0,
360                max_word_count: 0,
361                min_word_count: 0,
362                contains_numbers: false,
363                contains_special_chars: false,
364                all_uppercase: 0,
365                all_lowercase: 0,
366                title_case: 0,
367                mixed_case: 0,
368            };
369        }
370
371        let word_counts: Vec<usize> = non_empty
372            .iter()
373            .map(|text| text.split_whitespace().count())
374            .collect();
375
376        let max_word_count = *word_counts.iter().max().unwrap();
377        let min_word_count = *word_counts.iter().min().unwrap();
378        let avg_word_count = word_counts.iter().sum::<usize>() as f64 / word_counts.len() as f64;
379
380        let contains_numbers = non_empty
381            .iter()
382            .any(|text| text.chars().any(|c| c.is_numeric()));
383        let contains_special_chars = non_empty.iter().any(|text| {
384            text.chars()
385                .any(|c| !c.is_alphanumeric() && !c.is_whitespace())
386        });
387
388        let mut all_uppercase = 0;
389        let mut all_lowercase = 0;
390        let mut title_case = 0;
391        let mut mixed_case = 0;
392
393        for text in &non_empty {
394            if text.chars().all(|c| !c.is_alphabetic() || c.is_uppercase())
395                && text.chars().any(|c| c.is_alphabetic())
396            {
397                all_uppercase += 1;
398            } else if text.chars().all(|c| !c.is_alphabetic() || c.is_lowercase())
399                && text.chars().any(|c| c.is_alphabetic())
400            {
401                all_lowercase += 1;
402            } else if text.chars().next().map_or(false, |c| c.is_uppercase())
403                && text
404                    .chars()
405                    .skip(1)
406                    .all(|c| !c.is_alphabetic() || c.is_lowercase())
407            {
408                title_case += 1;
409            } else if text.chars().any(|c| c.is_alphabetic()) {
410                mixed_case += 1;
411            }
412        }
413
414        TextStats {
415            avg_word_count,
416            max_word_count,
417            min_word_count,
418            contains_numbers,
419            contains_special_chars,
420            all_uppercase,
421            all_lowercase,
422            title_case,
423            mixed_case,
424        }
425    }
426}