1use crate::common::string;
4use chrono::{Datelike, NaiveDate};
5use std::collections::HashMap;
6
7use super::types::*;
8
9impl super::profiler::DataProfiler {
10 pub fn infer_data_type(&self, data: &[String]) -> DataType {
12 let non_null_values: Vec<&str> = data
13 .iter()
14 .filter(|v| !string::is_empty_or_whitespace(v))
15 .map(|v| v.as_str())
16 .collect();
17
18 if non_null_values.is_empty() {
19 return DataType::Unknown;
20 }
21
22 let sample_size = non_null_values.len().min(100);
23 let sample = &non_null_values[..sample_size];
24
25 let boolean_count = sample
27 .iter()
28 .filter(|v| {
29 matches!(
30 v.to_lowercase().as_str(),
31 "true" | "false" | "1" | "0" | "yes" | "no"
32 )
33 })
34 .count();
35
36 if boolean_count as f64 / sample_size as f64 > 0.8 {
37 return DataType::Boolean;
38 }
39
40 let email_regex =
42 regex::Regex::new(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$").unwrap();
43 let email_count = sample.iter().filter(|v| email_regex.is_match(v)).count();
44
45 if email_count as f64 / sample_size as f64 > 0.8 {
46 return DataType::Email;
47 }
48
49 let url_regex = regex::Regex::new(r"^https?://[^\s/$.?#].[^\s]*$").unwrap();
51 let url_count = sample.iter().filter(|v| url_regex.is_match(v)).count();
52
53 if url_count as f64 / sample_size as f64 > 0.8 {
54 return DataType::Url;
55 }
56
57 let phone_regex = regex::Regex::new(r"^\+?[\d\s\-\(\)]{10,}$").unwrap();
59 let phone_count = sample.iter().filter(|v| phone_regex.is_match(v)).count();
60
61 if phone_count as f64 / sample_size as f64 > 0.8 {
62 return DataType::Phone;
63 }
64
65 let date_formats = vec![
67 "%Y-%m-%d",
68 "%d/%m/%Y",
69 "%m/%d/%Y",
70 "%Y-%m-%d %H:%M:%S",
71 "%d/%m/%Y %H:%M:%S",
72 ];
73
74 for format in &date_formats {
75 let date_count = sample
76 .iter()
77 .filter(|v| {
78 chrono::NaiveDate::parse_from_str(v, format).is_ok()
79 || chrono::NaiveDateTime::parse_from_str(v, format).is_ok()
80 })
81 .count();
82
83 if date_count as f64 / sample_size as f64 > 0.8 {
84 return if format.contains("%H") {
85 DataType::DateTime
86 } else {
87 DataType::Date
88 };
89 }
90 }
91
92 let numeric_count = sample.iter().filter(|v| string::is_numeric(v)).count();
94
95 if numeric_count as f64 / sample_size as f64 > 0.8 {
96 let int_count = sample.iter().filter(|v| v.parse::<i64>().is_ok()).count();
98
99 return if int_count as f64 / numeric_count as f64 > 0.8 {
100 DataType::Integer
101 } else {
102 DataType::Float
103 };
104 }
105
106 DataType::String
107 }
108
109 pub fn get_value_frequencies(&self, data: &[String]) -> Vec<ValueFrequency> {
111 let mut frequency_map: HashMap<String, usize> = HashMap::new();
112 let total = data.len();
113
114 for value in data {
115 if !string::is_empty_or_whitespace(value) {
116 *frequency_map.entry(value.clone()).or_insert(0) += 1;
117 }
118 }
119
120 let mut frequencies: Vec<ValueFrequency> = frequency_map
121 .into_iter()
122 .map(|(value, count)| ValueFrequency {
123 value,
124 count,
125 percentage: (count as f64 / total as f64) * 100.0,
126 })
127 .collect();
128
129 frequencies.sort_by(|a, b| match b.count.cmp(&a.count) {
131 std::cmp::Ordering::Equal => a.value.cmp(&b.value),
132 other => other,
133 });
134 frequencies.truncate(10); frequencies
136 }
137
138 pub fn calculate_length_stats(&self, data: &[String]) -> LengthStats {
140 let lengths: Vec<usize> = data
141 .iter()
142 .filter(|v| !string::is_empty_or_whitespace(v))
143 .map(|v| v.len())
144 .collect();
145
146 if lengths.is_empty() {
147 return LengthStats {
148 min_length: 0,
149 max_length: 0,
150 avg_length: 0.0,
151 median_length: 0,
152 std_dev_length: 0.0,
153 };
154 }
155
156 let min_length = *lengths.iter().min().unwrap();
157 let max_length = *lengths.iter().max().unwrap();
158 let avg_length = lengths.iter().sum::<usize>() as f64 / lengths.len() as f64;
159
160 let mut sorted_lengths = lengths.clone();
161 sorted_lengths.sort_unstable();
162 let median_length = if sorted_lengths.len() % 2 == 0 {
163 let mid = sorted_lengths.len() / 2;
164 (sorted_lengths[mid - 1] + sorted_lengths[mid]) / 2
165 } else {
166 sorted_lengths[sorted_lengths.len() / 2]
167 };
168
169 let variance = lengths
170 .iter()
171 .map(|&len| (len as f64 - avg_length).powi(2))
172 .sum::<f64>()
173 / lengths.len() as f64;
174 let std_dev_length = variance.sqrt();
175
176 LengthStats {
177 min_length,
178 max_length,
179 avg_length,
180 median_length,
181 std_dev_length,
182 }
183 }
184
185 pub fn calculate_numeric_stats(&self, data: &[String]) -> Option<NumericStats> {
187 let numbers: Vec<f64> = data
188 .iter()
189 .filter(|v| !string::is_empty_or_whitespace(v))
190 .filter_map(|v| string::to_number(v))
191 .collect();
192
193 if numbers.is_empty() {
194 return None;
195 }
196
197 let min = numbers.iter().fold(f64::INFINITY, |a, &b| a.min(b));
198 let max = numbers.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
199 let mean = numbers.iter().sum::<f64>() / numbers.len() as f64;
200
201 let mut sorted_numbers = numbers.clone();
202 sorted_numbers.sort_by(|a, b| a.partial_cmp(b).unwrap());
203 let median = if sorted_numbers.len() % 2 == 0 {
204 let mid = sorted_numbers.len() / 2;
205 (sorted_numbers[mid - 1] + sorted_numbers[mid]) / 2.0
206 } else {
207 sorted_numbers[sorted_numbers.len() / 2]
208 };
209
210 let mut frequency_map: HashMap<i64, usize> = HashMap::new();
212 for &num in &numbers {
213 let rounded = num.round() as i64;
214 *frequency_map.entry(rounded).or_insert(0) += 1;
215 }
216
217 let max_freq = frequency_map.values().max().unwrap();
218 let mode: Vec<String> = frequency_map
219 .iter()
220 .filter(|&(_, &freq)| freq == *max_freq)
221 .map(|(val, _)| val.to_string())
222 .collect();
223
224 let variance =
225 numbers.iter().map(|&num| (num - mean).powi(2)).sum::<f64>() / numbers.len() as f64;
226 let std_dev = variance.sqrt();
227
228 let q1_idx = (sorted_numbers.len() as f64 * 0.25) as usize;
230 let q3_idx = (sorted_numbers.len() as f64 * 0.75) as usize;
231 let q1 = sorted_numbers[q1_idx];
232 let q3 = sorted_numbers[q3_idx];
233 let iqr = q3 - q1;
234
235 let skewness = if std_dev > 0.0 {
237 numbers
238 .iter()
239 .map(|&num| ((num - mean) / std_dev).powi(3))
240 .sum::<f64>()
241 / numbers.len() as f64
242 } else {
243 0.0
244 };
245
246 let kurtosis = if std_dev > 0.0 {
247 numbers
248 .iter()
249 .map(|&num| ((num - mean) / std_dev).powi(4))
250 .sum::<f64>()
251 / numbers.len() as f64
252 - 3.0 } else {
254 0.0
255 };
256
257 Some(NumericStats {
258 min,
259 max,
260 mean,
261 median,
262 mode,
263 std_dev,
264 variance,
265 q1,
266 q3,
267 iqr,
268 skewness,
269 kurtosis,
270 })
271 }
272
273 pub fn calculate_date_stats(&self, data: &[String]) -> Option<DateStats> {
275 let dates: Vec<NaiveDate> = data
276 .iter()
277 .filter(|v| !string::is_empty_or_whitespace(v))
278 .filter_map(|v| {
279 if let Ok(date) = NaiveDate::parse_from_str(v, "%Y-%m-%d") {
281 Some(date)
282 } else if let Ok(date) = NaiveDate::parse_from_str(v, "%d/%m/%Y") {
283 Some(date)
284 } else if let Ok(date) = NaiveDate::parse_from_str(v, "%m/%d/%Y") {
285 Some(date)
286 } else {
287 None
288 }
289 })
290 .collect();
291
292 if dates.is_empty() {
293 return None;
294 }
295
296 let min_date = dates.iter().min().unwrap();
297 let max_date = dates.iter().max().unwrap();
298 let date_range_days = (max_date.signed_duration_since(*min_date)).num_days();
299
300 let mut year_count: HashMap<u32, usize> = HashMap::new();
302 let mut month_count: HashMap<u32, usize> = HashMap::new();
303 let mut dow_count: HashMap<String, usize> = HashMap::new();
304
305 for date in &dates {
306 *year_count.entry(date.year() as u32).or_insert(0) += 1;
307 *month_count.entry(date.month()).or_insert(0) += 1;
308
309 let dow = match date.weekday() {
310 chrono::Weekday::Mon => "Monday",
311 chrono::Weekday::Tue => "Tuesday",
312 chrono::Weekday::Wed => "Wednesday",
313 chrono::Weekday::Thu => "Thursday",
314 chrono::Weekday::Fri => "Friday",
315 chrono::Weekday::Sat => "Saturday",
316 chrono::Weekday::Sun => "Sunday",
317 };
318 *dow_count.entry(dow.to_string()).or_insert(0) += 1;
319 }
320
321 let most_common_year = year_count
322 .iter()
323 .max_by_key(|&(_, &count)| count)
324 .map(|(&year, _)| year)
325 .unwrap_or(0);
326
327 let most_common_month = month_count
328 .iter()
329 .max_by_key(|&(_, &count)| count)
330 .map(|(&month, _)| month)
331 .unwrap_or(0);
332
333 let most_common_day_of_week = dow_count
334 .iter()
335 .max_by_key(|&(_, &count)| count)
336 .map(|(dow, _)| dow.clone())
337 .unwrap_or_else(|| "Unknown".to_string());
338
339 Some(DateStats {
340 min_date: min_date.format("%Y-%m-%d").to_string(),
341 max_date: max_date.format("%Y-%m-%d").to_string(),
342 date_range_days,
343 most_common_year,
344 most_common_month,
345 most_common_day_of_week,
346 })
347 }
348
349 pub fn calculate_text_stats(&self, data: &[String]) -> TextStats {
351 let non_empty: Vec<&str> = data
352 .iter()
353 .filter(|v| !string::is_empty_or_whitespace(v))
354 .map(|v| v.as_str())
355 .collect();
356
357 if non_empty.is_empty() {
358 return TextStats {
359 avg_word_count: 0.0,
360 max_word_count: 0,
361 min_word_count: 0,
362 contains_numbers: false,
363 contains_special_chars: false,
364 all_uppercase: 0,
365 all_lowercase: 0,
366 title_case: 0,
367 mixed_case: 0,
368 };
369 }
370
371 let word_counts: Vec<usize> = non_empty
372 .iter()
373 .map(|text| text.split_whitespace().count())
374 .collect();
375
376 let max_word_count = *word_counts.iter().max().unwrap();
377 let min_word_count = *word_counts.iter().min().unwrap();
378 let avg_word_count = word_counts.iter().sum::<usize>() as f64 / word_counts.len() as f64;
379
380 let contains_numbers = non_empty
381 .iter()
382 .any(|text| text.chars().any(|c| c.is_numeric()));
383 let contains_special_chars = non_empty.iter().any(|text| {
384 text.chars()
385 .any(|c| !c.is_alphanumeric() && !c.is_whitespace())
386 });
387
388 let mut all_uppercase = 0;
389 let mut all_lowercase = 0;
390 let mut title_case = 0;
391 let mut mixed_case = 0;
392
393 for text in &non_empty {
394 if text.chars().all(|c| !c.is_alphabetic() || c.is_uppercase())
395 && text.chars().any(|c| c.is_alphabetic())
396 {
397 all_uppercase += 1;
398 } else if text.chars().all(|c| !c.is_alphabetic() || c.is_lowercase())
399 && text.chars().any(|c| c.is_alphabetic())
400 {
401 all_lowercase += 1;
402 } else if text.chars().next().map_or(false, |c| c.is_uppercase())
403 && text
404 .chars()
405 .skip(1)
406 .all(|c| !c.is_alphabetic() || c.is_lowercase())
407 {
408 title_case += 1;
409 } else if text.chars().any(|c| c.is_alphabetic()) {
410 mixed_case += 1;
411 }
412 }
413
414 TextStats {
415 avg_word_count,
416 max_word_count,
417 min_word_count,
418 contains_numbers,
419 contains_special_chars,
420 all_uppercase,
421 all_lowercase,
422 title_case,
423 mixed_case,
424 }
425 }
426}