Skip to main content

xls_rs/profiling/
profiler.rs

1//! Core data profiler implementation
2
3use crate::common::{collection, string};
4use anyhow::Result;
5use std::collections::HashSet;
6
7use super::types::*;
8
/// Configurable profiler for tabular data held as rows of strings.
///
/// Construct it with [`DataProfiler::new`] (or `Default`) and adjust it with
/// the `with_*` builder methods.
#[derive(Debug, Clone)]
pub struct DataProfiler {
    /// Upper bound on how many distinct values are kept per column profile.
    max_distinct_values: usize,
    /// Requested number of rows to sample; `None` profiles every row.
    sample_size: Option<usize>,
}
14
15impl DataProfiler {
16    /// Create a new profiler with options
17    pub fn new() -> Self {
18        Self {
19            max_distinct_values: 100,
20            sample_size: None,
21        }
22    }
23
24    /// Set maximum distinct values to track
25    pub fn with_max_distinct_values(mut self, max: usize) -> Self {
26        self.max_distinct_values = max;
27        self
28    }
29
30    /// Set sample size for large datasets
31    pub fn with_sample_size(mut self, size: usize) -> Self {
32        self.sample_size = Some(size);
33        self
34    }
35
36    /// Profile data from rows
37    pub fn profile(&self, data: &[Vec<String>], file_path: &str) -> Result<DataProfile> {
38        if data.is_empty() {
39            return Ok(DataProfile {
40                file_path: file_path.to_string(),
41                total_rows: 0,
42                total_columns: 0,
43                total_cells: 0,
44                null_cells: 0,
45                null_percentage: 0.0,
46                duplicate_rows: 0,
47                duplicate_percentage: 0.0,
48                columns: Vec::new(),
49                data_quality_score: 0.0,
50                recommendations: Vec::new(),
51                profiling_timestamp: chrono::Utc::now().to_rfc3339(),
52            });
53        }
54
55        let header = &data[0];
56        let total_rows = data.len() - 1;
57        let total_columns = header.len();
58        let total_cells = total_rows * total_columns;
59
60        // Sample data if needed
61        let data_to_profile = if let Some(sample_size) = self.sample_size {
62            if total_rows > sample_size {
63                let mut sampled = vec![header.clone()];
64                let step = total_rows / sample_size;
65                for i in (1..=total_rows).step_by(step.max(1)) {
66                    if i < data.len() {
67                        sampled.push(data[i].clone());
68                    }
69                }
70                sampled
71            } else {
72                data.to_vec()
73            }
74        } else {
75            data.to_vec()
76        };
77
78        // Profile each column
79        let mut columns = Vec::new();
80        let mut null_cells = 0;
81
82        for (col_idx, col_name) in header.iter().enumerate() {
83            let column_data: Vec<String> = data_to_profile
84                .iter()
85                .skip(1)
86                .filter_map(|row| row.get(col_idx).cloned())
87                .collect();
88
89            let column_profile = self.profile_column(col_name, &column_data, total_rows)?;
90            null_cells += column_profile.null_count;
91            columns.push(column_profile);
92        }
93
94        // Calculate duplicates
95        let duplicate_rows = self.count_duplicate_rows(&data_to_profile[1..]);
96        let duplicate_percentage = (duplicate_rows as f64 / total_rows as f64) * 100.0;
97        let null_percentage = (null_cells as f64 / total_cells as f64) * 100.0;
98
99        // Calculate overall quality score
100        let data_quality_score =
101            self.calculate_overall_quality_score(&columns, null_percentage, duplicate_percentage);
102
103        // Generate recommendations
104        let recommendations =
105            self.generate_recommendations(&columns, null_percentage, duplicate_percentage);
106
107        Ok(DataProfile {
108            file_path: file_path.to_string(),
109            total_rows,
110            total_columns,
111            total_cells,
112            null_cells,
113            null_percentage,
114            duplicate_rows,
115            duplicate_percentage,
116            columns,
117            data_quality_score,
118            recommendations,
119            profiling_timestamp: chrono::Utc::now().to_rfc3339(),
120        })
121    }
122
123    /// Profile a single column
124    fn profile_column(
125        &self,
126        name: &str,
127        data: &[String],
128        total_rows: usize,
129    ) -> Result<ColumnProfile> {
130        let null_count = data
131            .iter()
132            .filter(|v| string::is_empty_or_whitespace(v))
133            .count();
134        let null_percentage = (null_count as f64 / total_rows as f64) * 100.0;
135
136        // Get distinct values
137        let distinct_values: Vec<String> = collection::unique_preserve_order(
138            &data
139                .iter()
140                .filter(|v| !string::is_empty_or_whitespace(v))
141                .cloned()
142                .collect::<Vec<_>>(),
143        );
144
145        let unique_count = distinct_values.len();
146        let unique_percentage = (unique_count as f64 / total_rows as f64) * 100.0;
147
148        // Get top values
149        let top_values = self.get_value_frequencies(data);
150
151        // Determine data type
152        let data_type = self.infer_data_type(data);
153
154        // Calculate type-specific statistics
155        let length_stats = if matches!(
156            data_type,
157            DataType::String | DataType::Email | DataType::Url | DataType::Phone
158        ) {
159            Some(self.calculate_length_stats(data))
160        } else {
161            None
162        };
163
164        let numeric_stats = if matches!(data_type, DataType::Integer | DataType::Float) {
165            self.calculate_numeric_stats(data)
166        } else {
167            None
168        };
169
170        let date_stats = if matches!(data_type, DataType::Date | DataType::DateTime) {
171            self.calculate_date_stats(data)
172        } else {
173            None
174        };
175
176        let text_stats = if matches!(data_type, DataType::String) {
177            Some(self.calculate_text_stats(data))
178        } else {
179            None
180        };
181
182        // Calculate quality score for this column
183        let quality_score = self.calculate_column_quality_score(
184            null_percentage,
185            unique_percentage,
186            &data_type,
187            length_stats.as_ref(),
188            numeric_stats.as_ref(),
189        );
190
191        Ok(ColumnProfile {
192            name: name.to_string(),
193            data_type,
194            null_count,
195            null_percentage,
196            unique_count,
197            unique_percentage,
198            distinct_values: distinct_values
199                .into_iter()
200                .take(self.max_distinct_values)
201                .collect(),
202            top_values,
203            length_stats,
204            numeric_stats,
205            date_stats,
206            text_stats,
207            quality_score,
208        })
209    }
210
211    /// Count duplicate rows
212    fn count_duplicate_rows(&self, rows: &[Vec<String>]) -> usize {
213        let mut seen = HashSet::new();
214        let mut duplicates = 0;
215
216        for row in rows {
217            let row_str = row.join("|");
218            if seen.contains(&row_str) {
219                duplicates += 1;
220            } else {
221                seen.insert(row_str);
222            }
223        }
224
225        duplicates
226    }
227}
228
229impl Default for DataProfiler {
230    fn default() -> Self {
231        Self::new()
232    }
233}