xls_rs/profiling/
profiler.rs1use crate::common::{collection, string};
4use anyhow::Result;
5use std::collections::HashSet;
6
7use super::types::*;
8
/// Configurable profiler for tabular data supplied as rows of strings.
///
/// Construct it with [`DataProfiler::new`] (or `Default`) and adjust it with
/// the `with_*` builder methods.
#[derive(Debug, Clone)]
pub struct DataProfiler {
    /// Upper bound on how many distinct values are retained in each column profile.
    max_distinct_values: usize,
    /// When set, inputs with more data rows than this are sampled before profiling.
    sample_size: Option<usize>,
}
14
15impl DataProfiler {
16 pub fn new() -> Self {
18 Self {
19 max_distinct_values: 100,
20 sample_size: None,
21 }
22 }
23
24 pub fn with_max_distinct_values(mut self, max: usize) -> Self {
26 self.max_distinct_values = max;
27 self
28 }
29
30 pub fn with_sample_size(mut self, size: usize) -> Self {
32 self.sample_size = Some(size);
33 self
34 }
35
36 pub fn profile(&self, data: &[Vec<String>], file_path: &str) -> Result<DataProfile> {
38 if data.is_empty() {
39 return Ok(DataProfile {
40 file_path: file_path.to_string(),
41 total_rows: 0,
42 total_columns: 0,
43 total_cells: 0,
44 null_cells: 0,
45 null_percentage: 0.0,
46 duplicate_rows: 0,
47 duplicate_percentage: 0.0,
48 columns: Vec::new(),
49 data_quality_score: 0.0,
50 recommendations: Vec::new(),
51 profiling_timestamp: chrono::Utc::now().to_rfc3339(),
52 });
53 }
54
55 let header = &data[0];
56 let total_rows = data.len() - 1;
57 let total_columns = header.len();
58 let total_cells = total_rows * total_columns;
59
60 let data_to_profile = if let Some(sample_size) = self.sample_size {
62 if total_rows > sample_size {
63 let mut sampled = vec![header.clone()];
64 let step = total_rows / sample_size;
65 for i in (1..=total_rows).step_by(step.max(1)) {
66 if i < data.len() {
67 sampled.push(data[i].clone());
68 }
69 }
70 sampled
71 } else {
72 data.to_vec()
73 }
74 } else {
75 data.to_vec()
76 };
77
78 let mut columns = Vec::new();
80 let mut null_cells = 0;
81
82 for (col_idx, col_name) in header.iter().enumerate() {
83 let column_data: Vec<String> = data_to_profile
84 .iter()
85 .skip(1)
86 .filter_map(|row| row.get(col_idx).cloned())
87 .collect();
88
89 let column_profile = self.profile_column(col_name, &column_data, total_rows)?;
90 null_cells += column_profile.null_count;
91 columns.push(column_profile);
92 }
93
94 let duplicate_rows = self.count_duplicate_rows(&data_to_profile[1..]);
96 let duplicate_percentage = (duplicate_rows as f64 / total_rows as f64) * 100.0;
97 let null_percentage = (null_cells as f64 / total_cells as f64) * 100.0;
98
99 let data_quality_score =
101 self.calculate_overall_quality_score(&columns, null_percentage, duplicate_percentage);
102
103 let recommendations =
105 self.generate_recommendations(&columns, null_percentage, duplicate_percentage);
106
107 Ok(DataProfile {
108 file_path: file_path.to_string(),
109 total_rows,
110 total_columns,
111 total_cells,
112 null_cells,
113 null_percentage,
114 duplicate_rows,
115 duplicate_percentage,
116 columns,
117 data_quality_score,
118 recommendations,
119 profiling_timestamp: chrono::Utc::now().to_rfc3339(),
120 })
121 }
122
123 fn profile_column(
125 &self,
126 name: &str,
127 data: &[String],
128 total_rows: usize,
129 ) -> Result<ColumnProfile> {
130 let null_count = data
131 .iter()
132 .filter(|v| string::is_empty_or_whitespace(v))
133 .count();
134 let null_percentage = (null_count as f64 / total_rows as f64) * 100.0;
135
136 let distinct_values: Vec<String> = collection::unique_preserve_order(
138 &data
139 .iter()
140 .filter(|v| !string::is_empty_or_whitespace(v))
141 .cloned()
142 .collect::<Vec<_>>(),
143 );
144
145 let unique_count = distinct_values.len();
146 let unique_percentage = (unique_count as f64 / total_rows as f64) * 100.0;
147
148 let top_values = self.get_value_frequencies(data);
150
151 let data_type = self.infer_data_type(data);
153
154 let length_stats = if matches!(
156 data_type,
157 DataType::String | DataType::Email | DataType::Url | DataType::Phone
158 ) {
159 Some(self.calculate_length_stats(data))
160 } else {
161 None
162 };
163
164 let numeric_stats = if matches!(data_type, DataType::Integer | DataType::Float) {
165 self.calculate_numeric_stats(data)
166 } else {
167 None
168 };
169
170 let date_stats = if matches!(data_type, DataType::Date | DataType::DateTime) {
171 self.calculate_date_stats(data)
172 } else {
173 None
174 };
175
176 let text_stats = if matches!(data_type, DataType::String) {
177 Some(self.calculate_text_stats(data))
178 } else {
179 None
180 };
181
182 let quality_score = self.calculate_column_quality_score(
184 null_percentage,
185 unique_percentage,
186 &data_type,
187 length_stats.as_ref(),
188 numeric_stats.as_ref(),
189 );
190
191 Ok(ColumnProfile {
192 name: name.to_string(),
193 data_type,
194 null_count,
195 null_percentage,
196 unique_count,
197 unique_percentage,
198 distinct_values: distinct_values
199 .into_iter()
200 .take(self.max_distinct_values)
201 .collect(),
202 top_values,
203 length_stats,
204 numeric_stats,
205 date_stats,
206 text_stats,
207 quality_score,
208 })
209 }
210
211 fn count_duplicate_rows(&self, rows: &[Vec<String>]) -> usize {
213 let mut seen = HashSet::new();
214 let mut duplicates = 0;
215
216 for row in rows {
217 let row_str = row.join("|");
218 if seen.contains(&row_str) {
219 duplicates += 1;
220 } else {
221 seen.insert(row_str);
222 }
223 }
224
225 duplicates
226 }
227}
228
229impl Default for DataProfiler {
230 fn default() -> Self {
231 Self::new()
232 }
233}