// xls_rs/profiling/quality.rs
use super::types::*;

5impl super::profiler::DataProfiler {
6 pub fn calculate_column_quality_score(
8 &self,
9 null_percentage: f64,
10 unique_percentage: f64,
11 data_type: &DataType,
12 length_stats: Option<&LengthStats>,
13 numeric_stats: Option<&NumericStats>,
14 ) -> f64 {
15 let mut score = 100.0;
16
17 score -= null_percentage * 0.5;
19
20 if matches!(
22 data_type,
23 DataType::String | DataType::Email | DataType::Url | DataType::Phone
24 ) {
25 if unique_percentage > 80.0 {
26 score -= (unique_percentage - 80.0) * 0.2;
27 }
28 }
29
30 if let Some(length_stats) = length_stats {
32 let length_variance = length_stats.std_dev_length / length_stats.avg_length;
33 if length_variance < 0.1 {
34 score += 5.0; }
36 }
37
38 if let Some(numeric_stats) = numeric_stats {
40 if numeric_stats.skewness.abs() > 2.0 {
42 score -= 5.0;
43 }
44
45 if numeric_stats.std_dev > 0.0 && numeric_stats.std_dev < numeric_stats.mean * 2.0 {
47 score += 5.0;
48 }
49 }
50
51 score.max(0.0).min(100.0)
52 }
53
54 pub fn calculate_overall_quality_score(
56 &self,
57 columns: &[ColumnProfile],
58 null_percentage: f64,
59 duplicate_percentage: f64,
60 ) -> f64 {
61 let column_scores: f64 = columns.iter().map(|c| c.quality_score).sum();
62 let avg_column_score = column_scores / columns.len() as f64;
63
64 let mut overall_score = avg_column_score;
65
66 if null_percentage > 10.0 {
68 overall_score -= (null_percentage - 10.0) * 0.3;
69 }
70
71 if duplicate_percentage > 5.0 {
73 overall_score -= (duplicate_percentage - 5.0) * 0.5;
74 }
75
76 overall_score.max(0.0).min(100.0)
77 }
78
79 pub fn generate_recommendations(
81 &self,
82 columns: &[ColumnProfile],
83 null_percentage: f64,
84 duplicate_percentage: f64,
85 ) -> Vec<String> {
86 let mut recommendations = Vec::new();
87
88 if null_percentage > 20.0 {
90 recommendations.push(format!(
91 "High null rate ({:.1}%). Consider data imputation or data quality checks.",
92 null_percentage
93 ));
94 }
95
96 if duplicate_percentage > 10.0 {
97 recommendations.push(format!(
98 "High duplicate rate ({:.1}%). Consider deduplication.",
99 duplicate_percentage
100 ));
101 }
102
103 for column in columns {
105 if column.null_percentage > 30.0 {
106 recommendations.push(format!(
107 "Column '{}' has high null rate ({:.1}%).",
108 column.name, column.null_percentage
109 ));
110 }
111
112 if column.unique_percentage == 100.0 && column.null_percentage == 0.0 {
113 recommendations.push(format!(
114 "Column '{}' might be a candidate for primary key.",
115 column.name
116 ));
117 }
118
119 if matches!(
120 column.data_type,
121 DataType::String | DataType::Email | DataType::Url | DataType::Phone
122 ) && column.unique_percentage > 95.0
123 {
124 recommendations.push(format!(
125 "Column '{}' has many unique values ({:.1}%). Consider if this is intended.",
126 column.name, column.unique_percentage
127 ));
128 }
129
130 if let Some(numeric_stats) = &column.numeric_stats {
131 if numeric_stats.skewness.abs() > 2.0 {
132 recommendations.push(format!(
133 "Column '{}' has high skewness ({:.2}). Consider transformation.",
134 column.name, numeric_stats.skewness
135 ));
136 }
137 }
138
139 if let Some(length_stats) = &column.length_stats {
140 if length_stats.std_dev_length / length_stats.avg_length > 0.5 {
141 recommendations.push(format!(
142 "Column '{}' has inconsistent length pattern.",
143 column.name
144 ));
145 }
146 }
147 }
148
149 recommendations
150 }
151
152 pub fn generate_report(&self, profile: &DataProfile) -> String {
154 let mut report = String::new();
155
156 report.push_str(&format!("# Data Profile Report: {}\n\n", profile.file_path));
157
158 report.push_str(&format!(
159 "## Summary\n\n\
160 - **Total Rows**: {}\n\
161 - **Total Columns**: {}\n\
162 - **Total Cells**: {}\n\
163 - **Null Cells**: {} ({:.1}%)\n\
164 - **Duplicate Rows**: {} ({:.1}%)\n\
165 - **Data Quality Score**: {:.1}/100\n\n",
166 profile.total_rows,
167 profile.total_columns,
168 profile.total_cells,
169 profile.null_cells,
170 profile.null_percentage,
171 profile.duplicate_rows,
172 profile.duplicate_percentage,
173 profile.data_quality_score
174 ));
175
176 if !profile.recommendations.is_empty() {
177 report.push_str("## Recommendations\n\n");
178 for rec in &profile.recommendations {
179 report.push_str(&format!("- {}\n", rec));
180 }
181 report.push_str("\n");
182 }
183
184 report.push_str("## Column Details\n\n");
185
186 for column in &profile.columns {
187 report.push_str(&format!(
188 "### {}\n\n\
189 - **Type**: {:?}\n\
190 - **Quality Score**: {:.1}/100\n\
191 - **Null Count**: {} ({:.1}%)\n\
192 - **Unique Count**: {} ({:.1}%)\n",
193 column.name,
194 column.data_type,
195 column.quality_score,
196 column.null_count,
197 column.null_percentage,
198 column.unique_count,
199 column.unique_percentage
200 ));
201
202 if !column.top_values.is_empty() {
203 report.push_str("- **Top Values**:\n");
204 for (i, val) in column.top_values.iter().take(5).enumerate() {
205 report.push_str(&format!(
206 " {}. {} ({}%, {} occurrences)\n",
207 i + 1,
208 val.value,
209 val.percentage,
210 val.count
211 ));
212 }
213 }
214
215 if let Some(numeric_stats) = &column.numeric_stats {
216 report.push_str(&format!(
217 "- **Numeric Stats**: Min={}, Max={}, Mean={:.2}, Median={:.2}, StdDev={:.2}\n",
218 numeric_stats.min,
219 numeric_stats.max,
220 numeric_stats.mean,
221 numeric_stats.median,
222 numeric_stats.std_dev
223 ));
224 }
225
226 if let Some(length_stats) = &column.length_stats {
227 report.push_str(&format!(
228 "- **Length Stats**: Min={}, Max={}, Avg={:.1}, Median={}\n",
229 length_stats.min_length,
230 length_stats.max_length,
231 length_stats.avg_length,
232 length_stats.median_length
233 ));
234 }
235
236 report.push('\n');
237 }
238
239 report.push_str(&format!(
240 "---\n\n*Report generated on {}*",
241 profile.profiling_timestamp
242 ));
243
244 report
245 }
246}