Skip to main content

xls_rs/profiling/
quality.rs

1//! Quality scoring and reporting methods for data profiling
2
3use super::types::*;
4
5impl super::profiler::DataProfiler {
6    /// Calculate column quality score
7    pub fn calculate_column_quality_score(
8        &self,
9        null_percentage: f64,
10        unique_percentage: f64,
11        data_type: &DataType,
12        length_stats: Option<&LengthStats>,
13        numeric_stats: Option<&NumericStats>,
14    ) -> f64 {
15        let mut score = 100.0;
16
17        // Penalize null values
18        score -= null_percentage * 0.5;
19
20        // Penalize too many unique values for categorical data
21        if matches!(
22            data_type,
23            DataType::String | DataType::Email | DataType::Url | DataType::Phone
24        ) {
25            if unique_percentage > 80.0 {
26                score -= (unique_percentage - 80.0) * 0.2;
27            }
28        }
29
30        // Check for consistent lengths (good for structured data)
31        if let Some(length_stats) = length_stats {
32            let length_variance = length_stats.std_dev_length / length_stats.avg_length;
33            if length_variance < 0.1 {
34                score += 5.0; // Bonus for consistent lengths
35            }
36        }
37
38        // Check for reasonable numeric distributions
39        if let Some(numeric_stats) = numeric_stats {
40            // Penalize extreme skewness
41            if numeric_stats.skewness.abs() > 2.0 {
42                score -= 5.0;
43            }
44
45            // Bonus for reasonable variance
46            if numeric_stats.std_dev > 0.0 && numeric_stats.std_dev < numeric_stats.mean * 2.0 {
47                score += 5.0;
48            }
49        }
50
51        score.max(0.0).min(100.0)
52    }
53
54    /// Calculate overall quality score
55    pub fn calculate_overall_quality_score(
56        &self,
57        columns: &[ColumnProfile],
58        null_percentage: f64,
59        duplicate_percentage: f64,
60    ) -> f64 {
61        let column_scores: f64 = columns.iter().map(|c| c.quality_score).sum();
62        let avg_column_score = column_scores / columns.len() as f64;
63
64        let mut overall_score = avg_column_score;
65
66        // Penalize high null percentage
67        if null_percentage > 10.0 {
68            overall_score -= (null_percentage - 10.0) * 0.3;
69        }
70
71        // Penalize high duplicate percentage
72        if duplicate_percentage > 5.0 {
73            overall_score -= (duplicate_percentage - 5.0) * 0.5;
74        }
75
76        overall_score.max(0.0).min(100.0)
77    }
78
79    /// Generate data quality recommendations
80    pub fn generate_recommendations(
81        &self,
82        columns: &[ColumnProfile],
83        null_percentage: f64,
84        duplicate_percentage: f64,
85    ) -> Vec<String> {
86        let mut recommendations = Vec::new();
87
88        // Overall recommendations
89        if null_percentage > 20.0 {
90            recommendations.push(format!(
91                "High null rate ({:.1}%). Consider data imputation or data quality checks.",
92                null_percentage
93            ));
94        }
95
96        if duplicate_percentage > 10.0 {
97            recommendations.push(format!(
98                "High duplicate rate ({:.1}%). Consider deduplication.",
99                duplicate_percentage
100            ));
101        }
102
103        // Column-specific recommendations
104        for column in columns {
105            if column.null_percentage > 30.0 {
106                recommendations.push(format!(
107                    "Column '{}' has high null rate ({:.1}%).",
108                    column.name, column.null_percentage
109                ));
110            }
111
112            if column.unique_percentage == 100.0 && column.null_percentage == 0.0 {
113                recommendations.push(format!(
114                    "Column '{}' might be a candidate for primary key.",
115                    column.name
116                ));
117            }
118
119            if matches!(
120                column.data_type,
121                DataType::String | DataType::Email | DataType::Url | DataType::Phone
122            ) && column.unique_percentage > 95.0
123            {
124                recommendations.push(format!(
125                    "Column '{}' has many unique values ({:.1}%). Consider if this is intended.",
126                    column.name, column.unique_percentage
127                ));
128            }
129
130            if let Some(numeric_stats) = &column.numeric_stats {
131                if numeric_stats.skewness.abs() > 2.0 {
132                    recommendations.push(format!(
133                        "Column '{}' has high skewness ({:.2}). Consider transformation.",
134                        column.name, numeric_stats.skewness
135                    ));
136                }
137            }
138
139            if let Some(length_stats) = &column.length_stats {
140                if length_stats.std_dev_length / length_stats.avg_length > 0.5 {
141                    recommendations.push(format!(
142                        "Column '{}' has inconsistent length pattern.",
143                        column.name
144                    ));
145                }
146            }
147        }
148
149        recommendations
150    }
151
152    /// Generate a human-readable profile report
153    pub fn generate_report(&self, profile: &DataProfile) -> String {
154        let mut report = String::new();
155
156        report.push_str(&format!("# Data Profile Report: {}\n\n", profile.file_path));
157
158        report.push_str(&format!(
159            "## Summary\n\n\
160             - **Total Rows**: {}\n\
161             - **Total Columns**: {}\n\
162             - **Total Cells**: {}\n\
163             - **Null Cells**: {} ({:.1}%)\n\
164             - **Duplicate Rows**: {} ({:.1}%)\n\
165             - **Data Quality Score**: {:.1}/100\n\n",
166            profile.total_rows,
167            profile.total_columns,
168            profile.total_cells,
169            profile.null_cells,
170            profile.null_percentage,
171            profile.duplicate_rows,
172            profile.duplicate_percentage,
173            profile.data_quality_score
174        ));
175
176        if !profile.recommendations.is_empty() {
177            report.push_str("## Recommendations\n\n");
178            for rec in &profile.recommendations {
179                report.push_str(&format!("- {}\n", rec));
180            }
181            report.push_str("\n");
182        }
183
184        report.push_str("## Column Details\n\n");
185
186        for column in &profile.columns {
187            report.push_str(&format!(
188                "### {}\n\n\
189                 - **Type**: {:?}\n\
190                 - **Quality Score**: {:.1}/100\n\
191                 - **Null Count**: {} ({:.1}%)\n\
192                 - **Unique Count**: {} ({:.1}%)\n",
193                column.name,
194                column.data_type,
195                column.quality_score,
196                column.null_count,
197                column.null_percentage,
198                column.unique_count,
199                column.unique_percentage
200            ));
201
202            if !column.top_values.is_empty() {
203                report.push_str("- **Top Values**:\n");
204                for (i, val) in column.top_values.iter().take(5).enumerate() {
205                    report.push_str(&format!(
206                        "  {}. {} ({}%, {} occurrences)\n",
207                        i + 1,
208                        val.value,
209                        val.percentage,
210                        val.count
211                    ));
212                }
213            }
214
215            if let Some(numeric_stats) = &column.numeric_stats {
216                report.push_str(&format!(
217                    "- **Numeric Stats**: Min={}, Max={}, Mean={:.2}, Median={:.2}, StdDev={:.2}\n",
218                    numeric_stats.min,
219                    numeric_stats.max,
220                    numeric_stats.mean,
221                    numeric_stats.median,
222                    numeric_stats.std_dev
223                ));
224            }
225
226            if let Some(length_stats) = &column.length_stats {
227                report.push_str(&format!(
228                    "- **Length Stats**: Min={}, Max={}, Avg={:.1}, Median={}\n",
229                    length_stats.min_length,
230                    length_stats.max_length,
231                    length_stats.avg_length,
232                    length_stats.median_length
233                ));
234            }
235
236            report.push('\n');
237        }
238
239        report.push_str(&format!(
240            "---\n\n*Report generated on {}*",
241            profile.profiling_timestamp
242        ));
243
244        report
245    }
246}