Skip to main content

xls_rs/
quality.rs

//! Automated data quality reports.
//!
//! Generates comprehensive data quality reports with recommendations.
4
5use crate::anomaly::{AnomalyDetector, AnomalyMethod};
6use crate::profiling::DataProfiler;
7use anyhow::Result;
8use serde::{Deserialize, Serialize};
9
10/// Data quality report
11#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct QualityReport {
13    pub file_path: String,
14    pub timestamp: String,
15    pub overall_score: f64,
16    pub categories: QualityCategories,
17    pub issues: Vec<QualityIssue>,
18    pub recommendations: Vec<String>,
19}
20
21/// Quality categories
22#[derive(Debug, Clone, Serialize, Deserialize)]
23pub struct QualityCategories {
24    pub completeness: f64,
25    pub accuracy: f64,
26    pub consistency: f64,
27    pub validity: f64,
28    pub uniqueness: f64,
29}
30
31/// Quality issue
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct QualityIssue {
34    pub severity: IssueSeverity,
35    pub category: String,
36    pub description: String,
37    pub affected_rows: Option<usize>,
38    pub affected_columns: Option<Vec<String>>,
39}
40
41/// Issue severity
42#[derive(Debug, Clone, Serialize, Deserialize)]
43pub enum IssueSeverity {
44    Critical,
45    High,
46    Medium,
47    Low,
48    Info,
49}
50
51/// Quality report generator
52pub struct QualityReportGenerator {
53    profiler: DataProfiler,
54}
55
56impl QualityReportGenerator {
57    pub fn new() -> Self {
58        Self {
59            profiler: DataProfiler::new(),
60        }
61    }
62
63    /// Generate quality report
64    pub fn generate(&self, data: &[Vec<String>], file_path: &str) -> Result<QualityReport> {
65        // Profile data
66        let profile = self.profiler.profile(data, file_path)?;
67
68        // Calculate quality scores
69        let completeness = 100.0 - profile.null_percentage;
70        let uniqueness = 100.0 - profile.duplicate_percentage;
71
72        // Check for anomalies
73        let mut issues = Vec::new();
74        let mut accuracy_score = 100.0;
75
76        for (col_idx, col_profile) in profile.columns.iter().enumerate() {
77            if col_profile.null_percentage > 50.0 {
78                issues.push(QualityIssue {
79                    severity: IssueSeverity::High,
80                    category: "Completeness".to_string(),
81                    description: format!(
82                        "Column '{}' has {:.1}% null values",
83                        col_profile.name, col_profile.null_percentage
84                    ),
85                    affected_rows: None,
86                    affected_columns: Some(vec![col_profile.name.clone()]),
87                });
88                accuracy_score -= 10.0;
89            }
90
91            // Check for anomalies in numeric columns
92            if matches!(
93                col_profile.data_type,
94                crate::profiling::DataType::Integer | crate::profiling::DataType::Float
95            ) {
96                let detector = AnomalyDetector::new(AnomalyMethod::ZScore { threshold: 3.0 });
97                if let Ok(anomaly_result) = detector.detect(data, col_idx) {
98                    if anomaly_result.anomaly_percentage > 5.0 {
99                        issues.push(QualityIssue {
100                            severity: IssueSeverity::Medium,
101                            category: "Accuracy".to_string(),
102                            description: format!(
103                                "Column '{}' has {:.1}% anomalies",
104                                col_profile.name, anomaly_result.anomaly_percentage
105                            ),
106                            affected_rows: Some(anomaly_result.total_anomalies),
107                            affected_columns: Some(vec![col_profile.name.clone()]),
108                        });
109                        accuracy_score -= anomaly_result.anomaly_percentage;
110                    }
111                }
112            }
113        }
114
115        let consistency = profile.data_quality_score;
116        let validity = 100.0 - (issues.len() as f64 * 5.0).min(50.0);
117
118        let categories = QualityCategories {
119            completeness,
120            accuracy: accuracy_score.max(0.0),
121            consistency,
122            validity,
123            uniqueness,
124        };
125
126        let overall_score = (categories.completeness
127            + categories.accuracy
128            + categories.consistency
129            + categories.validity
130            + categories.uniqueness)
131            / 5.0;
132
133        // Generate recommendations
134        let recommendations = self.generate_recommendations(&profile, &issues);
135
136        Ok(QualityReport {
137            file_path: file_path.to_string(),
138            timestamp: chrono::Utc::now().to_rfc3339(),
139            overall_score,
140            categories,
141            issues,
142            recommendations,
143        })
144    }
145
146    fn generate_recommendations(
147        &self,
148        profile: &crate::profiling::DataProfile,
149        issues: &[QualityIssue],
150    ) -> Vec<String> {
151        let mut recommendations = Vec::new();
152
153        if profile.null_percentage > 10.0 {
154            recommendations.push(format!(
155                "Consider filling {}% null values using fillna command",
156                profile.null_percentage
157            ));
158        }
159
160        if profile.duplicate_percentage > 5.0 {
161            recommendations.push(format!(
162                "Remove {}% duplicate rows using dedupe command",
163                profile.duplicate_percentage
164            ));
165        }
166
167        for issue in issues {
168            match issue.severity {
169                IssueSeverity::Critical | IssueSeverity::High => {
170                    recommendations.push(format!(
171                        "Fix {} issue: {}",
172                        issue.category, issue.description
173                    ));
174                }
175                _ => {}
176            }
177        }
178
179        recommendations
180    }
181
182    /// Save report to file
183    pub fn save_report(&self, report: &QualityReport, path: &str) -> Result<()> {
184        let json = serde_json::to_string_pretty(report)?;
185        std::fs::write(path, json)?;
186        Ok(())
187    }
188}