scirs2_core/validation/data/
quality.rs

1//! Data quality assessment and reporting
2//!
3//! This module provides comprehensive data quality assessment capabilities,
4//! including quality metrics calculation, issue detection, and recommendation generation.
5
6use std::fmt;
7
8// Core dependencies for array/matrix validation
9use ::ndarray::{ArrayBase, Data, Dimension, ScalarOperand};
10use num_traits::{Float, FromPrimitive, ToPrimitive};
11
12use super::config::{ErrorSeverity, QualityIssueType};
13use crate::error::CoreError;
14
15use serde::{Deserialize, Serialize};
16
/// Data quality assessment result.
///
/// Returned by `QualityAnalyzer::generate_quality_report`. Bundles the aggregate
/// score, the individual metrics it was derived from, the issues that were
/// detected, and human-readable recommendations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataQualityReport {
    /// Overall quality score (0.0 to 1.0).
    ///
    /// The analyzer in this module sets it to the unweighted mean of
    /// completeness, validity, consistency and accuracy.
    pub quality_score: f64,
    /// Detailed quality metrics that `quality_score` was computed from.
    pub metrics: QualityMetrics,
    /// Issues found during validation, in the order they were detected.
    pub issues: Vec<QualityIssue>,
    /// Free-text recommendations for improving the data.
    pub recommendations: Vec<String>,
}
29
/// Detailed quality metrics.
///
/// Each ratio is a value between 0.0 and 1.0 where higher is better.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityMetrics {
    /// Completeness: fraction of elements that are not NaN
    /// (the analyzer reports 1.0 for an empty array).
    pub completeness: f64,
    /// Consistency (pattern conformance), as computed by the analyzer's
    /// consistency check.
    pub consistency: f64,
    /// Accuracy (constraint compliance). The analyzer reports 0.8 when a
    /// `ConstraintViolation` issue is present and 1.0 otherwise.
    pub accuracy: f64,
    /// Validity: fraction of elements that are neither NaN nor infinite
    /// (the analyzer reports 1.0 for an empty array).
    pub validity: f64,
    /// Statistical properties of the finite values; `None` when the array
    /// contains no finite value.
    pub statistical_summary: Option<StatisticalSummary>,
}
44
/// Statistical summary of numeric data.
///
/// The analyzer computes these figures over the finite values of an array only;
/// NaN and infinite elements are excluded beforehand.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalSummary {
    /// Number of data points that were summarized.
    pub count: usize,
    /// Arithmetic mean.
    pub mean: f64,
    /// Standard deviation (the analyzer divides by `count`, i.e. the
    /// population form).
    pub std_dev: f64,
    /// Minimum value.
    pub min: f64,
    /// Maximum value.
    pub max: f64,
    /// Number of outliers detected (the analyzer uses a 1.5 * IQR rule).
    pub outliers: usize,
    /// Data distribution label, if detectable. The analyzer emits
    /// `"approximately_normal"`, `"right_skewed"`, `"left_skewed"` or
    /// `"constant"`, and `None` when there are fewer than 10 values.
    pub distribution: Option<String>,
}
63
/// Quality issue found during validation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityIssue {
    /// Category of the issue.
    pub issue_type: QualityIssueType,
    /// Location where the issue was found; the analyzer in this module stores
    /// the field name supplied by the caller.
    pub location: String,
    /// Human-readable description of the issue.
    pub description: String,
    /// Severity of the issue.
    pub severity: ErrorSeverity,
    /// Suggested fix, when one is available.
    pub suggestion: Option<String>,
}
78
/// Data quality analyzer.
///
/// A stateless unit type: all analysis inputs are passed to its methods, so
/// values are freely copyable and printable for diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct QualityAnalyzer;
81
82impl QualityAnalyzer {
83    /// Create new quality analyzer
84    pub fn new() -> Self {
85        Self
86    }
87
88    /// Generate comprehensive data quality report for arrays
89    pub fn generate_quality_report<S, D>(
90        &self,
91        array: &ArrayBase<S, D>,
92        fieldname: &str,
93    ) -> Result<DataQualityReport, CoreError>
94    where
95        S: Data,
96        D: Dimension,
97        S::Elem: Float + fmt::Debug + ScalarOperand + Send + Sync + FromPrimitive,
98    {
99        let mut issues = Vec::new();
100        let mut recommendations = Vec::new();
101
102        // Calculate completeness (non-NaN ratio)
103        let total_elements = array.len();
104        let nan_count = array.iter().filter(|&&x| x.is_nan()).count();
105        let completeness = if total_elements > 0 {
106            (total_elements - nan_count) as f64 / total_elements as f64
107        } else {
108            1.0
109        };
110
111        if completeness < 0.95 {
112            issues.push(QualityIssue {
113                issue_type: QualityIssueType::MissingData,
114                location: fieldname.to_string(),
115                description: format!("Low data completeness: {:.1}%", completeness * 100.0),
116                severity: if completeness < 0.8 {
117                    ErrorSeverity::Error
118                } else {
119                    ErrorSeverity::Warning
120                },
121                suggestion: Some(
122                    "Consider data imputation or removal of incomplete records".to_string(),
123                ),
124            });
125
126            if completeness < 0.8 {
127                recommendations.push("Critical: Data completeness is below 80%. Consider data quality improvement before analysis.".to_string());
128            }
129        }
130
131        // Calculate validity (finite values ratio)
132        let inf_count = array.iter().filter(|&&x| x.is_infinite()).count();
133        let validity = if total_elements > 0 {
134            (total_elements - nan_count - inf_count) as f64 / total_elements as f64
135        } else {
136            1.0
137        };
138
139        if validity < 1.0 {
140            issues.push(QualityIssue {
141                issue_type: QualityIssueType::InvalidNumeric,
142                location: fieldname.to_string(),
143                description: format!(
144                    "Invalid numeric values detected: {:.1}% valid",
145                    validity * 100.0
146                ),
147                severity: ErrorSeverity::Warning,
148                suggestion: Some("Remove or replace NaN and infinite values".to_string()),
149            });
150        }
151
152        // Statistical summary
153        let statistical_summary = if total_elements > 0 && nan_count < total_elements {
154            let finite_values: Vec<_> = array.iter().filter(|&&x| x.is_finite()).cloned().collect();
155            if !finite_values.is_empty() {
156                self.calculate_statistical_summary(&finite_values)?
157            } else {
158                None
159            }
160        } else {
161            None
162        };
163
164        // Detect outliers if we have statistical summary
165        if let Some(ref stats) = statistical_summary {
166            let outlier_issues = self.detect_outliers(array, stats, fieldname)?;
167            issues.extend(outlier_issues);
168        }
169
170        // Calculate overall quality score
171        let consistency = self.calculate_consistency(array)?;
172        let accuracy = if issues
173            .iter()
174            .any(|i| matches!(i.issue_type, QualityIssueType::ConstraintViolation))
175        {
176            0.8
177        } else {
178            1.0
179        };
180
181        let quality_score = (completeness + validity + consistency + accuracy) / 4.0;
182
183        // Add performance recommendations
184        if total_elements > 1_000_000 {
185            recommendations.push(
186                "Large dataset detected. Consider parallel processing for better performance."
187                    .to_string(),
188            );
189        }
190
191        if quality_score < 0.8 {
192            recommendations.push(
193                "Overall data quality is low. Review data collection and preprocessing procedures."
194                    .to_string(),
195            );
196        }
197
198        // Add specific recommendations based on issues
199        self.add_specific_recommendations(&issues, &mut recommendations);
200
201        Ok(DataQualityReport {
202            quality_score,
203            metrics: QualityMetrics {
204                completeness,
205                consistency,
206                accuracy,
207                validity,
208                statistical_summary,
209            },
210            issues,
211            recommendations,
212        })
213    }
214
215    /// Calculate statistical summary for finite values
216    fn calculate_statistical_summary<T>(
217        &self,
218        finite_values: &[T],
219    ) -> Result<Option<StatisticalSummary>, CoreError>
220    where
221        T: Float + Copy + FromPrimitive,
222    {
223        if finite_values.is_empty() {
224            return Ok(None);
225        }
226
227        let mean = finite_values.iter().fold(T::zero(), |acc, &x| acc + x)
228            / num_traits::cast(finite_values.len()).unwrap_or(T::one());
229
230        let variance = finite_values
231            .iter()
232            .map(|&x| {
233                let diff = x - mean;
234                diff * diff
235            })
236            .fold(T::zero(), |acc, x| acc + x)
237            / num_traits::cast(finite_values.len()).unwrap_or(T::one());
238
239        let std_dev = variance.sqrt();
240        let min_val = finite_values
241            .iter()
242            .fold(finite_values[0], |acc, &x| if x < acc { x } else { acc });
243        let max_val = finite_values
244            .iter()
245            .fold(finite_values[0], |acc, &x| if x > acc { x } else { acc });
246
247        // Simple outlier detection using IQR method
248        let mut sortedvalues = finite_values.to_vec();
249        sortedvalues.sort_by(|a, b| a.partial_cmp(b).expect("Operation failed"));
250        let outliers = self.count_outliers_iqr(&sortedvalues);
251
252        // Basic distribution detection
253        let distribution = self.detect_distribution(&sortedvalues);
254
255        Ok(Some(StatisticalSummary {
256            count: finite_values.len(),
257            mean: num_traits::cast(mean).unwrap_or(0.0),
258            std_dev: num_traits::cast(std_dev).unwrap_or(0.0),
259            min: num_traits::cast(min_val).unwrap_or(0.0),
260            max: num_traits::cast(max_val).unwrap_or(0.0),
261            outliers,
262            distribution,
263        }))
264    }
265
266    /// Count outliers using IQR method
267    fn count_outliers_iqr<T>(&self, sortedvalues: &[T]) -> usize
268    where
269        T: Float + Copy,
270    {
271        if sortedvalues.len() < 4 {
272            return 0;
273        }
274
275        let q1_index = sortedvalues.len() / 4;
276        let q3_index = 3 * sortedvalues.len() / 4;
277        let q1 = sortedvalues[q1_index];
278        let q3 = sortedvalues[q3_index];
279        let iqr = q3 - q1;
280        let lower_bound = q1 - iqr * num_traits::cast(1.5).unwrap_or(T::one());
281        let upper_bound = q3 + iqr * num_traits::cast(1.5).unwrap_or(T::one());
282
283        sortedvalues
284            .iter()
285            .filter(|&&x| x < lower_bound || x > upper_bound)
286            .count()
287    }
288
289    /// Basic distribution detection
290    fn detect_distribution<T>(&self, sortedvalues: &[T]) -> Option<String>
291    where
292        T: Float + Copy + FromPrimitive,
293    {
294        if sortedvalues.len() < 10 {
295            return None;
296        }
297
298        // Simple skewness calculation
299        let mean = sortedvalues.iter().fold(T::zero(), |acc, &x| acc + x)
300            / num_traits::cast(sortedvalues.len()).unwrap_or(T::one());
301
302        let variance = sortedvalues
303            .iter()
304            .map(|&x| {
305                let diff = x - mean;
306                diff * diff
307            })
308            .fold(T::zero(), |acc, x| acc + x)
309            / num_traits::cast(sortedvalues.len()).unwrap_or(T::one());
310
311        let std_dev = variance.sqrt();
312
313        if std_dev > T::zero() {
314            let skewness = sortedvalues
315                .iter()
316                .map(|&x| {
317                    let diff = (x - mean) / std_dev;
318                    diff * diff * diff
319                })
320                .fold(T::zero(), |acc, x| acc + x)
321                / num_traits::cast(sortedvalues.len()).unwrap_or(T::one());
322
323            let skewness_f64: f64 = num_traits::cast(skewness).unwrap_or(0.0);
324
325            if skewness_f64.abs() < 0.5 {
326                Some("approximately_normal".to_string())
327            } else if skewness_f64 > 0.5 {
328                Some("right_skewed".to_string())
329            } else {
330                Some("left_skewed".to_string())
331            }
332        } else {
333            Some("constant".to_string())
334        }
335    }
336
337    /// Detect outliers and create quality issues
338    fn detect_outliers<S, D>(
339        &self,
340        array: &ArrayBase<S, D>,
341        stats: &StatisticalSummary,
342        fieldname: &str,
343    ) -> Result<Vec<QualityIssue>, CoreError>
344    where
345        S: Data,
346        D: Dimension,
347        S::Elem: Float + fmt::Debug,
348    {
349        let mut issues = Vec::new();
350
351        if stats.outliers > 0 {
352            let outlier_percentage = (stats.outliers as f64 / stats.count as f64) * 100.0;
353
354            if outlier_percentage > 5.0 {
355                issues.push(QualityIssue {
356                    issue_type: QualityIssueType::Outlier,
357                    location: fieldname.to_string(),
358                    description: format!(
359                        "High number of outliers detected: {} ({:.1}%)",
360                        stats.outliers, outlier_percentage
361                    ),
362                    severity: if outlier_percentage > 15.0 {
363                        ErrorSeverity::Error
364                    } else {
365                        ErrorSeverity::Warning
366                    },
367                    suggestion: Some(
368                        "Review outliers for data quality issues or consider outlier treatment"
369                            .to_string(),
370                    ),
371                });
372            }
373        }
374
375        Ok(issues)
376    }
377
378    /// Calculate data consistency score
379    fn calculate_consistency<S, D>(&self, array: &ArrayBase<S, D>) -> Result<f64, CoreError>
380    where
381        S: Data,
382        D: Dimension,
383        S::Elem: Float,
384    {
385        // Implement pattern consistency checking
386        let array_size = array.len();
387
388        if array_size < 3 {
389            // Too small to check patterns
390            return Ok(1.0);
391        }
392
393        let values: Vec<f64> = array.iter().filter_map(|&x| x.to_f64()).collect();
394
395        if values.len() < 3 {
396            // Not enough valid values to check patterns
397            return Ok(1.0);
398        }
399
400        // Check for consistent differences (arithmetic progression)
401        let mut diff_scores = Vec::new();
402        for i in 1..values.len() {
403            diff_scores.push(values[i] - values[i.saturating_sub(1)]);
404        }
405
406        // Calculate variance of differences
407        let mean_diff = diff_scores.iter().sum::<f64>() / diff_scores.len() as f64;
408        let variance = diff_scores
409            .iter()
410            .map(|&d| (d - mean_diff).powi(2))
411            .sum::<f64>()
412            / diff_scores.len() as f64;
413
414        // Check for periodic patterns
415        let mut period_score = 1.0;
416        for period in 2..((values.len() / 2).min(10)) {
417            let mut matches = 0;
418            let mut comparisons = 0;
419
420            for i in period..values.len() {
421                if (values[i] - values[i - period]).abs() < 1e-10 {
422                    matches += 1;
423                }
424                comparisons += 1;
425            }
426
427            if comparisons > 0 {
428                let current_score = matches as f64 / comparisons as f64;
429                period_score = period_score.max(current_score);
430            }
431        }
432
433        // Combine scores: lower variance in differences = higher consistency
434        // Also consider periodic patterns
435        let diff_consistency = if variance > 0.0 {
436            (-variance.ln()).exp().clamp(0.0, 1.0)
437        } else {
438            1.0 // Perfect arithmetic progression
439        };
440
441        // Final score is weighted average
442        let consistency_score = 0.7 * diff_consistency + 0.3 * period_score;
443
444        Ok(consistency_score.clamp(0.0, 1.0))
445    }
446
447    /// Add specific recommendations based on detected issues
448    fn add_specific_recommendations(
449        &self,
450        issues: &[QualityIssue],
451        recommendations: &mut Vec<String>,
452    ) {
453        let has_missing_data = issues
454            .iter()
455            .any(|i| matches!(i.issue_type, QualityIssueType::MissingData));
456        let has_invalid_numeric = issues
457            .iter()
458            .any(|i| matches!(i.issue_type, QualityIssueType::InvalidNumeric));
459        let has_outliers = issues
460            .iter()
461            .any(|i| matches!(i.issue_type, QualityIssueType::Outlier));
462
463        if has_missing_data {
464            recommendations.push("Consider using imputation techniques (mean, median, mode, or forward-fill) for missing values.".to_string());
465        }
466
467        if has_invalid_numeric {
468            recommendations
469                .push("Remove or replace NaN and infinite values before analysis.".to_string());
470        }
471
472        if has_outliers {
473            recommendations.push(
474                "Investigate outliers - they may indicate data errors or interesting edge cases."
475                    .to_string(),
476            );
477        }
478
479        if has_missing_data && has_invalid_numeric {
480            recommendations.push("Consider a comprehensive data cleaning pipeline to address multiple quality issues.".to_string());
481        }
482    }
483}
484
485impl Default for QualityAnalyzer {
486    fn default() -> Self {
487        Self::new()
488    }
489}
490
491impl DataQualityReport {
492    /// Get formatted report string
493    pub fn formatted_report(&self) -> String {
494        let mut report = "Data Quality Report\n".to_string();
495        report.push_str("==================\n\n");
496        report.push_str(&format!(
497            "Overall Quality Score: {:.2}\n\n",
498            self.quality_score
499        ));
500
501        report.push_str("Metrics:\n");
502        report.push_str(&format!(
503            "  Completeness: {:.1}%\n",
504            self.metrics.completeness * 100.0
505        ));
506        report.push_str(&format!(
507            "  Validity: {:.1}%\n",
508            self.metrics.validity * 100.0
509        ));
510        report.push_str(&format!(
511            "  Consistency: {:.1}%\n",
512            self.metrics.consistency * 100.0
513        ));
514        report.push_str(&format!(
515            "  Accuracy: {:.1}%\n\n",
516            self.metrics.accuracy * 100.0
517        ));
518
519        if let Some(ref stats) = self.metrics.statistical_summary {
520            report.push_str("Statistical Summary:\n");
521            report.push_str(&format!("  Count: {}\n", stats.count));
522            report.push_str(&format!("  Mean: {:.6}\n", stats.mean));
523            report.push_str(&format!("  Std Dev: {:.6}\n", stats.std_dev));
524            report.push_str(&format!("  Min: {:.6}\n", stats.min));
525            report.push_str(&format!("  Max: {:.6}\n", stats.max));
526            report.push_str(&format!("  Outliers: {}\n", stats.outliers));
527            if let Some(ref dist) = stats.distribution {
528                report.push_str(&format!("  Distribution: {}\n", dist));
529            }
530            report.push('\n');
531        }
532
533        if !self.issues.is_empty() {
534            report.push_str("Issues Found:\n");
535            for (i, issue) in self.issues.iter().enumerate() {
536                report.push_str(&format!(
537                    "  {}. [{:?}] {}: {}\n",
538                    i + 1,
539                    issue.severity,
540                    issue.location,
541                    issue.description
542                ));
543                if let Some(ref suggestion) = issue.suggestion {
544                    report.push_str(&format!("     Suggestion: {}\n", suggestion));
545                }
546            }
547            report.push('\n');
548        }
549
550        if !self.recommendations.is_empty() {
551            report.push_str("Recommendations:\n");
552            for (i, rec) in self.recommendations.iter().enumerate() {
553                report.push_str(&format!("  {}. {}\n", i + 1, rec));
554            }
555        }
556
557        report
558    }
559
560    /// Check if quality is acceptable (score >= threshold)
561    pub fn is_acceptable(&self, threshold: f64) -> bool {
562        self.quality_score >= threshold
563    }
564
565    /// Get critical issues
566    pub fn get_critical_issues(&self) -> Vec<&QualityIssue> {
567        self.issues
568            .iter()
569            .filter(|issue| issue.severity == ErrorSeverity::Critical)
570            .collect()
571    }
572
573    /// Get issues by type
574    pub fn get_issues_by_type(&self, issuetype: QualityIssueType) -> Vec<&QualityIssue> {
575        self.issues
576            .iter()
577            .filter(|issue| issue.issue_type == issuetype)
578            .collect()
579    }
580}
581
#[cfg(test)]
mod tests {
    use super::*;
    use ::ndarray::Array1;

    /// Run the analyzer over `values` under the field name shared by all tests.
    fn report_for(values: Vec<f64>) -> DataQualityReport {
        let data = Array1::from_vec(values);
        QualityAnalyzer::new()
            .generate_quality_report(&data, "test_field")
            .expect("Operation failed")
    }

    #[test]
    fn test_quality_analyzer() {
        let report = report_for(vec![1.0, 2.0, 3.0, 4.0, 5.0]);

        // Clean data: high score, nothing missing, nothing invalid, no issues.
        assert!(report.quality_score > 0.9);
        assert_eq!(report.metrics.completeness, 1.0);
        assert_eq!(report.metrics.validity, 1.0);
        assert!(report.issues.is_empty());
    }

    #[test]
    fn test_quality_with_missing_data() {
        let report = report_for(vec![1.0, f64::NAN, 3.0, 4.0, 5.0]);

        // A NaN lowers completeness and must surface as a missing-data issue.
        assert!(report.metrics.completeness < 1.0);
        assert!(!report.issues.is_empty());

        let missing = report.get_issues_by_type(QualityIssueType::MissingData);
        assert!(!missing.is_empty());
    }

    #[test]
    fn test_quality_with_infinite_values() {
        let report = report_for(vec![1.0, 2.0, f64::INFINITY, 4.0, 5.0]);

        // An infinite element lowers validity and is reported as invalid numeric.
        assert!(report.metrics.validity < 1.0);

        let invalid = report.get_issues_by_type(QualityIssueType::InvalidNumeric);
        assert!(!invalid.is_empty());
    }

    #[test]
    fn test_statistical_summary() {
        let report = report_for(vec![1.0, 2.0, 3.0, 4.0, 5.0]);

        assert!(report.metrics.statistical_summary.is_some());
        let summary = report
            .metrics
            .statistical_summary
            .expect("Operation failed");
        assert_eq!(summary.count, 5);
        assert!((summary.mean - 3.0).abs() < 1e-10);
        assert_eq!(summary.min, 1.0);
        assert_eq!(summary.max, 5.0);
    }

    #[test]
    fn test_formatted_report() {
        let text = report_for(vec![1.0, 2.0, 3.0]).formatted_report();

        // Every fixed section heading must be present.
        assert!(text.contains("Data Quality Report"));
        assert!(text.contains("Overall Quality Score"));
        assert!(text.contains("Metrics:"));
        assert!(text.contains("Statistical Summary:"));
    }

    #[test]
    fn test_quality_acceptance() {
        let report = report_for(vec![1.0, 2.0, 3.0, 4.0, 5.0]);

        // Clean data passes both the 80% and 90% thresholds with no critical issues.
        assert!(report.is_acceptable(0.8));
        assert!(report.is_acceptable(0.9));
        assert!(report.get_critical_issues().is_empty());
    }
}
682}