1use std::fmt;
7
8use ::ndarray::{ArrayBase, Data, Dimension, ScalarOperand};
10use num_traits::{Float, FromPrimitive, ToPrimitive};
11
12use super::config::{ErrorSeverity, QualityIssueType};
13use crate::error::CoreError;
14
15use serde::{Deserialize, Serialize};
16
17#[derive(Debug, Clone, Serialize, Deserialize)]
19pub struct DataQualityReport {
20 pub quality_score: f64,
22 pub metrics: QualityMetrics,
24 pub issues: Vec<QualityIssue>,
26 pub recommendations: Vec<String>,
28}
29
30#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct QualityMetrics {
33 pub completeness: f64,
35 pub consistency: f64,
37 pub accuracy: f64,
39 pub validity: f64,
41 pub statistical_summary: Option<StatisticalSummary>,
43}
44
45#[derive(Debug, Clone, Serialize, Deserialize)]
47pub struct StatisticalSummary {
48 pub count: usize,
50 pub mean: f64,
52 pub std_dev: f64,
54 pub min: f64,
56 pub max: f64,
58 pub outliers: usize,
60 pub distribution: Option<String>,
62}
63
64#[derive(Debug, Clone, Serialize, Deserialize)]
66pub struct QualityIssue {
67 pub issue_type: QualityIssueType,
69 pub location: String,
71 pub description: String,
73 pub severity: ErrorSeverity,
75 pub suggestion: Option<String>,
77}
78
/// Stateless analyzer that produces data quality reports for numeric arrays.
///
/// The type carries no data, so cloning it is free; `Debug` is derived so
/// that it can be embedded in other `Debug` types and printed in diagnostics.
#[derive(Debug, Clone)]
pub struct QualityAnalyzer;
81
82impl QualityAnalyzer {
83 pub fn new() -> Self {
85 Self
86 }
87
88 pub fn generate_quality_report<S, D>(
90 &self,
91 array: &ArrayBase<S, D>,
92 fieldname: &str,
93 ) -> Result<DataQualityReport, CoreError>
94 where
95 S: Data,
96 D: Dimension,
97 S::Elem: Float + fmt::Debug + ScalarOperand + Send + Sync + FromPrimitive,
98 {
99 let mut issues = Vec::new();
100 let mut recommendations = Vec::new();
101
102 let total_elements = array.len();
104 let nan_count = array.iter().filter(|&&x| x.is_nan()).count();
105 let completeness = if total_elements > 0 {
106 (total_elements - nan_count) as f64 / total_elements as f64
107 } else {
108 1.0
109 };
110
111 if completeness < 0.95 {
112 issues.push(QualityIssue {
113 issue_type: QualityIssueType::MissingData,
114 location: fieldname.to_string(),
115 description: format!("Low data completeness: {:.1}%", completeness * 100.0),
116 severity: if completeness < 0.8 {
117 ErrorSeverity::Error
118 } else {
119 ErrorSeverity::Warning
120 },
121 suggestion: Some(
122 "Consider data imputation or removal of incomplete records".to_string(),
123 ),
124 });
125
126 if completeness < 0.8 {
127 recommendations.push("Critical: Data completeness is below 80%. Consider data quality improvement before analysis.".to_string());
128 }
129 }
130
131 let inf_count = array.iter().filter(|&&x| x.is_infinite()).count();
133 let validity = if total_elements > 0 {
134 (total_elements - nan_count - inf_count) as f64 / total_elements as f64
135 } else {
136 1.0
137 };
138
139 if validity < 1.0 {
140 issues.push(QualityIssue {
141 issue_type: QualityIssueType::InvalidNumeric,
142 location: fieldname.to_string(),
143 description: format!(
144 "Invalid numeric values detected: {:.1}% valid",
145 validity * 100.0
146 ),
147 severity: ErrorSeverity::Warning,
148 suggestion: Some("Remove or replace NaN and infinite values".to_string()),
149 });
150 }
151
152 let statistical_summary = if total_elements > 0 && nan_count < total_elements {
154 let finite_values: Vec<_> = array.iter().filter(|&&x| x.is_finite()).cloned().collect();
155 if !finite_values.is_empty() {
156 self.calculate_statistical_summary(&finite_values)?
157 } else {
158 None
159 }
160 } else {
161 None
162 };
163
164 if let Some(ref stats) = statistical_summary {
166 let outlier_issues = self.detect_outliers(array, stats, fieldname)?;
167 issues.extend(outlier_issues);
168 }
169
170 let consistency = self.calculate_consistency(array)?;
172 let accuracy = if issues
173 .iter()
174 .any(|i| matches!(i.issue_type, QualityIssueType::ConstraintViolation))
175 {
176 0.8
177 } else {
178 1.0
179 };
180
181 let quality_score = (completeness + validity + consistency + accuracy) / 4.0;
182
183 if total_elements > 1_000_000 {
185 recommendations.push(
186 "Large dataset detected. Consider parallel processing for better performance."
187 .to_string(),
188 );
189 }
190
191 if quality_score < 0.8 {
192 recommendations.push(
193 "Overall data quality is low. Review data collection and preprocessing procedures."
194 .to_string(),
195 );
196 }
197
198 self.add_specific_recommendations(&issues, &mut recommendations);
200
201 Ok(DataQualityReport {
202 quality_score,
203 metrics: QualityMetrics {
204 completeness,
205 consistency,
206 accuracy,
207 validity,
208 statistical_summary,
209 },
210 issues,
211 recommendations,
212 })
213 }
214
215 fn calculate_statistical_summary<T>(
217 &self,
218 finite_values: &[T],
219 ) -> Result<Option<StatisticalSummary>, CoreError>
220 where
221 T: Float + Copy + FromPrimitive,
222 {
223 if finite_values.is_empty() {
224 return Ok(None);
225 }
226
227 let mean = finite_values.iter().fold(T::zero(), |acc, &x| acc + x)
228 / num_traits::cast(finite_values.len()).unwrap_or(T::one());
229
230 let variance = finite_values
231 .iter()
232 .map(|&x| {
233 let diff = x - mean;
234 diff * diff
235 })
236 .fold(T::zero(), |acc, x| acc + x)
237 / num_traits::cast(finite_values.len()).unwrap_or(T::one());
238
239 let std_dev = variance.sqrt();
240 let min_val = finite_values
241 .iter()
242 .fold(finite_values[0], |acc, &x| if x < acc { x } else { acc });
243 let max_val = finite_values
244 .iter()
245 .fold(finite_values[0], |acc, &x| if x > acc { x } else { acc });
246
247 let mut sortedvalues = finite_values.to_vec();
249 sortedvalues.sort_by(|a, b| a.partial_cmp(b).expect("Operation failed"));
250 let outliers = self.count_outliers_iqr(&sortedvalues);
251
252 let distribution = self.detect_distribution(&sortedvalues);
254
255 Ok(Some(StatisticalSummary {
256 count: finite_values.len(),
257 mean: num_traits::cast(mean).unwrap_or(0.0),
258 std_dev: num_traits::cast(std_dev).unwrap_or(0.0),
259 min: num_traits::cast(min_val).unwrap_or(0.0),
260 max: num_traits::cast(max_val).unwrap_or(0.0),
261 outliers,
262 distribution,
263 }))
264 }
265
266 fn count_outliers_iqr<T>(&self, sortedvalues: &[T]) -> usize
268 where
269 T: Float + Copy,
270 {
271 if sortedvalues.len() < 4 {
272 return 0;
273 }
274
275 let q1_index = sortedvalues.len() / 4;
276 let q3_index = 3 * sortedvalues.len() / 4;
277 let q1 = sortedvalues[q1_index];
278 let q3 = sortedvalues[q3_index];
279 let iqr = q3 - q1;
280 let lower_bound = q1 - iqr * num_traits::cast(1.5).unwrap_or(T::one());
281 let upper_bound = q3 + iqr * num_traits::cast(1.5).unwrap_or(T::one());
282
283 sortedvalues
284 .iter()
285 .filter(|&&x| x < lower_bound || x > upper_bound)
286 .count()
287 }
288
289 fn detect_distribution<T>(&self, sortedvalues: &[T]) -> Option<String>
291 where
292 T: Float + Copy + FromPrimitive,
293 {
294 if sortedvalues.len() < 10 {
295 return None;
296 }
297
298 let mean = sortedvalues.iter().fold(T::zero(), |acc, &x| acc + x)
300 / num_traits::cast(sortedvalues.len()).unwrap_or(T::one());
301
302 let variance = sortedvalues
303 .iter()
304 .map(|&x| {
305 let diff = x - mean;
306 diff * diff
307 })
308 .fold(T::zero(), |acc, x| acc + x)
309 / num_traits::cast(sortedvalues.len()).unwrap_or(T::one());
310
311 let std_dev = variance.sqrt();
312
313 if std_dev > T::zero() {
314 let skewness = sortedvalues
315 .iter()
316 .map(|&x| {
317 let diff = (x - mean) / std_dev;
318 diff * diff * diff
319 })
320 .fold(T::zero(), |acc, x| acc + x)
321 / num_traits::cast(sortedvalues.len()).unwrap_or(T::one());
322
323 let skewness_f64: f64 = num_traits::cast(skewness).unwrap_or(0.0);
324
325 if skewness_f64.abs() < 0.5 {
326 Some("approximately_normal".to_string())
327 } else if skewness_f64 > 0.5 {
328 Some("right_skewed".to_string())
329 } else {
330 Some("left_skewed".to_string())
331 }
332 } else {
333 Some("constant".to_string())
334 }
335 }
336
337 fn detect_outliers<S, D>(
339 &self,
340 array: &ArrayBase<S, D>,
341 stats: &StatisticalSummary,
342 fieldname: &str,
343 ) -> Result<Vec<QualityIssue>, CoreError>
344 where
345 S: Data,
346 D: Dimension,
347 S::Elem: Float + fmt::Debug,
348 {
349 let mut issues = Vec::new();
350
351 if stats.outliers > 0 {
352 let outlier_percentage = (stats.outliers as f64 / stats.count as f64) * 100.0;
353
354 if outlier_percentage > 5.0 {
355 issues.push(QualityIssue {
356 issue_type: QualityIssueType::Outlier,
357 location: fieldname.to_string(),
358 description: format!(
359 "High number of outliers detected: {} ({:.1}%)",
360 stats.outliers, outlier_percentage
361 ),
362 severity: if outlier_percentage > 15.0 {
363 ErrorSeverity::Error
364 } else {
365 ErrorSeverity::Warning
366 },
367 suggestion: Some(
368 "Review outliers for data quality issues or consider outlier treatment"
369 .to_string(),
370 ),
371 });
372 }
373 }
374
375 Ok(issues)
376 }
377
378 fn calculate_consistency<S, D>(&self, array: &ArrayBase<S, D>) -> Result<f64, CoreError>
380 where
381 S: Data,
382 D: Dimension,
383 S::Elem: Float,
384 {
385 let array_size = array.len();
387
388 if array_size < 3 {
389 return Ok(1.0);
391 }
392
393 let values: Vec<f64> = array.iter().filter_map(|&x| x.to_f64()).collect();
394
395 if values.len() < 3 {
396 return Ok(1.0);
398 }
399
400 let mut diff_scores = Vec::new();
402 for i in 1..values.len() {
403 diff_scores.push(values[i] - values[i.saturating_sub(1)]);
404 }
405
406 let mean_diff = diff_scores.iter().sum::<f64>() / diff_scores.len() as f64;
408 let variance = diff_scores
409 .iter()
410 .map(|&d| (d - mean_diff).powi(2))
411 .sum::<f64>()
412 / diff_scores.len() as f64;
413
414 let mut period_score = 1.0;
416 for period in 2..((values.len() / 2).min(10)) {
417 let mut matches = 0;
418 let mut comparisons = 0;
419
420 for i in period..values.len() {
421 if (values[i] - values[i - period]).abs() < 1e-10 {
422 matches += 1;
423 }
424 comparisons += 1;
425 }
426
427 if comparisons > 0 {
428 let current_score = matches as f64 / comparisons as f64;
429 period_score = period_score.max(current_score);
430 }
431 }
432
433 let diff_consistency = if variance > 0.0 {
436 (-variance.ln()).exp().clamp(0.0, 1.0)
437 } else {
438 1.0 };
440
441 let consistency_score = 0.7 * diff_consistency + 0.3 * period_score;
443
444 Ok(consistency_score.clamp(0.0, 1.0))
445 }
446
447 fn add_specific_recommendations(
449 &self,
450 issues: &[QualityIssue],
451 recommendations: &mut Vec<String>,
452 ) {
453 let has_missing_data = issues
454 .iter()
455 .any(|i| matches!(i.issue_type, QualityIssueType::MissingData));
456 let has_invalid_numeric = issues
457 .iter()
458 .any(|i| matches!(i.issue_type, QualityIssueType::InvalidNumeric));
459 let has_outliers = issues
460 .iter()
461 .any(|i| matches!(i.issue_type, QualityIssueType::Outlier));
462
463 if has_missing_data {
464 recommendations.push("Consider using imputation techniques (mean, median, mode, or forward-fill) for missing values.".to_string());
465 }
466
467 if has_invalid_numeric {
468 recommendations
469 .push("Remove or replace NaN and infinite values before analysis.".to_string());
470 }
471
472 if has_outliers {
473 recommendations.push(
474 "Investigate outliers - they may indicate data errors or interesting edge cases."
475 .to_string(),
476 );
477 }
478
479 if has_missing_data && has_invalid_numeric {
480 recommendations.push("Consider a comprehensive data cleaning pipeline to address multiple quality issues.".to_string());
481 }
482 }
483}
484
485impl Default for QualityAnalyzer {
486 fn default() -> Self {
487 Self::new()
488 }
489}
490
491impl DataQualityReport {
492 pub fn formatted_report(&self) -> String {
494 let mut report = "Data Quality Report\n".to_string();
495 report.push_str("==================\n\n");
496 report.push_str(&format!(
497 "Overall Quality Score: {:.2}\n\n",
498 self.quality_score
499 ));
500
501 report.push_str("Metrics:\n");
502 report.push_str(&format!(
503 " Completeness: {:.1}%\n",
504 self.metrics.completeness * 100.0
505 ));
506 report.push_str(&format!(
507 " Validity: {:.1}%\n",
508 self.metrics.validity * 100.0
509 ));
510 report.push_str(&format!(
511 " Consistency: {:.1}%\n",
512 self.metrics.consistency * 100.0
513 ));
514 report.push_str(&format!(
515 " Accuracy: {:.1}%\n\n",
516 self.metrics.accuracy * 100.0
517 ));
518
519 if let Some(ref stats) = self.metrics.statistical_summary {
520 report.push_str("Statistical Summary:\n");
521 report.push_str(&format!(" Count: {}\n", stats.count));
522 report.push_str(&format!(" Mean: {:.6}\n", stats.mean));
523 report.push_str(&format!(" Std Dev: {:.6}\n", stats.std_dev));
524 report.push_str(&format!(" Min: {:.6}\n", stats.min));
525 report.push_str(&format!(" Max: {:.6}\n", stats.max));
526 report.push_str(&format!(" Outliers: {}\n", stats.outliers));
527 if let Some(ref dist) = stats.distribution {
528 report.push_str(&format!(" Distribution: {}\n", dist));
529 }
530 report.push('\n');
531 }
532
533 if !self.issues.is_empty() {
534 report.push_str("Issues Found:\n");
535 for (i, issue) in self.issues.iter().enumerate() {
536 report.push_str(&format!(
537 " {}. [{:?}] {}: {}\n",
538 i + 1,
539 issue.severity,
540 issue.location,
541 issue.description
542 ));
543 if let Some(ref suggestion) = issue.suggestion {
544 report.push_str(&format!(" Suggestion: {}\n", suggestion));
545 }
546 }
547 report.push('\n');
548 }
549
550 if !self.recommendations.is_empty() {
551 report.push_str("Recommendations:\n");
552 for (i, rec) in self.recommendations.iter().enumerate() {
553 report.push_str(&format!(" {}. {}\n", i + 1, rec));
554 }
555 }
556
557 report
558 }
559
560 pub fn is_acceptable(&self, threshold: f64) -> bool {
562 self.quality_score >= threshold
563 }
564
565 pub fn get_critical_issues(&self) -> Vec<&QualityIssue> {
567 self.issues
568 .iter()
569 .filter(|issue| issue.severity == ErrorSeverity::Critical)
570 .collect()
571 }
572
573 pub fn get_issues_by_type(&self, issuetype: QualityIssueType) -> Vec<&QualityIssue> {
575 self.issues
576 .iter()
577 .filter(|issue| issue.issue_type == issuetype)
578 .collect()
579 }
580}
581
#[cfg(test)]
mod tests {
    use super::*;
    use ::ndarray::Array1;

    /// Analyzes `values` as a one-dimensional array under the field name
    /// shared by all tests.
    fn report_for(values: Vec<f64>) -> DataQualityReport {
        let data = Array1::from_vec(values);
        QualityAnalyzer::new()
            .generate_quality_report(&data, "test_field")
            .expect("Operation failed")
    }

    #[test]
    fn test_quality_analyzer() {
        let report = report_for(vec![1.0, 2.0, 3.0, 4.0, 5.0]);

        // Clean data: high score, nothing missing or invalid, no issues.
        assert!(report.quality_score > 0.9);
        assert_eq!(report.metrics.completeness, 1.0);
        assert_eq!(report.metrics.validity, 1.0);
        assert!(report.issues.is_empty());
    }

    #[test]
    fn test_quality_with_missing_data() {
        let report = report_for(vec![1.0, f64::NAN, 3.0, 4.0, 5.0]);

        // A NaN lowers completeness and raises a missing-data issue.
        assert!(report.metrics.completeness < 1.0);
        assert!(!report.issues.is_empty());

        let missing_issues = report.get_issues_by_type(QualityIssueType::MissingData);
        assert!(!missing_issues.is_empty());
    }

    #[test]
    fn test_quality_with_infinite_values() {
        let report = report_for(vec![1.0, 2.0, f64::INFINITY, 4.0, 5.0]);

        // An infinite value lowers validity and raises an invalid-numeric issue.
        assert!(report.metrics.validity < 1.0);

        let invalid_issues = report.get_issues_by_type(QualityIssueType::InvalidNumeric);
        assert!(!invalid_issues.is_empty());
    }

    #[test]
    fn test_statistical_summary() {
        let report = report_for(vec![1.0, 2.0, 3.0, 4.0, 5.0]);

        assert!(report.metrics.statistical_summary.is_some());
        let stats = report
            .metrics
            .statistical_summary
            .expect("Operation failed");

        assert_eq!(stats.count, 5);
        assert!((stats.mean - 3.0).abs() < 1e-10);
        assert_eq!(stats.min, 1.0);
        assert_eq!(stats.max, 5.0);
    }

    #[test]
    fn test_formatted_report() {
        let formatted = report_for(vec![1.0, 2.0, 3.0]).formatted_report();

        for expected in [
            "Data Quality Report",
            "Overall Quality Score",
            "Metrics:",
            "Statistical Summary:",
        ] {
            assert!(formatted.contains(expected));
        }
    }

    #[test]
    fn test_quality_acceptance() {
        let report = report_for(vec![1.0, 2.0, 3.0, 4.0, 5.0]);

        assert!(report.is_acceptable(0.8));
        assert!(report.is_acceptable(0.9));
        assert!(report.get_critical_issues().is_empty());
    }
}