alimentar/
quality.rs

1//! Data quality assessment for ML pipelines
2//!
3//! Detects data quality issues including missing values, outliers,
4//! duplicates, and schema problems.
5//!
6//! # 100-Point Quality Scoring System (GH-6)
7//!
8//! Based on the Toyota Way principles of Jidoka (built-in quality) and
9//! the Doctest Corpus QA Checklist for Publication.
10//!
11//! ## Severity Weights
12//! - **Critical (2.0x)**: Blocks publication - data integrity failures
13//! - **High (1.5x)**: Major issues requiring immediate attention
14//! - **Medium (1.0x)**: Standard issues to address before publication
15//! - **Low (0.5x)**: Minor issues, informational
16//!
17//! ## Letter Grades
18//! - **A (95-100)**: Publish immediately
19//! - **B (85-94)**: Publish with documented caveats
20//! - **C (70-84)**: Remediation required before publication
21//! - **D (50-69)**: Major rework needed
22//! - **F (<50)**: Do not publish
23//!
24//! # Example
25//!
26//! ```ignore
27//! use alimentar::quality::{QualityChecker, QualityScore};
28//!
29//! let checker = QualityChecker::new()
30//!     .max_null_ratio(0.1)
31//!     .max_duplicate_ratio(0.05);
32//!
33//! let report = checker.check(&dataset)?;
34//! let score = QualityScore::from_report(&report);
35//! println!("Grade: {} ({})", score.grade, score.score);
36//! ```
37//!
38//! # References
39//! - [1] Batini & Scannapieco (2016). Data and Information Quality.
40//! - [6] Hynes et al. (2017). The Data Linter. NIPS Workshop on ML Systems.
41
42// Statistical computation and internal methods
43#![allow(clippy::cast_precision_loss)]
44#![allow(clippy::suboptimal_flops)]
45#![allow(clippy::unused_self)]
46#![allow(clippy::if_not_else)]
47
48use std::{
49    collections::{HashMap, HashSet},
50    fmt,
51};
52
53use crate::{
54    dataset::{ArrowDataset, Dataset},
55    error::Result,
56};
57
58// ═══════════════════════════════════════════════════════════════════════════════
59// 100-Point Quality Scoring System (GH-6)
60// ═══════════════════════════════════════════════════════════════════════════════
61
/// Severity levels for quality issues per QA checklist
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Severity {
    /// Critical issues block publication (2.0x weight)
    Critical,
    /// High priority issues (1.5x weight)
    High,
    /// Medium priority issues (1.0x weight)
    Medium,
    /// Low priority issues (0.5x weight)
    Low,
}

impl Severity {
    /// Get the weight multiplier for this severity
    #[must_use]
    pub fn weight(&self) -> f64 {
        match self {
            Self::Critical => 2.0,
            Self::High => 1.5,
            Self::Medium => 1.0,
            Self::Low => 0.5,
        }
    }

    /// Get the base point value for this severity
    ///
    /// Uses the same numeric scale as [`Severity::weight`].
    #[must_use]
    pub fn base_points(&self) -> f64 {
        self.weight()
    }
}

impl fmt::Display for Severity {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            Self::Critical => "Critical",
            Self::High => "High",
            Self::Medium => "Medium",
            Self::Low => "Low",
        };
        f.write_str(label)
    }
}
109
/// Letter grades for dataset quality
///
/// Variants are declared best-to-worst, so the derived ordering yields
/// `A < B < C < D < F`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum LetterGrade {
    /// A (95-100): Publish immediately
    A,
    /// B (85-94): Publish with documented caveats
    B,
    /// C (70-84): Remediation required before publication
    C,
    /// D (50-69): Major rework needed
    D,
    /// F (<50): Do not publish
    F,
}

impl LetterGrade {
    /// Create a letter grade from a numeric score (0-100)
    #[must_use]
    pub fn from_score(score: f64) -> Self {
        // NaN fails every comparison below and therefore maps to F.
        if score >= 95.0 {
            Self::A
        } else if score >= 85.0 {
            Self::B
        } else if score >= 70.0 {
            Self::C
        } else if score >= 50.0 {
            Self::D
        } else {
            Self::F
        }
    }

    /// Get the publication decision for this grade
    #[must_use]
    pub fn publication_decision(&self) -> &'static str {
        match self {
            Self::A => "Publish immediately",
            Self::B => "Publish with documented caveats",
            Self::C => "Remediation required before publication",
            Self::D => "Major rework needed",
            Self::F => "Do not publish",
        }
    }

    /// Check if this grade allows publication
    #[must_use]
    pub fn is_publishable(&self) -> bool {
        // Only A and B may publish; derived ordering is best-first.
        *self <= Self::B
    }
}

impl fmt::Display for LetterGrade {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(match self {
            Self::A => "A",
            Self::B => "B",
            Self::C => "C",
            Self::D => "D",
            Self::F => "F",
        })
    }
}
168
/// A scored quality check item from the 100-point checklist
///
/// Point values are derived from `severity` (see `points_earned` /
/// `max_points`); a failed item earns zero points.
#[derive(Debug, Clone)]
pub struct ChecklistItem {
    /// Unique identifier (e.g., "1", "25", "53")
    pub id: u8,
    /// Check description
    pub description: String,
    /// Pass/fail status
    pub passed: bool,
    /// Severity level
    pub severity: Severity,
    /// Suggestion for improvement if failed
    pub suggestion: Option<String>,
}
183
184impl ChecklistItem {
185    /// Create a new checklist item
186    #[must_use]
187    pub fn new(id: u8, description: impl Into<String>, severity: Severity, passed: bool) -> Self {
188        Self {
189            id,
190            description: description.into(),
191            passed,
192            severity,
193            suggestion: None,
194        }
195    }
196
197    /// Add a suggestion for improvement
198    #[must_use]
199    pub fn with_suggestion(mut self, suggestion: impl Into<String>) -> Self {
200        self.suggestion = Some(suggestion.into());
201        self
202    }
203
204    /// Get the points earned (0 if failed, severity points if passed)
205    #[must_use]
206    pub fn points_earned(&self) -> f64 {
207        if self.passed {
208            self.severity.base_points()
209        } else {
210            0.0
211        }
212    }
213
214    /// Get the maximum possible points for this item
215    #[must_use]
216    pub fn max_points(&self) -> f64 {
217        self.severity.base_points()
218    }
219}
220
/// Complete quality score with breakdown
///
/// Built by `QualityScore::from_checklist`; `score` is the percentage of
/// severity-weighted points earned and `grade` is derived from that score.
#[derive(Debug, Clone)]
pub struct QualityScore {
    /// Numeric score (0-100)
    pub score: f64,
    /// Letter grade
    pub grade: LetterGrade,
    /// Total points earned
    pub points_earned: f64,
    /// Maximum possible points
    pub max_points: f64,
    /// Individual checklist items
    pub checklist: Vec<ChecklistItem>,
    /// Summary statistics by severity
    pub severity_breakdown: HashMap<Severity, SeverityStats>,
}
237
/// Statistics for a severity level
///
/// Aggregated per severity while building a `QualityScore`; each item
/// increments `total` and exactly one of `passed`/`failed`, so
/// `total == passed + failed`.
#[derive(Debug, Clone, Default)]
pub struct SeverityStats {
    /// Number of checks at this severity
    pub total: usize,
    /// Number of passed checks
    pub passed: usize,
    /// Number of failed checks
    pub failed: usize,
    /// Points earned at this severity
    pub points_earned: f64,
    /// Maximum possible points at this severity
    pub max_points: f64,
}
252
253impl QualityScore {
254    /// Create a quality score from checklist items
255    #[must_use]
256    pub fn from_checklist(checklist: Vec<ChecklistItem>) -> Self {
257        let mut severity_breakdown: HashMap<Severity, SeverityStats> = HashMap::new();
258
259        let mut points_earned = 0.0;
260        let mut max_points = 0.0;
261
262        for item in &checklist {
263            let stats = severity_breakdown.entry(item.severity).or_default();
264
265            stats.total += 1;
266            stats.max_points += item.max_points();
267
268            if item.passed {
269                stats.passed += 1;
270                stats.points_earned += item.points_earned();
271                points_earned += item.points_earned();
272            } else {
273                stats.failed += 1;
274            }
275
276            max_points += item.max_points();
277        }
278
279        let score = if max_points > 0.0 {
280            (points_earned / max_points * 100.0).clamp(0.0, 100.0)
281        } else {
282            100.0
283        };
284
285        let grade = LetterGrade::from_score(score);
286
287        Self {
288            score,
289            grade,
290            points_earned,
291            max_points,
292            checklist,
293            severity_breakdown,
294        }
295    }
296
297    /// Get failed items for actionable suggestions
298    #[must_use]
299    pub fn failed_items(&self) -> Vec<&ChecklistItem> {
300        self.checklist.iter().filter(|item| !item.passed).collect()
301    }
302
303    /// Get critical failures (blocks publication)
304    #[must_use]
305    pub fn critical_failures(&self) -> Vec<&ChecklistItem> {
306        self.checklist
307            .iter()
308            .filter(|item| !item.passed && item.severity == Severity::Critical)
309            .collect()
310    }
311
312    /// Check if there are any critical failures
313    #[must_use]
314    pub fn has_critical_failures(&self) -> bool {
315        self.checklist
316            .iter()
317            .any(|item| !item.passed && item.severity == Severity::Critical)
318    }
319
320    /// Generate a badge URL for shields.io
321    #[must_use]
322    pub fn badge_url(&self) -> String {
323        let color = match self.grade {
324            LetterGrade::A => "brightgreen",
325            LetterGrade::B => "green",
326            LetterGrade::C => "yellow",
327            LetterGrade::D => "orange",
328            LetterGrade::F => "red",
329        };
330        format!(
331            "https://img.shields.io/badge/data_quality-{}_({:.0}%25)-{}",
332            self.grade, self.score, color
333        )
334    }
335
336    /// Generate JSON output for CI/CD integration
337    #[must_use]
338    pub fn to_json(&self) -> String {
339        let failed_items: Vec<_> = self
340            .failed_items()
341            .iter()
342            .map(|item| {
343                format!(
344                    r#"    {{"id": {}, "description": "{}", "severity": "{}", "suggestion": {}}}"#,
345                    item.id,
346                    item.description.replace('"', "\\\""),
347                    item.severity,
348                    item.suggestion
349                        .as_ref()
350                        .map(|s| format!("\"{}\"", s.replace('"', "\\\"")))
351                        .unwrap_or_else(|| "null".to_string())
352                )
353            })
354            .collect();
355
356        format!(
357            r#"{{
358  "score": {:.2},
359  "grade": "{}",
360  "is_publishable": {},
361  "decision": "{}",
362  "points_earned": {:.2},
363  "max_points": {:.2},
364  "critical_failures": {},
365  "failed_items": [
366{}
367  ],
368  "badge_url": "{}"
369}}"#,
370            self.score,
371            self.grade,
372            self.grade.is_publishable(),
373            self.grade.publication_decision(),
374            self.points_earned,
375            self.max_points,
376            self.has_critical_failures(),
377            failed_items.join(",\n"),
378            self.badge_url()
379        )
380    }
381}
382
383// ═══════════════════════════════════════════════════════════════════════════════
384// Quality Profiles (GH-10)
385// ═══════════════════════════════════════════════════════════════════════════════
386
/// Quality profile for customizing scoring rules per data type.
///
/// Different kinds of datasets carry different expectations:
/// - Doctest corpus: `source` and `version` columns are expected to be constant
/// - ML training: features should have high variance, labels can be categorical
/// - Time series: timestamps should be unique and sequential
///
/// # Example
///
/// ```ignore
/// let profile = QualityProfile::doctest_corpus();
/// let score = profile.score_report(&report);
/// ```
#[derive(Debug, Clone)]
pub struct QualityProfile {
    /// Profile name for display
    pub name: String,
    /// Description of what this profile is for
    pub description: String,
    /// Columns that are expected to be constant (not penalized)
    pub expected_constant_columns: HashSet<String>,
    /// Columns where high null ratio is acceptable
    pub nullable_columns: HashSet<String>,
    /// Maximum acceptable null ratio (default: 0.1)
    pub max_null_ratio: f64,
    /// Maximum acceptable duplicate ratio (default: 0.5)
    pub max_duplicate_ratio: f64,
    /// Minimum cardinality before flagging as low (default: 2)
    pub min_cardinality: usize,
    /// Maximum outlier ratio to report (default: 0.05)
    pub max_outlier_ratio: f64,
    /// Maximum duplicate row ratio (default: 0.01)
    pub max_duplicate_row_ratio: f64,
    /// Whether to penalize constant columns not in expected list
    pub penalize_unexpected_constants: bool,
    /// Whether this profile requires a signature column (for doctest)
    pub require_signature: bool,
}

impl Default for QualityProfile {
    fn default() -> Self {
        QualityProfile {
            name: String::from("default"),
            description: String::from("General-purpose quality profile"),
            expected_constant_columns: HashSet::default(),
            nullable_columns: HashSet::default(),
            max_null_ratio: 0.1,
            max_duplicate_ratio: 0.5,
            min_cardinality: 2,
            max_outlier_ratio: 0.05,
            max_duplicate_row_ratio: 0.01,
            penalize_unexpected_constants: true,
            require_signature: false,
        }
    }
}

impl QualityProfile {
    /// Create a new profile with custom name
    #[must_use]
    pub fn new(name: impl Into<String>) -> Self {
        Self {
            name: name.into(),
            ..Self::default()
        }
    }

    /// Get profile by name
    #[must_use]
    pub fn by_name(name: &str) -> Option<Self> {
        let profile = match name {
            "default" => Self::default(),
            "doctest-corpus" | "doctest" => Self::doctest_corpus(),
            "ml-training" | "ml" => Self::ml_training(),
            "time-series" | "timeseries" => Self::time_series(),
            _ => return None,
        };
        Some(profile)
    }

    /// List available profile names
    #[must_use]
    pub fn available_profiles() -> Vec<&'static str> {
        Vec::from(["default", "doctest-corpus", "ml-training", "time-series"])
    }

    /// Doctest corpus profile - for Python doctest extraction datasets.
    ///
    /// Expects:
    /// - `source` and `version` columns to be constant (single crate/version)
    /// - `signature` column may have nulls (module-level doctests)
    /// - `input`, `expected`, `function` should be non-null
    #[must_use]
    pub fn doctest_corpus() -> Self {
        let expected_constant_columns: HashSet<String> = ["source", "version"]
            .iter()
            .map(|s| (*s).to_string())
            .collect();
        // Module-level doctests have no signature, so nulls there are fine.
        let nullable_columns: HashSet<String> =
            std::iter::once("signature".to_string()).collect();

        Self {
            name: "doctest-corpus".to_string(),
            description: "Profile for Python doctest extraction datasets".to_string(),
            expected_constant_columns,
            nullable_columns,
            max_null_ratio: 0.05,         // Stricter for doctest data
            max_duplicate_ratio: 0.3,     // Some duplicate inputs are normal
            max_duplicate_row_ratio: 0.0, // No exact duplicate rows allowed
            ..Self::default()
        }
    }

    /// ML training profile - for machine learning datasets.
    ///
    /// Expects:
    /// - Features to have reasonable variance
    /// - Labels can be categorical (low cardinality OK)
    /// - No null values in features or labels
    #[must_use]
    pub fn ml_training() -> Self {
        Self {
            name: "ml-training".to_string(),
            description: "Profile for machine learning training datasets".to_string(),
            max_null_ratio: 0.0,      // No nulls allowed in training data
            max_duplicate_ratio: 0.8, // Higher tolerance for categorical features
            max_outlier_ratio: 0.1,   // More tolerant of outliers
            ..Self::default()
        }
    }

    /// Time series profile - for temporal data.
    ///
    /// Expects:
    /// - Timestamp column should be unique
    /// - Data should have temporal patterns
    #[must_use]
    pub fn time_series() -> Self {
        Self {
            name: "time-series".to_string(),
            description: "Profile for time series datasets".to_string(),
            max_null_ratio: 0.05,
            max_outlier_ratio: 0.1,       // Time series often have outliers
            max_duplicate_row_ratio: 0.0, // No duplicate rows (each timestamp unique)
            ..Self::default()
        }
    }

    /// Set description
    #[must_use]
    pub fn with_description(self, desc: impl Into<String>) -> Self {
        Self {
            description: desc.into(),
            ..self
        }
    }

    /// Add an expected constant column
    #[must_use]
    pub fn with_expected_constant(self, column: impl Into<String>) -> Self {
        let mut profile = self;
        profile.expected_constant_columns.insert(column.into());
        profile
    }

    /// Add a nullable column
    #[must_use]
    pub fn with_nullable(self, column: impl Into<String>) -> Self {
        let mut profile = self;
        profile.nullable_columns.insert(column.into());
        profile
    }

    /// Set max null ratio
    #[must_use]
    pub fn with_max_null_ratio(self, ratio: f64) -> Self {
        Self {
            max_null_ratio: ratio,
            ..self
        }
    }

    /// Set max duplicate ratio
    #[must_use]
    pub fn with_max_duplicate_ratio(self, ratio: f64) -> Self {
        Self {
            max_duplicate_ratio: ratio,
            ..self
        }
    }

    /// Check if a column is expected to be constant
    #[must_use]
    pub fn is_expected_constant(&self, column: &str) -> bool {
        self.expected_constant_columns.contains(column)
    }

    /// Check if a column is allowed to have nulls
    #[must_use]
    pub fn is_nullable(&self, column: &str) -> bool {
        self.nullable_columns.contains(column)
    }

    /// Get effective null threshold for a column
    #[must_use]
    pub fn null_threshold_for(&self, column: &str) -> f64 {
        if self.is_nullable(column) {
            // Nullable columns may be entirely null.
            1.0
        } else {
            self.max_null_ratio
        }
    }
}
605
606// ═══════════════════════════════════════════════════════════════════════════════
607// Quality Issues
608// ═══════════════════════════════════════════════════════════════════════════════
609
/// Types of data quality issues
///
/// Column-scoped variants carry the offending column name (see
/// [`QualityIssue::column`]); `DuplicateRows`, `EmptySchema`, and
/// `EmptyDataset` describe the dataset as a whole.
#[derive(Debug, Clone, PartialEq)]
pub enum QualityIssue {
    /// Column has high percentage of null/missing values
    HighNullRatio {
        /// Column name
        column: String,
        /// Actual null ratio
        null_ratio: f64,
        /// Configured threshold
        threshold: f64,
    },
    /// Column has high percentage of duplicate values
    HighDuplicateRatio {
        /// Column name
        column: String,
        /// Actual duplicate ratio
        duplicate_ratio: f64,
        /// Configured threshold
        threshold: f64,
    },
    /// Column has very low cardinality (potential constant)
    LowCardinality {
        /// Column name
        column: String,
        /// Number of unique values
        unique_count: usize,
        /// Total row count
        total_count: usize,
    },
    /// Column has potential outliers (IQR method)
    OutliersDetected {
        /// Column name
        column: String,
        /// Number of outliers
        outlier_count: usize,
        /// Ratio of outliers
        outlier_ratio: f64,
    },
    /// Dataset has duplicate rows
    DuplicateRows {
        /// Number of duplicate rows
        duplicate_count: usize,
        /// Ratio of duplicate rows
        duplicate_ratio: f64,
    },
    /// Column has constant value (zero variance)
    ConstantColumn {
        /// Column name
        column: String,
        /// The constant value
        value: String,
    },
    /// Schema has no columns
    EmptySchema,
    /// Dataset is empty
    EmptyDataset,
}

impl QualityIssue {
    /// Get severity level (1-5, higher is worse)
    #[must_use]
    pub fn severity(&self) -> u8 {
        match self {
            Self::EmptySchema | Self::EmptyDataset => 5,
            Self::ConstantColumn { .. } => 4,
            // Mostly-null columns are escalated above the baseline.
            Self::HighNullRatio { null_ratio, .. } if *null_ratio > 0.5 => 4,
            Self::HighNullRatio { .. } => 3,
            Self::OutliersDetected { outlier_ratio, .. } if *outlier_ratio > 0.1 => 3,
            Self::OutliersDetected { .. }
            | Self::HighDuplicateRatio { .. }
            | Self::DuplicateRows { .. } => 2,
            Self::LowCardinality { .. } => 1,
        }
    }

    /// Get column name if applicable (`None` for dataset-level issues)
    #[must_use]
    pub fn column(&self) -> Option<&str> {
        match self {
            Self::HighNullRatio { column, .. }
            | Self::HighDuplicateRatio { column, .. }
            | Self::LowCardinality { column, .. }
            | Self::OutliersDetected { column, .. }
            | Self::ConstantColumn { column, .. } => Some(column),
            // Spelled out (no `_` catch-all) so adding a new variant forces a
            // conscious decision here instead of silently returning None.
            Self::DuplicateRows { .. } | Self::EmptySchema | Self::EmptyDataset => None,
        }
    }
}
697
/// Quality statistics for a single column
#[derive(Debug, Clone)]
pub struct ColumnQuality {
    /// Column name
    pub name: String,
    /// Total row count
    pub total_count: usize,
    /// Null/missing count
    pub null_count: usize,
    /// Null ratio (0-1)
    pub null_ratio: f64,
    /// Number of unique values
    pub unique_count: usize,
    /// Unique ratio (unique/total)
    pub unique_ratio: f64,
    /// Number of duplicate values (non-unique occurrences)
    pub duplicate_count: usize,
    /// Duplicate ratio
    pub duplicate_ratio: f64,
    /// Number of outliers (for numeric columns)
    pub outlier_count: Option<usize>,
    /// Basic stats for numeric columns
    pub numeric_stats: Option<NumericStats>,
}

impl ColumnQuality {
    /// Check if column is constant (single unique value)
    pub fn is_constant(&self) -> bool {
        // Empty columns are not considered constant.
        self.total_count > 0 && self.unique_count <= 1
    }

    /// Check if column is mostly null
    pub fn is_mostly_null(&self, threshold: f64) -> bool {
        threshold <= self.null_ratio
    }
}

/// Basic statistics for numeric columns
#[derive(Debug, Clone)]
pub struct NumericStats {
    /// Minimum value
    pub min: f64,
    /// Maximum value
    pub max: f64,
    /// Mean value
    pub mean: f64,
    /// Standard deviation
    pub std_dev: f64,
    /// 25th percentile (Q1)
    pub q1: f64,
    /// 50th percentile (median)
    pub median: f64,
    /// 75th percentile (Q3)
    pub q3: f64,
}

impl NumericStats {
    /// Calculate IQR (Interquartile Range)
    pub fn iqr(&self) -> f64 {
        self.q3 - self.q1
    }

    /// Get lower bound for outliers (Q1 - 1.5*IQR)
    pub fn outlier_lower_bound(&self) -> f64 {
        let spread = 1.5 * self.iqr();
        self.q1 - spread
    }

    /// Get upper bound for outliers (Q3 + 1.5*IQR)
    pub fn outlier_upper_bound(&self) -> f64 {
        let spread = 1.5 * self.iqr();
        self.q3 + spread
    }
}
770
/// Overall data quality report
///
/// Produced by `QualityChecker::check`; `columns` is keyed by column name.
#[derive(Debug, Clone)]
pub struct QualityReport {
    /// Total row count
    pub row_count: usize,
    /// Total column count
    pub column_count: usize,
    /// Per-column quality statistics
    pub columns: HashMap<String, ColumnQuality>,
    /// Detected issues
    pub issues: Vec<QualityIssue>,
    /// Overall quality score (0-100)
    pub score: f64,
    /// Number of duplicate rows
    pub duplicate_row_count: usize,
}
787
788impl QualityReport {
789    /// Check if any issues were found
790    pub fn has_issues(&self) -> bool {
791        !self.issues.is_empty()
792    }
793
794    /// Get issues for a specific column
795    pub fn column_issues(&self, column: &str) -> Vec<&QualityIssue> {
796        self.issues
797            .iter()
798            .filter(|i| i.column() == Some(column))
799            .collect()
800    }
801
802    /// Get maximum severity among all issues
803    pub fn max_severity(&self) -> u8 {
804        self.issues.iter().map(|i| i.severity()).max().unwrap_or(0)
805    }
806
807    /// Get columns with issues
808    pub fn problematic_columns(&self) -> Vec<&str> {
809        self.issues
810            .iter()
811            .filter_map(|i| i.column())
812            .collect::<HashSet<_>>()
813            .into_iter()
814            .collect()
815    }
816}
817
/// Configuration thresholds for quality checking
#[derive(Debug, Clone)]
pub struct QualityThresholds {
    /// Maximum acceptable null ratio (default: 0.1)
    pub max_null_ratio: f64,
    /// Maximum acceptable duplicate ratio (default: 0.5)
    pub max_duplicate_ratio: f64,
    /// Minimum cardinality to not flag as low (default: 2)
    pub min_cardinality: usize,
    /// Maximum outlier ratio to report (default: 0.05)
    pub max_outlier_ratio: f64,
    /// Maximum duplicate row ratio (default: 0.01)
    pub max_duplicate_row_ratio: f64,
}

impl Default for QualityThresholds {
    /// Defaults mirror the per-field values documented on the struct.
    fn default() -> Self {
        QualityThresholds {
            max_null_ratio: 0.1,
            max_duplicate_ratio: 0.5,
            min_cardinality: 2,
            max_outlier_ratio: 0.05,
            max_duplicate_row_ratio: 0.01,
        }
    }
}
844
/// Data quality checker
///
/// Configure via the builder-style `max_*`/`with_*` methods, then run
/// `check` against a dataset.
pub struct QualityChecker {
    // Thresholds that decide when a measurement becomes a QualityIssue.
    thresholds: QualityThresholds,
    // Enables outlier checking (see `with_outlier_check`).
    check_outliers: bool,
    // Enables duplicate-row counting (see `with_duplicate_check`).
    check_duplicates: bool,
}
851
852impl Default for QualityChecker {
853    fn default() -> Self {
854        Self::new()
855    }
856}
857
858impl QualityChecker {
859    /// Create a new quality checker with default thresholds
860    pub fn new() -> Self {
861        Self {
862            thresholds: QualityThresholds::default(),
863            check_outliers: true,
864            check_duplicates: true,
865        }
866    }
867
868    /// Set maximum null ratio threshold
869    #[must_use]
870    pub fn max_null_ratio(mut self, ratio: f64) -> Self {
871        self.thresholds.max_null_ratio = ratio;
872        self
873    }
874
875    /// Set maximum duplicate ratio threshold
876    #[must_use]
877    pub fn max_duplicate_ratio(mut self, ratio: f64) -> Self {
878        self.thresholds.max_duplicate_ratio = ratio;
879        self
880    }
881
882    /// Set minimum cardinality threshold
883    #[must_use]
884    pub fn min_cardinality(mut self, min: usize) -> Self {
885        self.thresholds.min_cardinality = min;
886        self
887    }
888
889    /// Set maximum outlier ratio threshold
890    #[must_use]
891    pub fn max_outlier_ratio(mut self, ratio: f64) -> Self {
892        self.thresholds.max_outlier_ratio = ratio;
893        self
894    }
895
896    /// Enable/disable outlier checking
897    #[must_use]
898    pub fn with_outlier_check(mut self, enabled: bool) -> Self {
899        self.check_outliers = enabled;
900        self
901    }
902
903    /// Enable/disable duplicate row checking
904    #[must_use]
905    pub fn with_duplicate_check(mut self, enabled: bool) -> Self {
906        self.check_duplicates = enabled;
907        self
908    }
909
910    /// Check dataset quality
911    pub fn check(&self, dataset: &ArrowDataset) -> Result<QualityReport> {
912        let schema = dataset.schema();
913        let mut issues = Vec::new();
914
915        // Check for empty schema
916        if schema.fields().is_empty() {
917            issues.push(QualityIssue::EmptySchema);
918            return Ok(QualityReport {
919                row_count: 0,
920                column_count: 0,
921                columns: HashMap::new(),
922                issues,
923                score: 0.0,
924                duplicate_row_count: 0,
925            });
926        }
927
928        // Collect all data
929        let (column_data, row_count) = self.collect_data(dataset);
930
931        // Check for empty dataset
932        if row_count == 0 {
933            issues.push(QualityIssue::EmptyDataset);
934            return Ok(QualityReport {
935                row_count: 0,
936                column_count: schema.fields().len(),
937                columns: HashMap::new(),
938                issues,
939                score: 0.0,
940                duplicate_row_count: 0,
941            });
942        }
943
944        // Analyze each column
945        let mut columns = HashMap::new();
946        for (col_name, values) in &column_data {
947            let quality = self.analyze_column(col_name, values, row_count);
948
949            // Check for issues
950            if quality.null_ratio > self.thresholds.max_null_ratio {
951                issues.push(QualityIssue::HighNullRatio {
952                    column: col_name.clone(),
953                    null_ratio: quality.null_ratio,
954                    threshold: self.thresholds.max_null_ratio,
955                });
956            }
957
958            if quality.duplicate_ratio > self.thresholds.max_duplicate_ratio {
959                issues.push(QualityIssue::HighDuplicateRatio {
960                    column: col_name.clone(),
961                    duplicate_ratio: quality.duplicate_ratio,
962                    threshold: self.thresholds.max_duplicate_ratio,
963                });
964            }
965
966            if quality.unique_count < self.thresholds.min_cardinality && row_count > 1 {
967                issues.push(QualityIssue::LowCardinality {
968                    column: col_name.clone(),
969                    unique_count: quality.unique_count,
970                    total_count: row_count,
971                });
972            }
973
974            if quality.is_constant() {
975                let value = values
976                    .iter()
977                    .find(|v| v.is_some())
978                    .map(|v| v.clone().unwrap_or_default())
979                    .unwrap_or_default();
980                issues.push(QualityIssue::ConstantColumn {
981                    column: col_name.clone(),
982                    value,
983                });
984            }
985
986            if let Some(outlier_count) = quality.outlier_count {
987                let outlier_ratio = outlier_count as f64 / row_count as f64;
988                if outlier_ratio > self.thresholds.max_outlier_ratio {
989                    issues.push(QualityIssue::OutliersDetected {
990                        column: col_name.clone(),
991                        outlier_count,
992                        outlier_ratio,
993                    });
994                }
995            }
996
997            columns.insert(col_name.clone(), quality);
998        }
999
1000        // Check for duplicate rows
1001        let duplicate_row_count = if self.check_duplicates {
1002            self.count_duplicate_rows(&column_data, row_count)
1003        } else {
1004            0
1005        };
1006
1007        let duplicate_row_ratio = duplicate_row_count as f64 / row_count as f64;
1008        if duplicate_row_ratio > self.thresholds.max_duplicate_row_ratio {
1009            issues.push(QualityIssue::DuplicateRows {
1010                duplicate_count: duplicate_row_count,
1011                duplicate_ratio: duplicate_row_ratio,
1012            });
1013        }
1014
1015        // Calculate quality score
1016        let score = self.calculate_score(&columns, &issues, row_count);
1017
1018        Ok(QualityReport {
1019            row_count,
1020            column_count: schema.fields().len(),
1021            columns,
1022            issues,
1023            score,
1024            duplicate_row_count,
1025        })
1026    }
1027
1028    /// Collect data from dataset as strings for analysis
1029    fn collect_data(
1030        &self,
1031        dataset: &ArrowDataset,
1032    ) -> (HashMap<String, Vec<Option<String>>>, usize) {
1033        use arrow::array::{Array, Float64Array, Int32Array, Int64Array, StringArray};
1034
1035        let schema = dataset.schema();
1036        let mut data: HashMap<String, Vec<Option<String>>> = HashMap::new();
1037        let mut row_count = 0;
1038
1039        for field in schema.fields() {
1040            data.insert(field.name().clone(), Vec::new());
1041        }
1042
1043        for batch in dataset.iter() {
1044            row_count += batch.num_rows();
1045
1046            for (col_idx, field) in schema.fields().iter().enumerate() {
1047                if let Some(col_data) = data.get_mut(field.name()) {
1048                    let array = batch.column(col_idx);
1049
1050                    for i in 0..array.len() {
1051                        if array.is_null(i) {
1052                            col_data.push(None);
1053                        } else if let Some(arr) = array.as_any().downcast_ref::<StringArray>() {
1054                            col_data.push(Some(arr.value(i).to_string()));
1055                        } else if let Some(arr) = array.as_any().downcast_ref::<Int32Array>() {
1056                            col_data.push(Some(arr.value(i).to_string()));
1057                        } else if let Some(arr) = array.as_any().downcast_ref::<Int64Array>() {
1058                            col_data.push(Some(arr.value(i).to_string()));
1059                        } else if let Some(arr) = array.as_any().downcast_ref::<Float64Array>() {
1060                            col_data.push(Some(arr.value(i).to_string()));
1061                        } else {
1062                            col_data.push(Some("?".to_string()));
1063                        }
1064                    }
1065                }
1066            }
1067        }
1068
1069        (data, row_count)
1070    }
1071
1072    /// Analyze a single column
1073    fn analyze_column(
1074        &self,
1075        name: &str,
1076        values: &[Option<String>],
1077        total_count: usize,
1078    ) -> ColumnQuality {
1079        let null_count = values.iter().filter(|v| v.is_none()).count();
1080        let null_ratio = if total_count > 0 {
1081            null_count as f64 / total_count as f64
1082        } else {
1083            0.0
1084        };
1085
1086        // Count unique values
1087        let non_null_values: Vec<&str> = values.iter().filter_map(|v| v.as_deref()).collect();
1088        let unique_set: HashSet<&str> = non_null_values.iter().copied().collect();
1089        let unique_count = unique_set.len();
1090        let unique_ratio = if !non_null_values.is_empty() {
1091            unique_count as f64 / non_null_values.len() as f64
1092        } else {
1093            0.0
1094        };
1095
1096        // Calculate duplicates
1097        let duplicate_count = non_null_values.len().saturating_sub(unique_count);
1098        let duplicate_ratio = if !non_null_values.is_empty() {
1099            duplicate_count as f64 / non_null_values.len() as f64
1100        } else {
1101            0.0
1102        };
1103
1104        // Try to parse as numeric for outlier detection
1105        let (outlier_count, numeric_stats) = if self.check_outliers {
1106            self.analyze_numeric(&non_null_values)
1107        } else {
1108            (None, None)
1109        };
1110
1111        ColumnQuality {
1112            name: name.to_string(),
1113            total_count,
1114            null_count,
1115            null_ratio,
1116            unique_count,
1117            unique_ratio,
1118            duplicate_count,
1119            duplicate_ratio,
1120            outlier_count,
1121            numeric_stats,
1122        }
1123    }
1124
1125    /// Analyze numeric column for outliers and stats
1126    fn analyze_numeric(&self, values: &[&str]) -> (Option<usize>, Option<NumericStats>) {
1127        let numeric_values: Vec<f64> = values
1128            .iter()
1129            .filter_map(|v| v.parse::<f64>().ok())
1130            .filter(|v| v.is_finite())
1131            .collect();
1132
1133        if numeric_values.len() < 4 {
1134            return (None, None);
1135        }
1136
1137        let mut sorted = numeric_values.clone();
1138        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
1139
1140        let n = sorted.len();
1141        let min = sorted[0];
1142        let max = sorted[n - 1];
1143        let mean = numeric_values.iter().sum::<f64>() / n as f64;
1144
1145        let variance = numeric_values
1146            .iter()
1147            .map(|v| (v - mean).powi(2))
1148            .sum::<f64>()
1149            / n as f64;
1150        let std_dev = variance.sqrt();
1151
1152        let q1 = sorted[n / 4];
1153        let median = sorted[n / 2];
1154        let q3 = sorted[3 * n / 4];
1155
1156        let stats = NumericStats {
1157            min,
1158            max,
1159            mean,
1160            std_dev,
1161            q1,
1162            median,
1163            q3,
1164        };
1165
1166        // Count outliers using IQR method
1167        let lower = stats.outlier_lower_bound();
1168        let upper = stats.outlier_upper_bound();
1169        let outlier_count = numeric_values
1170            .iter()
1171            .filter(|&&v| v < lower || v > upper)
1172            .count();
1173
1174        (Some(outlier_count), Some(stats))
1175    }
1176
1177    /// Count duplicate rows
1178    fn count_duplicate_rows(
1179        &self,
1180        data: &HashMap<String, Vec<Option<String>>>,
1181        row_count: usize,
1182    ) -> usize {
1183        if data.is_empty() || row_count == 0 {
1184            return 0;
1185        }
1186
1187        // Build row hashes
1188        let mut row_set: HashSet<String> = HashSet::new();
1189        let mut duplicates = 0;
1190
1191        let columns: Vec<&String> = data.keys().collect();
1192
1193        for i in 0..row_count {
1194            let row_key: String = columns
1195                .iter()
1196                .map(|col| {
1197                    data.get(*col)
1198                        .and_then(|v| v.get(i))
1199                        .map(|v| v.clone().unwrap_or_else(|| "NULL".to_string()))
1200                        .unwrap_or_else(|| "NULL".to_string())
1201                })
1202                .collect::<Vec<_>>()
1203                .join("|");
1204
1205            if !row_set.insert(row_key) {
1206                duplicates += 1;
1207            }
1208        }
1209
1210        duplicates
1211    }
1212
1213    /// Calculate quality score (0-100)
1214    fn calculate_score(
1215        &self,
1216        columns: &HashMap<String, ColumnQuality>,
1217        issues: &[QualityIssue],
1218        row_count: usize,
1219    ) -> f64 {
1220        if row_count == 0 || columns.is_empty() {
1221            return 0.0;
1222        }
1223
1224        let mut score = 100.0;
1225
1226        // Deduct for null ratios
1227        let avg_null_ratio: f64 =
1228            columns.values().map(|c| c.null_ratio).sum::<f64>() / columns.len() as f64;
1229        score -= avg_null_ratio * 30.0;
1230
1231        // Deduct for issues
1232        for issue in issues {
1233            score -= match issue.severity() {
1234                5 => 25.0,
1235                4 => 15.0,
1236                3 => 10.0,
1237                2 => 5.0,
1238                1 => 2.0,
1239                _ => 0.0,
1240            };
1241        }
1242
1243        score.clamp(0.0, 100.0)
1244    }
1245}
1246
1247#[cfg(test)]
1248mod tests {
1249    use std::sync::Arc;
1250
1251    use arrow::{
1252        array::{Float64Array, Int32Array, StringArray},
1253        datatypes::{DataType, Field, Schema},
1254        record_batch::RecordBatch,
1255    };
1256
1257    use super::*;
1258
1259    // ========== QualityIssue tests ==========
1260
    #[test]
    fn test_issue_severity() {
        // Empty schema/dataset are the most severe class of issue.
        assert_eq!(QualityIssue::EmptySchema.severity(), 5);
        assert_eq!(QualityIssue::EmptyDataset.severity(), 5);

        let constant = QualityIssue::ConstantColumn {
            column: "x".to_string(),
            value: "1".to_string(),
        };
        assert_eq!(constant.severity(), 4);

        // A higher null ratio (0.6 vs 0.3) escalates severity from 3 to 4;
        // the exact cutoff lives in QualityIssue::severity().
        let high_null = QualityIssue::HighNullRatio {
            column: "x".to_string(),
            null_ratio: 0.6,
            threshold: 0.1,
        };
        assert_eq!(high_null.severity(), 4);

        let low_null = QualityIssue::HighNullRatio {
            column: "x".to_string(),
            null_ratio: 0.3,
            threshold: 0.1,
        };
        assert_eq!(low_null.severity(), 3);
    }

    // column() exposes the affected column name for per-column issues and
    // None for dataset-wide issues.
    #[test]
    fn test_issue_column() {
        let issue = QualityIssue::HighNullRatio {
            column: "test".to_string(),
            null_ratio: 0.5,
            threshold: 0.1,
        };
        assert_eq!(issue.column(), Some("test"));

        assert_eq!(QualityIssue::EmptySchema.column(), None);
    }
1298
1299    // ========== ColumnQuality tests ==========
1300
    // A column is "constant" when it holds exactly one distinct value.
    #[test]
    fn test_column_quality_is_constant() {
        let mut quality = ColumnQuality {
            name: "test".to_string(),
            total_count: 100,
            null_count: 0,
            null_ratio: 0.0,
            unique_count: 1,
            unique_ratio: 0.01,
            duplicate_count: 99,
            duplicate_ratio: 0.99,
            outlier_count: None,
            numeric_stats: None,
        };

        assert!(quality.is_constant());

        quality.unique_count = 5;
        assert!(!quality.is_constant());
    }

    // is_mostly_null compares null_ratio against the caller's threshold:
    // 0.8 exceeds 0.5 but not 0.9.
    #[test]
    fn test_column_quality_mostly_null() {
        let quality = ColumnQuality {
            name: "test".to_string(),
            total_count: 100,
            null_count: 80,
            null_ratio: 0.8,
            unique_count: 5,
            unique_ratio: 0.25,
            duplicate_count: 15,
            duplicate_ratio: 0.75,
            outlier_count: None,
            numeric_stats: None,
        };

        assert!(quality.is_mostly_null(0.5));
        assert!(!quality.is_mostly_null(0.9));
    }
1340
1341    // ========== NumericStats tests ==========
1342
    // IQR = q3 - q1 = 50; Tukey fences at q1 - 1.5*IQR = -50 and
    // q3 + 1.5*IQR = 150.
    #[test]
    fn test_numeric_stats_iqr() {
        let stats = NumericStats {
            min: 0.0,
            max: 100.0,
            mean: 50.0,
            std_dev: 25.0,
            q1: 25.0,
            median: 50.0,
            q3: 75.0,
        };

        assert!((stats.iqr() - 50.0).abs() < 0.01);
        assert!((stats.outlier_lower_bound() - (-50.0)).abs() < 0.01);
        assert!((stats.outlier_upper_bound() - 150.0).abs() < 0.01);
    }
1359
1360    // ========== QualityReport tests ==========
1361
    // has_issues() is simply "is the issues list non-empty".
    #[test]
    fn test_report_has_issues() {
        let report = QualityReport {
            row_count: 100,
            column_count: 2,
            columns: HashMap::new(),
            issues: vec![],
            score: 100.0,
            duplicate_row_count: 0,
        };
        assert!(!report.has_issues());

        let report_with_issues = QualityReport {
            row_count: 100,
            column_count: 2,
            columns: HashMap::new(),
            issues: vec![QualityIssue::EmptySchema],
            score: 50.0,
            duplicate_row_count: 0,
        };
        assert!(report_with_issues.has_issues());
    }

    // max_severity() reports the worst issue present: ConstantColumn (4)
    // outranks LowCardinality.
    #[test]
    fn test_report_max_severity() {
        let report = QualityReport {
            row_count: 100,
            column_count: 2,
            columns: HashMap::new(),
            issues: vec![
                QualityIssue::LowCardinality {
                    column: "x".to_string(),
                    unique_count: 1,
                    total_count: 100,
                },
                QualityIssue::ConstantColumn {
                    column: "y".to_string(),
                    value: "1".to_string(),
                },
            ],
            score: 80.0,
            duplicate_row_count: 0,
        };

        assert_eq!(report.max_severity(), 4);
    }
1408
1409    // ========== QualityChecker tests ==========
1410
1411    fn make_dataset(col1: Vec<Option<&str>>, col2: Vec<Option<i32>>) -> ArrowDataset {
1412        let schema = Arc::new(Schema::new(vec![
1413            Field::new("name", DataType::Utf8, true),
1414            Field::new("value", DataType::Int32, true),
1415        ]));
1416
1417        let names: Vec<Option<&str>> = col1;
1418        let values: Vec<Option<i32>> = col2;
1419
1420        let batch = RecordBatch::try_new(
1421            Arc::clone(&schema),
1422            vec![
1423                Arc::new(StringArray::from(names)),
1424                Arc::new(Int32Array::from(values)),
1425            ],
1426        )
1427        .expect("batch");
1428
1429        ArrowDataset::from_batch(batch).expect("dataset")
1430    }
1431
    /// Build a single-batch test dataset with one nullable Float64 "value"
    /// column.
    fn make_float_dataset(values: Vec<Option<f64>>) -> ArrowDataset {
        let schema = Arc::new(Schema::new(vec![Field::new(
            "value",
            DataType::Float64,
            true,
        )]));

        let batch = RecordBatch::try_new(
            Arc::clone(&schema),
            vec![Arc::new(Float64Array::from(values))],
        )
        .expect("batch");

        ArrowDataset::from_batch(batch).expect("dataset")
    }
1447
    // Default threshold for max_null_ratio is 0.1.
    #[test]
    fn test_checker_new() {
        let checker = QualityChecker::new();
        assert!((checker.thresholds.max_null_ratio - 0.1).abs() < 0.01);
    }

    // Builder methods override the corresponding thresholds.
    #[test]
    fn test_checker_builder() {
        let checker = QualityChecker::new()
            .max_null_ratio(0.2)
            .max_duplicate_ratio(0.3)
            .min_cardinality(5);

        assert!((checker.thresholds.max_null_ratio - 0.2).abs() < 0.01);
        assert!((checker.thresholds.max_duplicate_ratio - 0.3).abs() < 0.01);
        assert_eq!(checker.thresholds.min_cardinality, 5);
    }

    // Clean data (no nulls, no duplicates, no constants) should score high.
    #[test]
    fn test_checker_clean_data() {
        let dataset = make_dataset(
            vec![Some("a"), Some("b"), Some("c"), Some("d")],
            vec![Some(1), Some(2), Some(3), Some(4)],
        );

        let checker = QualityChecker::new();
        let report = checker.check(&dataset).expect("check");

        assert_eq!(report.row_count, 4);
        assert_eq!(report.column_count, 2);
        assert!(report.score > 80.0);
    }

    // 4/5 nulls in "name" (ratio 0.8) exceeds the 0.5 threshold; only that
    // one column should be flagged.
    #[test]
    fn test_checker_detects_nulls() {
        let dataset = make_dataset(
            vec![Some("a"), None, None, None, None],
            vec![Some(1), Some(2), Some(3), Some(4), Some(5)],
        );

        let checker = QualityChecker::new().max_null_ratio(0.5);
        let report = checker.check(&dataset).expect("check");

        let null_issues: Vec<_> = report
            .issues
            .iter()
            .filter(|i| matches!(i, QualityIssue::HighNullRatio { .. }))
            .collect();

        assert_eq!(null_issues.len(), 1);
    }

    // "name" holds a single repeated value; exactly one constant-column
    // issue is expected.
    #[test]
    fn test_checker_detects_constant() {
        let dataset = make_dataset(
            vec![Some("same"), Some("same"), Some("same"), Some("same")],
            vec![Some(1), Some(2), Some(3), Some(4)],
        );

        let checker = QualityChecker::new();
        let report = checker.check(&dataset).expect("check");

        let constant_issues: Vec<_> = report
            .issues
            .iter()
            .filter(|i| matches!(i, QualityIssue::ConstantColumn { .. }))
            .collect();

        assert_eq!(constant_issues.len(), 1);
    }

    // Rows 0-2 are identical across both columns, so duplicate rows must
    // be detected.
    #[test]
    fn test_checker_detects_duplicates() {
        let dataset = make_dataset(
            vec![Some("a"), Some("a"), Some("a"), Some("b")],
            vec![Some(1), Some(1), Some(1), Some(2)],
        );

        let checker = QualityChecker::new().max_duplicate_ratio(0.01);
        let report = checker.check(&dataset).expect("check");

        // Should detect duplicate rows
        assert!(report.duplicate_row_count > 0);
    }
1532
    // +/-10000 against a 0..100 range lies far outside the IQR fences, so
    // an outlier issue must be raised.
    #[test]
    fn test_checker_detects_outliers() {
        // Create dataset with clear outliers
        let mut values: Vec<Option<f64>> = (0..100).map(|i| Some(i as f64)).collect();
        values.push(Some(10000.0)); // outlier
        values.push(Some(-10000.0)); // outlier

        let dataset = make_float_dataset(values);

        let checker = QualityChecker::new().max_outlier_ratio(0.01);
        let report = checker.check(&dataset).expect("check");

        let outlier_issues: Vec<_> = report
            .issues
            .iter()
            .filter(|i| matches!(i, QualityIssue::OutliersDetected { .. }))
            .collect();

        assert!(!outlier_issues.is_empty());
    }

    // Zero rows yields the EmptyDataset issue and a hard score of 0.
    #[test]
    fn test_checker_empty_dataset() {
        let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, true)]));
        let batch = RecordBatch::try_new(
            Arc::clone(&schema),
            vec![Arc::new(Int32Array::from(Vec::<i32>::new()))],
        )
        .expect("batch");
        let dataset = ArrowDataset::from_batch(batch).expect("dataset");

        let checker = QualityChecker::new();
        let report = checker.check(&dataset).expect("check");

        assert!(report.issues.contains(&QualityIssue::EmptyDataset));
        assert_eq!(report.score, 0.0);
    }
1570
    // Nulls and constant values in the dirty dataset must drag its score
    // below the clean dataset's.
    #[test]
    fn test_checker_score_decreases_with_issues() {
        let clean_dataset = make_dataset(
            vec![Some("a"), Some("b"), Some("c"), Some("d")],
            vec![Some(1), Some(2), Some(3), Some(4)],
        );

        let dirty_dataset = make_dataset(
            vec![Some("same"), Some("same"), None, None],
            vec![Some(1), Some(1), None, None],
        );

        let checker = QualityChecker::new();
        let clean_report = checker.check(&clean_dataset).expect("check");
        let dirty_report = checker.check(&dirty_dataset).expect("check");

        assert!(clean_report.score > dirty_report.score);
    }

    // column_issues() filters the issue list by column name; the all-null
    // "name" column must report more issues than the clean "value" column.
    #[test]
    fn test_checker_column_issues() {
        let dataset = make_dataset(
            vec![None, None, None, None],
            vec![Some(1), Some(2), Some(3), Some(4)],
        );

        let checker = QualityChecker::new();
        let report = checker.check(&dataset).expect("check");

        let name_issues = report.column_issues("name");
        assert!(!name_issues.is_empty());

        let value_issues = report.column_issues("value");
        // value column should have fewer issues
        assert!(value_issues.len() < name_issues.len());
    }

    // Both columns are defective here (all-null vs constant), so both must
    // appear in problematic_columns().
    #[test]
    fn test_checker_problematic_columns() {
        let dataset = make_dataset(
            vec![None, None, None, None],
            vec![Some(1), Some(1), Some(1), Some(1)],
        );

        let checker = QualityChecker::new();
        let report = checker.check(&dataset).expect("check");

        let problematic = report.problematic_columns();
        assert!(problematic.contains(&"name"));
        assert!(problematic.contains(&"value"));
    }

    // with_outlier_check(false) must suppress outlier issues even when the
    // data contains an obvious outlier and the ratio threshold is tiny.
    #[test]
    fn test_checker_disable_outliers() {
        let mut values: Vec<Option<f64>> = (0..100).map(|i| Some(i as f64)).collect();
        values.push(Some(10000.0));

        let dataset = make_float_dataset(values);

        let checker = QualityChecker::new()
            .with_outlier_check(false)
            .max_outlier_ratio(0.001);
        let report = checker.check(&dataset).expect("check");

        let outlier_issues: Vec<_> = report
            .issues
            .iter()
            .filter(|i| matches!(i, QualityIssue::OutliersDetected { .. }))
            .collect();

        assert!(outlier_issues.is_empty());
    }
1643
1644    // ========== 100-Point Quality Scoring System Tests (GH-6) ==========
1645
    // Weights match the documented scheme: Critical 2.0x, High 1.5x,
    // Medium 1.0x, Low 0.5x.
    #[test]
    fn test_severity_weights() {
        assert!((Severity::Critical.weight() - 2.0).abs() < 0.01);
        assert!((Severity::High.weight() - 1.5).abs() < 0.01);
        assert!((Severity::Medium.weight() - 1.0).abs() < 0.01);
        assert!((Severity::Low.weight() - 0.5).abs() < 0.01);
    }

    // base_points mirrors weight for each severity level.
    #[test]
    fn test_severity_base_points() {
        assert!((Severity::Critical.base_points() - 2.0).abs() < 0.01);
        assert!((Severity::High.base_points() - 1.5).abs() < 0.01);
        assert!((Severity::Medium.base_points() - 1.0).abs() < 0.01);
        assert!((Severity::Low.base_points() - 0.5).abs() < 0.01);
    }

    #[test]
    fn test_severity_display() {
        assert_eq!(format!("{}", Severity::Critical), "Critical");
        assert_eq!(format!("{}", Severity::High), "High");
        assert_eq!(format!("{}", Severity::Medium), "Medium");
        assert_eq!(format!("{}", Severity::Low), "Low");
    }

    // Grade boundaries are inclusive at the bottom of each band:
    // A >= 95, B >= 85, C >= 70, D >= 50, F below.
    #[test]
    fn test_letter_grade_from_score() {
        assert_eq!(LetterGrade::from_score(100.0), LetterGrade::A);
        assert_eq!(LetterGrade::from_score(95.0), LetterGrade::A);
        assert_eq!(LetterGrade::from_score(94.9), LetterGrade::B);
        assert_eq!(LetterGrade::from_score(85.0), LetterGrade::B);
        assert_eq!(LetterGrade::from_score(84.9), LetterGrade::C);
        assert_eq!(LetterGrade::from_score(70.0), LetterGrade::C);
        assert_eq!(LetterGrade::from_score(69.9), LetterGrade::D);
        assert_eq!(LetterGrade::from_score(50.0), LetterGrade::D);
        assert_eq!(LetterGrade::from_score(49.9), LetterGrade::F);
        assert_eq!(LetterGrade::from_score(0.0), LetterGrade::F);
    }

    #[test]
    fn test_letter_grade_publication_decision() {
        assert_eq!(LetterGrade::A.publication_decision(), "Publish immediately");
        assert_eq!(
            LetterGrade::B.publication_decision(),
            "Publish with documented caveats"
        );
        assert_eq!(
            LetterGrade::C.publication_decision(),
            "Remediation required before publication"
        );
        assert_eq!(LetterGrade::D.publication_decision(), "Major rework needed");
        assert_eq!(LetterGrade::F.publication_decision(), "Do not publish");
    }

    // Only grades A and B are publishable.
    #[test]
    fn test_letter_grade_is_publishable() {
        assert!(LetterGrade::A.is_publishable());
        assert!(LetterGrade::B.is_publishable());
        assert!(!LetterGrade::C.is_publishable());
        assert!(!LetterGrade::D.is_publishable());
        assert!(!LetterGrade::F.is_publishable());
    }

    #[test]
    fn test_letter_grade_display() {
        assert_eq!(format!("{}", LetterGrade::A), "A");
        assert_eq!(format!("{}", LetterGrade::B), "B");
        assert_eq!(format!("{}", LetterGrade::C), "C");
        assert_eq!(format!("{}", LetterGrade::D), "D");
        assert_eq!(format!("{}", LetterGrade::F), "F");
    }
1716
    // new() stores id/description/severity/passed and leaves suggestion
    // unset.
    #[test]
    fn test_checklist_item_new() {
        let item = ChecklistItem::new(1, "Schema version documented", Severity::Critical, true);
        assert_eq!(item.id, 1);
        assert_eq!(item.description, "Schema version documented");
        assert!(item.passed);
        assert_eq!(item.severity, Severity::Critical);
        assert!(item.suggestion.is_none());
    }

    #[test]
    fn test_checklist_item_with_suggestion() {
        let item = ChecklistItem::new(1, "Schema version documented", Severity::Critical, false)
            .with_suggestion("Add schema_version field to metadata");
        assert!(item.suggestion.is_some());
        assert_eq!(
            item.suggestion.unwrap(),
            "Add schema_version field to metadata"
        );
    }

    // A passed item earns its severity's full points; a failed item earns
    // zero while max_points stays the same.
    #[test]
    fn test_checklist_item_points() {
        let passed_critical = ChecklistItem::new(1, "Test", Severity::Critical, true);
        assert!((passed_critical.points_earned() - 2.0).abs() < 0.01);
        assert!((passed_critical.max_points() - 2.0).abs() < 0.01);

        let failed_critical = ChecklistItem::new(2, "Test", Severity::Critical, false);
        assert!((failed_critical.points_earned() - 0.0).abs() < 0.01);
        assert!((failed_critical.max_points() - 2.0).abs() < 0.01);

        let passed_low = ChecklistItem::new(3, "Test", Severity::Low, true);
        assert!((passed_low.points_earned() - 0.5).abs() < 0.01);
    }
1751
    // All items passing yields 100% and grade A.
    #[test]
    fn test_quality_score_perfect() {
        let checklist = vec![
            ChecklistItem::new(1, "Critical check", Severity::Critical, true),
            ChecklistItem::new(2, "High check", Severity::High, true),
            ChecklistItem::new(3, "Medium check", Severity::Medium, true),
            ChecklistItem::new(4, "Low check", Severity::Low, true),
        ];
        let score = QualityScore::from_checklist(checklist);

        // Total max points: 2.0 + 1.5 + 1.0 + 0.5 = 5.0
        // Total earned: 5.0
        // Score: 100%
        assert!((score.score - 100.0).abs() < 0.01);
        assert_eq!(score.grade, LetterGrade::A);
        assert!(score.grade.is_publishable());
        assert!(!score.has_critical_failures());
    }

    // A single failed critical item drops the score to 3.0/5.0 = 60% (D)
    // and flags a critical failure.
    #[test]
    fn test_quality_score_with_critical_failure() {
        let checklist = vec![
            ChecklistItem::new(1, "Critical check", Severity::Critical, false),
            ChecklistItem::new(2, "High check", Severity::High, true),
            ChecklistItem::new(3, "Medium check", Severity::Medium, true),
            ChecklistItem::new(4, "Low check", Severity::Low, true),
        ];
        let score = QualityScore::from_checklist(checklist);

        // Total max: 5.0, Earned: 3.0, Score: 60%
        assert!((score.score - 60.0).abs() < 0.01);
        assert_eq!(score.grade, LetterGrade::D);
        assert!(score.has_critical_failures());
        assert!(!score.grade.is_publishable());
    }

    // failed_items() keeps checklist order; critical_failures() is the
    // critical-severity subset of the failures.
    #[test]
    fn test_quality_score_failed_items() {
        let checklist = vec![
            ChecklistItem::new(1, "Critical check", Severity::Critical, false),
            ChecklistItem::new(2, "High check", Severity::High, true),
            ChecklistItem::new(3, "Medium check", Severity::Medium, false),
        ];
        let score = QualityScore::from_checklist(checklist);

        let failed = score.failed_items();
        assert_eq!(failed.len(), 2);
        assert_eq!(failed[0].id, 1);
        assert_eq!(failed[1].id, 3);

        let critical = score.critical_failures();
        assert_eq!(critical.len(), 1);
        assert_eq!(critical[0].id, 1);
    }
1806
    // severity_breakdown groups pass/fail counts per severity level.
    #[test]
    fn test_quality_score_severity_breakdown() {
        let checklist = vec![
            ChecklistItem::new(1, "C1", Severity::Critical, true),
            ChecklistItem::new(2, "C2", Severity::Critical, false),
            ChecklistItem::new(3, "H1", Severity::High, true),
        ];
        let score = QualityScore::from_checklist(checklist);

        let critical_stats = score.severity_breakdown.get(&Severity::Critical).unwrap();
        assert_eq!(critical_stats.total, 2);
        assert_eq!(critical_stats.passed, 1);
        assert_eq!(critical_stats.failed, 1);

        let high_stats = score.severity_breakdown.get(&Severity::High).unwrap();
        assert_eq!(high_stats.total, 1);
        assert_eq!(high_stats.passed, 1);
    }

    // badge_url() points at shields.io and encodes both the label and a
    // grade-dependent color.
    #[test]
    fn test_quality_score_badge_url() {
        let checklist = vec![ChecklistItem::new(1, "Test", Severity::Critical, true)];
        let score = QualityScore::from_checklist(checklist);

        let badge = score.badge_url();
        assert!(badge.contains("shields.io"));
        assert!(badge.contains("data_quality"));
        assert!(badge.contains("brightgreen")); // Grade A
    }

    #[test]
    fn test_quality_score_badge_colors() {
        // Test each grade gets correct color
        let grades_colors = vec![
            (100.0, "brightgreen"), // A
            (90.0, "green"),        // B
            (75.0, "yellow"),       // C
            (55.0, "orange"),       // D
            (30.0, "red"),          // F
        ];

        for (target_score, expected_color) in grades_colors {
            // Create checklist that produces approximately the target score
            // using 10 equal-weight Medium items, `passed` of which pass.
            let target: f64 = target_score;
            #[allow(clippy::cast_sign_loss)] // target is always positive (30.0-100.0)
            let passed = (target / 100.0 * 10.0).round() as usize;
            let failed = 10 - passed;
            let mut checklist: Vec<ChecklistItem> = (0..passed)
                .map(|i| ChecklistItem::new(i as u8, "Test", Severity::Medium, true))
                .collect();
            checklist.extend(
                (0..failed).map(|i| {
                    ChecklistItem::new((passed + i) as u8, "Test", Severity::Medium, false)
                }),
            );

            let score = QualityScore::from_checklist(checklist);
            let badge = score.badge_url();
            assert!(
                badge.contains(expected_color),
                "Score {:.0} should have color {} but badge was {}",
                score.score,
                expected_color,
                badge
            );
        }
    }
1874
1875    #[test]
1876    fn test_quality_score_json_output() {
1877        let checklist = vec![
1878            ChecklistItem::new(1, "Schema check", Severity::Critical, true),
1879            ChecklistItem::new(2, "Column check", Severity::High, false)
1880                .with_suggestion("Add missing columns"),
1881        ];
1882        let score = QualityScore::from_checklist(checklist);
1883
1884        let json = score.to_json();
1885        assert!(json.contains("\"score\":"));
1886        assert!(json.contains("\"grade\":"));
1887        assert!(json.contains("\"is_publishable\":"));
1888        assert!(json.contains("\"failed_items\":"));
1889        assert!(json.contains("\"badge_url\":"));
1890        assert!(json.contains("Add missing columns"));
1891    }
1892
1893    #[test]
1894    fn test_quality_score_empty_checklist() {
1895        let checklist: Vec<ChecklistItem> = vec![];
1896        let score = QualityScore::from_checklist(checklist);
1897
1898        // Empty checklist = 100% (nothing to fail)
1899        assert!((score.score - 100.0).abs() < 0.01);
1900        assert_eq!(score.grade, LetterGrade::A);
1901    }
1902
1903    // ========== Quality Profile Tests (GH-10) ==========
1904
1905    #[test]
1906    fn test_quality_profile_default() {
1907        let profile = QualityProfile::default();
1908        assert_eq!(profile.name, "default");
1909        assert!(profile.expected_constant_columns.is_empty());
1910        assert!(profile.nullable_columns.is_empty());
1911        assert!((profile.max_null_ratio - 0.1).abs() < 0.001);
1912    }
1913
1914    #[test]
1915    fn test_quality_profile_doctest_corpus() {
1916        let profile = QualityProfile::doctest_corpus();
1917        assert_eq!(profile.name, "doctest-corpus");
1918        assert!(profile.is_expected_constant("source"));
1919        assert!(profile.is_expected_constant("version"));
1920        assert!(!profile.is_expected_constant("function"));
1921        assert!(profile.is_nullable("signature"));
1922        assert!(!profile.is_nullable("input"));
1923    }
1924
1925    #[test]
1926    fn test_quality_profile_ml_training() {
1927        let profile = QualityProfile::ml_training();
1928        assert_eq!(profile.name, "ml-training");
1929        assert!((profile.max_null_ratio - 0.0).abs() < 0.001);
1930        assert!((profile.max_duplicate_ratio - 0.8).abs() < 0.001);
1931    }
1932
1933    #[test]
1934    fn test_quality_profile_time_series() {
1935        let profile = QualityProfile::time_series();
1936        assert_eq!(profile.name, "time-series");
1937        assert!((profile.max_duplicate_row_ratio - 0.0).abs() < 0.001);
1938    }
1939
1940    #[test]
1941    fn test_quality_profile_by_name() {
1942        assert!(QualityProfile::by_name("default").is_some());
1943        assert!(QualityProfile::by_name("doctest-corpus").is_some());
1944        assert!(QualityProfile::by_name("doctest").is_some());
1945        assert!(QualityProfile::by_name("ml-training").is_some());
1946        assert!(QualityProfile::by_name("ml").is_some());
1947        assert!(QualityProfile::by_name("time-series").is_some());
1948        assert!(QualityProfile::by_name("timeseries").is_some());
1949        assert!(QualityProfile::by_name("nonexistent").is_none());
1950    }
1951
1952    #[test]
1953    fn test_quality_profile_available_profiles() {
1954        let profiles = QualityProfile::available_profiles();
1955        assert!(profiles.contains(&"default"));
1956        assert!(profiles.contains(&"doctest-corpus"));
1957        assert!(profiles.contains(&"ml-training"));
1958        assert!(profiles.contains(&"time-series"));
1959    }
1960
1961    #[test]
1962    fn test_quality_profile_builders() {
1963        let profile = QualityProfile::new("custom")
1964            .with_description("Custom profile")
1965            .with_expected_constant("id")
1966            .with_nullable("optional_field")
1967            .with_max_null_ratio(0.2)
1968            .with_max_duplicate_ratio(0.6);
1969
1970        assert_eq!(profile.name, "custom");
1971        assert_eq!(profile.description, "Custom profile");
1972        assert!(profile.is_expected_constant("id"));
1973        assert!(profile.is_nullable("optional_field"));
1974        assert!((profile.max_null_ratio - 0.2).abs() < 0.001);
1975        assert!((profile.max_duplicate_ratio - 0.6).abs() < 0.001);
1976    }
1977
1978    #[test]
1979    fn test_quality_profile_null_threshold_for() {
1980        let profile = QualityProfile::doctest_corpus();
1981
1982        // Nullable columns get 100% threshold
1983        assert!((profile.null_threshold_for("signature") - 1.0).abs() < 0.001);
1984
1985        // Non-nullable columns get profile threshold
1986        assert!((profile.null_threshold_for("input") - profile.max_null_ratio).abs() < 0.001);
1987    }
1988
1989    #[test]
1990    fn test_quality_profile_clone() {
1991        let profile = QualityProfile::doctest_corpus();
1992        let cloned = profile.clone();
1993        assert_eq!(profile.name, cloned.name);
1994        assert_eq!(
1995            profile.expected_constant_columns,
1996            cloned.expected_constant_columns
1997        );
1998    }
1999
2000    #[test]
2001    fn test_quality_profile_debug() {
2002        let profile = QualityProfile::default();
2003        let debug = format!("{:?}", profile);
2004        assert!(debug.contains("QualityProfile"));
2005        assert!(debug.contains("default"));
2006    }
2007}