alimentar/quality/mod.rs
1//! Data quality assessment for ML pipelines
2//!
3//! Detects data quality issues including missing values, outliers,
4//! duplicates, and schema problems.
5//!
6//! # 100-Point Quality Scoring System (GH-6)
7//!
8//! Based on the Toyota Way principles of Jidoka (built-in quality) and
9//! the Doctest Corpus QA Checklist for Publication.
10//!
11//! ## Severity Weights
12//! - **Critical (2.0x)**: Blocks publication - data integrity failures
13//! - **High (1.5x)**: Major issues requiring immediate attention
14//! - **Medium (1.0x)**: Standard issues to address before publication
15//! - **Low (0.5x)**: Minor issues, informational
16//!
17//! ## Letter Grades
18//! - **A (95-100)**: Publish immediately
19//! - **B (85-94)**: Publish with documented caveats
20//! - **C (70-84)**: Remediation required before publication
21//! - **D (50-69)**: Major rework needed
22//! - **F (<50)**: Do not publish
23//!
24//! # Example
25//!
26//! ```ignore
27//! use alimentar::quality::{QualityChecker, QualityScore};
28//!
29//! let checker = QualityChecker::new()
30//!     .max_null_ratio(0.1)
31//!     .max_duplicate_ratio(0.05);
32//!
33//! let report = checker.check(&dataset)?;
34//! let score = QualityScore::from_report(&report);
35//! println!("Grade: {} ({})", score.grade, score.score);
36//! ```
37//!
38//! # References
39//! - [1] Batini & Scannapieco (2016). Data and Information Quality.
40//! - [6] Hynes et al. (2017). The Data Linter. NIPS Workshop on ML Systems.
41
42// Lint allowances for statistical computation code and internal helper methods
43#![allow(clippy::cast_precision_loss)]
44#![allow(clippy::suboptimal_flops)]
45#![allow(clippy::unused_self)]
46#![allow(clippy::if_not_else)]
47
48mod checks;
49pub mod decontaminate;
50mod profiles;
51mod scoring;
52
53#[cfg(test)]
54mod tests;
55
56// Public re-exports of this module's API, grouped by submodule
57// Re-export check types
58pub use checks::{
59 ColumnQuality, NumericStats, QualityChecker, QualityIssue, QualityReport, QualityThresholds,
60 TextColumnStats,
61};
62// Re-export decontamination types
63pub use decontaminate::{
64 check_contamination, ngram_overlap, ContaminationResult, DecontaminationReport,
65};
66// Re-export profile types
67pub use profiles::QualityProfile;
68pub use scoring::{ChecklistItem, LetterGrade, QualityScore, Severity, SeverityStats};