Skip to main content

alimentar/quality/
profiles.rs

//! Quality Profiles (GH-10)
//!
//! Quality profile for customizing scoring rules per data type.

use std::collections::HashSet;

/// Quality profile for customizing scoring rules per data type.
///
/// Different data types (doctest corpora, ML training sets, time series, etc.)
/// have different expectations. For example:
/// - Doctest corpus: `source` and `version` columns are expected to be constant
/// - ML training: features should have high variance, labels can be categorical
/// - Time series: timestamps should be unique and sequential
///
/// A profile is plain configuration: a name, a description, two sets of
/// column names and a handful of thresholds. All fields are public, and the
/// `with_*` methods offer a chaining style for the most commonly tuned ones.
///
/// # Example
///
/// ```ignore
/// let profile = QualityProfile::doctest_corpus()
///     .with_nullable("notes")
///     .with_max_duplicate_ratio(0.4);
///
/// assert!(profile.is_expected_constant("source"));
/// assert_eq!(profile.null_threshold_for("notes"), 1.0);
/// ```
#[derive(Debug, Clone)]
pub struct QualityProfile {
    /// Profile name for display
    pub name: String,
    /// Description of what this profile is for
    pub description: String,
    /// Columns that are expected to be constant (not penalized)
    pub expected_constant_columns: HashSet<String>,
    /// Columns where high null ratio is acceptable
    pub nullable_columns: HashSet<String>,
    /// Maximum acceptable null ratio (default: 0.1)
    pub max_null_ratio: f64,
    /// Maximum acceptable duplicate ratio (default: 0.5)
    pub max_duplicate_ratio: f64,
    /// Minimum cardinality before flagging as low (default: 2)
    pub min_cardinality: usize,
    /// Maximum outlier ratio to report (default: 0.05)
    pub max_outlier_ratio: f64,
    /// Maximum duplicate row ratio (default: 0.01)
    pub max_duplicate_row_ratio: f64,
    /// Whether to penalize constant columns not in expected list
    pub penalize_unexpected_constants: bool,
    /// Whether this profile requires a signature column (for doctest)
    pub require_signature: bool,
}

impl Default for QualityProfile {
    /// Build the general-purpose profile named `"default"`.
    ///
    /// Both column sets start empty, and the thresholds are the ones quoted
    /// as "default" in the field documentation of [`QualityProfile`].
    fn default() -> Self {
        Self {
            name: "default".to_string(),
            description: "General-purpose quality profile".to_string(),
            expected_constant_columns: HashSet::new(),
            nullable_columns: HashSet::new(),
            max_null_ratio: 0.1,
            max_duplicate_ratio: 0.5,
            min_cardinality: 2,
            max_outlier_ratio: 0.05,
            max_duplicate_row_ratio: 0.01,
            penalize_unexpected_constants: true,
            require_signature: false,
        }
    }
}

impl QualityProfile {
    /// Create a profile with a custom name.
    ///
    /// Every other field, including the description, takes its value from
    /// [`QualityProfile::default`].
    #[must_use]
    pub fn new(name: impl Into<String>) -> Self {
        Self {
            name: name.into(),
            ..Default::default()
        }
    }

    /// Look up a built-in profile by name.
    ///
    /// Recognized names (matched exactly, case-sensitive):
    /// - `"default"`
    /// - `"doctest-corpus"` or `"doctest"`
    /// - `"ml-training"` or `"ml"`
    /// - `"time-series"` or `"timeseries"`
    ///
    /// Returns `None` for anything else.
    #[must_use]
    pub fn by_name(name: &str) -> Option<Self> {
        match name {
            "default" => Some(Self::default()),
            "doctest-corpus" | "doctest" => Some(Self::doctest_corpus()),
            "ml-training" | "ml" => Some(Self::ml_training()),
            "time-series" | "timeseries" => Some(Self::time_series()),
            _ => None,
        }
    }

    /// List the canonical names of the built-in profiles.
    ///
    /// Only the long-form names are returned; the short aliases accepted by
    /// [`QualityProfile::by_name`] are not included.
    #[must_use]
    pub fn available_profiles() -> Vec<&'static str> {
        vec!["default", "doctest-corpus", "ml-training", "time-series"]
    }

    /// Turn a list of column-name literals into an owned set.
    fn column_set(names: &[&str]) -> HashSet<String> {
        names.iter().map(|name| (*name).to_string()).collect()
    }

    /// Doctest corpus profile - for Python doctest extraction datasets.
    ///
    /// Expects:
    /// - `source` and `version` columns to be constant (single crate/version)
    /// - `signature` column may have nulls (module-level doctests)
    /// - `input`, `expected`, `function` should be non-null
    ///
    /// Every field is spelled out rather than inherited from `Default`, so
    /// this preset does not shift if the defaults are retuned.
    #[must_use]
    pub fn doctest_corpus() -> Self {
        Self {
            name: "doctest-corpus".to_string(),
            description: "Profile for Python doctest extraction datasets".to_string(),
            expected_constant_columns: Self::column_set(&["source", "version"]),
            // Module-level doctests have no signature.
            nullable_columns: Self::column_set(&["signature"]),
            max_null_ratio: 0.05,     // Stricter for doctest data
            max_duplicate_ratio: 0.3, // Some duplicate inputs are normal
            min_cardinality: 2,
            max_outlier_ratio: 0.05,
            max_duplicate_row_ratio: 0.0, // No exact duplicate rows allowed
            penalize_unexpected_constants: true,
            require_signature: false, // Relaxed - signature nulls are OK for module doctests
        }
    }

    /// ML training profile - for machine learning datasets.
    ///
    /// Expects:
    /// - Features to have reasonable variance
    /// - Labels can be categorical (low cardinality OK)
    /// - No null values in features or labels
    ///
    /// Every field is spelled out rather than inherited from `Default`, so
    /// this preset does not shift if the defaults are retuned.
    #[must_use]
    pub fn ml_training() -> Self {
        Self {
            name: "ml-training".to_string(),
            description: "Profile for machine learning training datasets".to_string(),
            expected_constant_columns: HashSet::new(),
            nullable_columns: HashSet::new(),
            max_null_ratio: 0.0,      // No nulls allowed in training data
            max_duplicate_ratio: 0.8, // Higher tolerance for categorical features
            min_cardinality: 2,
            max_outlier_ratio: 0.1, // More tolerant of outliers
            max_duplicate_row_ratio: 0.01,
            penalize_unexpected_constants: true,
            require_signature: false,
        }
    }

    /// Time series profile - for temporal data.
    ///
    /// Expects:
    /// - Timestamp column should be unique
    /// - Data should have temporal patterns
    ///
    /// Every field is spelled out rather than inherited from `Default`, so
    /// this preset does not shift if the defaults are retuned.
    #[must_use]
    pub fn time_series() -> Self {
        Self {
            name: "time-series".to_string(),
            description: "Profile for time series datasets".to_string(),
            expected_constant_columns: HashSet::new(),
            nullable_columns: HashSet::new(),
            max_null_ratio: 0.05,
            max_duplicate_ratio: 0.5,
            min_cardinality: 2,
            max_outlier_ratio: 0.1,       // Time series often have outliers
            max_duplicate_row_ratio: 0.0, // No duplicate rows (each timestamp unique)
            penalize_unexpected_constants: true,
            require_signature: false,
        }
    }

    /// Replace the description and return the profile for further chaining.
    #[must_use]
    pub fn with_description(mut self, desc: impl Into<String>) -> Self {
        self.description = desc.into();
        self
    }

    /// Add `column` to the set of expected-constant columns.
    ///
    /// Adding a name that is already present leaves the set unchanged.
    #[must_use]
    pub fn with_expected_constant(mut self, column: impl Into<String>) -> Self {
        self.expected_constant_columns.insert(column.into());
        self
    }

    /// Add `column` to the set of nullable columns.
    ///
    /// Adding a name that is already present leaves the set unchanged.
    #[must_use]
    pub fn with_nullable(mut self, column: impl Into<String>) -> Self {
        self.nullable_columns.insert(column.into());
        self
    }

    /// Set the maximum acceptable null ratio.
    ///
    /// The value is stored as given; no range validation is performed here.
    #[must_use]
    pub fn with_max_null_ratio(mut self, ratio: f64) -> Self {
        self.max_null_ratio = ratio;
        self
    }

    /// Set the maximum acceptable duplicate ratio.
    ///
    /// The value is stored as given; no range validation is performed here.
    #[must_use]
    pub fn with_max_duplicate_ratio(mut self, ratio: f64) -> Self {
        self.max_duplicate_ratio = ratio;
        self
    }

    /// Check if a column is expected to be constant.
    ///
    /// The lookup is an exact, case-sensitive match on the column name.
    #[must_use]
    pub fn is_expected_constant(&self, column: &str) -> bool {
        self.expected_constant_columns.contains(column)
    }

    /// Check if a column is allowed to have nulls.
    ///
    /// The lookup is an exact, case-sensitive match on the column name.
    #[must_use]
    pub fn is_nullable(&self, column: &str) -> bool {
        self.nullable_columns.contains(column)
    }

    /// Get the effective null threshold for a column.
    ///
    /// Returns `1.0` for columns registered as nullable (up to 100% nulls are
    /// allowed), and [`QualityProfile::max_null_ratio`] for every other
    /// column.
    #[must_use]
    pub fn null_threshold_for(&self, column: &str) -> f64 {
        if self.is_nullable(column) {
            1.0
        } else {
            self.max_null_ratio
        }
    }
}