// alimentar/quality/profiles.rs
use std::collections::HashSet;

/// A named bundle of thresholds and column expectations describing what
/// "acceptable quality" means for a dataset.
///
/// Every field is public, so a profile can be tweaked directly as well as
/// through the `with_*` builder methods.
#[derive(Debug, Clone)]
pub struct QualityProfile {
    /// Identifier of the profile (for example `"default"`).
    pub name: String,
    /// Human-readable description of the profile.
    pub description: String,
    /// Names of columns that are expected to be constant; queried through
    /// `is_expected_constant`.
    pub expected_constant_columns: HashSet<String>,
    /// Names of columns that are allowed to be null; `null_threshold_for`
    /// returns `1.0` for these columns.
    pub nullable_columns: HashSet<String>,
    /// Null-ratio threshold returned by `null_threshold_for` for every column
    /// that is not listed in `nullable_columns`.
    pub max_null_ratio: f64,
    /// Duplicate-ratio threshold.
    /// NOTE(review): presumably applied per column; the consumer is not in this file.
    pub max_duplicate_ratio: f64,
    /// Minimum cardinality threshold.
    /// NOTE(review): the code that enforces it is not in this file.
    pub min_cardinality: usize,
    /// Outlier-ratio threshold.
    /// NOTE(review): the code that enforces it is not in this file.
    pub max_outlier_ratio: f64,
    /// Duplicate-row-ratio threshold.
    /// NOTE(review): presumably applied to whole rows; confirm against the checker.
    pub max_duplicate_row_ratio: f64,
    /// Flag that, judging by its name, makes constant columns outside
    /// `expected_constant_columns` count against the dataset — TODO confirm
    /// against the scoring code.
    pub penalize_unexpected_constants: bool,
    /// Flag requesting that a signature be required.
    /// NOTE(review): semantics are defined by the consumer of the profile.
    pub require_signature: bool,
}
46
47impl Default for QualityProfile {
48 fn default() -> Self {
49 Self {
50 name: "default".to_string(),
51 description: "General-purpose quality profile".to_string(),
52 expected_constant_columns: HashSet::new(),
53 nullable_columns: HashSet::new(),
54 max_null_ratio: 0.1,
55 max_duplicate_ratio: 0.5,
56 min_cardinality: 2,
57 max_outlier_ratio: 0.05,
58 max_duplicate_row_ratio: 0.01,
59 penalize_unexpected_constants: true,
60 require_signature: false,
61 }
62 }
63}
64
65impl QualityProfile {
66 #[must_use]
68 pub fn new(name: impl Into<String>) -> Self {
69 Self {
70 name: name.into(),
71 ..Default::default()
72 }
73 }
74
75 #[must_use]
77 pub fn by_name(name: &str) -> Option<Self> {
78 match name {
79 "default" => Some(Self::default()),
80 "doctest-corpus" | "doctest" => Some(Self::doctest_corpus()),
81 "ml-training" | "ml" => Some(Self::ml_training()),
82 "time-series" | "timeseries" => Some(Self::time_series()),
83 _ => None,
84 }
85 }
86
87 #[must_use]
89 pub fn available_profiles() -> Vec<&'static str> {
90 vec!["default", "doctest-corpus", "ml-training", "time-series"]
91 }
92
93 #[must_use]
100 pub fn doctest_corpus() -> Self {
101 let mut expected_constants = HashSet::new();
102 expected_constants.insert("source".to_string());
103 expected_constants.insert("version".to_string());
104
105 let mut nullable = HashSet::new();
106 nullable.insert("signature".to_string()); Self {
109 name: "doctest-corpus".to_string(),
110 description: "Profile for Python doctest extraction datasets".to_string(),
111 expected_constant_columns: expected_constants,
112 nullable_columns: nullable,
113 max_null_ratio: 0.05, max_duplicate_ratio: 0.3, min_cardinality: 2,
116 max_outlier_ratio: 0.05,
117 max_duplicate_row_ratio: 0.0, penalize_unexpected_constants: true,
119 require_signature: false, }
121 }
122
123 #[must_use]
130 pub fn ml_training() -> Self {
131 Self {
132 name: "ml-training".to_string(),
133 description: "Profile for machine learning training datasets".to_string(),
134 expected_constant_columns: HashSet::new(),
135 nullable_columns: HashSet::new(),
136 max_null_ratio: 0.0, max_duplicate_ratio: 0.8, min_cardinality: 2,
139 max_outlier_ratio: 0.1, max_duplicate_row_ratio: 0.01,
141 penalize_unexpected_constants: true,
142 require_signature: false,
143 }
144 }
145
146 #[must_use]
152 pub fn time_series() -> Self {
153 Self {
154 name: "time-series".to_string(),
155 description: "Profile for time series datasets".to_string(),
156 expected_constant_columns: HashSet::new(),
157 nullable_columns: HashSet::new(),
158 max_null_ratio: 0.05,
159 max_duplicate_ratio: 0.5,
160 min_cardinality: 2,
161 max_outlier_ratio: 0.1, max_duplicate_row_ratio: 0.0, penalize_unexpected_constants: true,
164 require_signature: false,
165 }
166 }
167
168 #[must_use]
170 pub fn with_description(mut self, desc: impl Into<String>) -> Self {
171 self.description = desc.into();
172 self
173 }
174
175 #[must_use]
177 pub fn with_expected_constant(mut self, column: impl Into<String>) -> Self {
178 self.expected_constant_columns.insert(column.into());
179 self
180 }
181
182 #[must_use]
184 pub fn with_nullable(mut self, column: impl Into<String>) -> Self {
185 self.nullable_columns.insert(column.into());
186 self
187 }
188
189 #[must_use]
191 pub fn with_max_null_ratio(mut self, ratio: f64) -> Self {
192 self.max_null_ratio = ratio;
193 self
194 }
195
196 #[must_use]
198 pub fn with_max_duplicate_ratio(mut self, ratio: f64) -> Self {
199 self.max_duplicate_ratio = ratio;
200 self
201 }
202
203 #[must_use]
205 pub fn is_expected_constant(&self, column: &str) -> bool {
206 self.expected_constant_columns.contains(column)
207 }
208
209 #[must_use]
211 pub fn is_nullable(&self, column: &str) -> bool {
212 self.nullable_columns.contains(column)
213 }
214
215 #[must_use]
217 pub fn null_threshold_for(&self, column: &str) -> f64 {
218 if self.is_nullable(column) {
219 1.0 } else {
221 self.max_null_ratio
222 }
223 }
224}