datasynth_generators/data_quality/
missing_values.rs1use rand::Rng;
9use std::collections::{HashMap, HashSet};
10
11#[derive(Debug, Clone)]
13pub enum MissingValueStrategy {
14 MCAR {
16 probability: f64,
18 },
19 MAR {
21 base_probability: f64,
23 conditions: Vec<MissingCondition>,
25 },
26 MNAR {
28 value_patterns: Vec<MissingPattern>,
30 },
31 Systematic {
33 field_groups: Vec<Vec<String>>,
35 probability: f64,
37 },
38}
39
40impl Default for MissingValueStrategy {
41 fn default() -> Self {
42 MissingValueStrategy::MCAR { probability: 0.01 }
43 }
44}
45
46#[derive(Debug, Clone)]
48pub struct MissingCondition {
49 pub field: String,
51 pub condition_type: ConditionType,
53 pub multiplier: f64,
55}
56
/// Test applied to a field value when evaluating a `MissingCondition`.
#[derive(Debug, Clone)]
pub enum ConditionType {
    /// The value is exactly equal to the given string.
    Equals(String),
    /// The value contains the given substring.
    Contains(String),
    /// The value is the empty string.
    IsEmpty,
    /// The value matches the given pattern. The evaluator in this file
    /// currently treats the pattern as a plain substring, not a regex.
    Matches(String),
    /// The value parses as a number strictly greater than the threshold.
    GreaterThan(f64),
    /// The value parses as a number strictly less than the threshold.
    LessThan(f64),
}
73
74#[derive(Debug, Clone)]
76pub struct MissingPattern {
77 pub description: String,
79 pub field: String,
81 pub pattern_type: PatternType,
83 pub probability: f64,
85}
86
87#[derive(Debug, Clone)]
89pub enum PatternType {
90 HighValues { threshold: f64 },
92 LowValues { threshold: f64 },
94 ExtremeValues { low: f64, high: f64 },
96 SensitivePatterns { patterns: Vec<String> },
98}
99
100#[derive(Debug, Clone)]
102pub struct MissingValueConfig {
103 pub global_rate: f64,
105 pub field_rates: HashMap<String, f64>,
107 pub required_fields: HashSet<String>,
109 pub strategy: MissingValueStrategy,
111 pub track_statistics: bool,
113}
114
115impl Default for MissingValueConfig {
116 fn default() -> Self {
117 let mut required_fields = HashSet::new();
118 required_fields.insert("document_number".to_string());
120 required_fields.insert("company_code".to_string());
121 required_fields.insert("posting_date".to_string());
122 required_fields.insert("account_code".to_string());
123
124 Self {
125 global_rate: 0.01,
126 field_rates: HashMap::new(),
127 required_fields,
128 strategy: MissingValueStrategy::default(),
129 track_statistics: true,
130 }
131 }
132}
133
134impl MissingValueConfig {
135 pub fn with_field_rates(mut self, rates: HashMap<String, f64>) -> Self {
137 self.field_rates = rates;
138 self
139 }
140
141 pub fn with_required_field(mut self, field: &str) -> Self {
143 self.required_fields.insert(field.to_string());
144 self
145 }
146
147 pub fn with_strategy(mut self, strategy: MissingValueStrategy) -> Self {
149 self.strategy = strategy;
150 self
151 }
152
153 pub fn get_rate(&self, field: &str) -> f64 {
155 if self.required_fields.contains(field) {
156 return 0.0;
157 }
158 *self.field_rates.get(field).unwrap_or(&self.global_rate)
159 }
160}
161
/// Counters describing the missing-value decisions made so far.
#[derive(Debug, Clone, Default)]
pub struct MissingValueStats {
    /// Number of field values that were evaluated.
    pub total_fields: usize,
    /// Number of evaluated values that were made missing.
    pub total_missing: usize,
    /// Missing count per field name.
    pub by_field: HashMap<String, usize>,
    /// Number of processed records that had at least one missing value.
    pub records_with_missing: usize,
    /// Number of records processed.
    pub total_records: usize,
}

impl MissingValueStats {
    /// Fraction of evaluated field values that were made missing, or `0.0`
    /// when nothing has been evaluated yet.
    pub fn overall_rate(&self) -> f64 {
        match self.total_fields {
            0 => 0.0,
            evaluated => self.total_missing as f64 / evaluated as f64,
        }
    }

    /// Missing count of `field` divided by the caller-supplied
    /// `total_records`. Returns `0.0` when `total_records` is zero; a field
    /// with no recorded misses also yields `0.0`.
    pub fn field_rate(&self, field: &str, total_records: usize) -> f64 {
        if total_records == 0 {
            return 0.0;
        }
        let missing = self.by_field.get(field).copied().unwrap_or(0);
        missing as f64 / total_records as f64
    }
}
195
196pub struct MissingValueInjector {
198 config: MissingValueConfig,
199 stats: MissingValueStats,
200}
201
202impl MissingValueInjector {
203 pub fn new(config: MissingValueConfig) -> Self {
205 Self {
206 config,
207 stats: MissingValueStats::default(),
208 }
209 }
210
211 pub fn should_be_missing<R: Rng>(
213 &mut self,
214 field: &str,
215 value: Option<&str>,
216 context: &HashMap<String, String>,
217 rng: &mut R,
218 ) -> bool {
219 if self.config.required_fields.contains(field) {
221 return false;
222 }
223
224 let probability = self.calculate_probability(field, value, context);
225
226 if self.config.track_statistics {
227 self.stats.total_fields += 1;
228 }
229
230 let is_missing = rng.gen::<f64>() < probability;
231
232 if is_missing && self.config.track_statistics {
233 self.stats.total_missing += 1;
234 *self.stats.by_field.entry(field.to_string()).or_insert(0) += 1;
235 }
236
237 is_missing
238 }
239
240 fn calculate_probability(
242 &self,
243 field: &str,
244 value: Option<&str>,
245 context: &HashMap<String, String>,
246 ) -> f64 {
247 match &self.config.strategy {
248 MissingValueStrategy::MCAR { probability } => {
249 let base = self.config.get_rate(field);
251 if base > 0.0 {
252 base
253 } else {
254 *probability
255 }
256 }
257 MissingValueStrategy::MAR {
258 base_probability,
259 conditions,
260 } => {
261 let mut prob = *base_probability;
262
263 for condition in conditions {
264 if let Some(field_value) = context.get(&condition.field) {
265 if self.check_condition(&condition.condition_type, field_value) {
266 prob *= condition.multiplier;
267 }
268 }
269 }
270
271 prob.min(1.0)
272 }
273 MissingValueStrategy::MNAR { value_patterns } => {
274 if let Some(val) = value {
275 for pattern in value_patterns {
276 if pattern.field == field
277 && self.check_value_pattern(&pattern.pattern_type, val)
278 {
279 return pattern.probability;
280 }
281 }
282 }
283 self.config.get_rate(field)
284 }
285 MissingValueStrategy::Systematic {
286 field_groups,
287 probability,
288 } => {
289 for group in field_groups {
291 if group.contains(&field.to_string()) {
292 return *probability;
293 }
294 }
295 self.config.get_rate(field)
296 }
297 }
298 }
299
300 fn check_condition(&self, condition: &ConditionType, value: &str) -> bool {
302 match condition {
303 ConditionType::Equals(expected) => value == expected,
304 ConditionType::Contains(substring) => value.contains(substring),
305 ConditionType::IsEmpty => value.is_empty(),
306 ConditionType::Matches(pattern) => {
307 value.contains(pattern)
309 }
310 ConditionType::GreaterThan(threshold) => value
311 .parse::<f64>()
312 .map(|v| v > *threshold)
313 .unwrap_or(false),
314 ConditionType::LessThan(threshold) => value
315 .parse::<f64>()
316 .map(|v| v < *threshold)
317 .unwrap_or(false),
318 }
319 }
320
321 fn check_value_pattern(&self, pattern: &PatternType, value: &str) -> bool {
323 match pattern {
324 PatternType::HighValues { threshold } => value
325 .parse::<f64>()
326 .map(|v| v > *threshold)
327 .unwrap_or(false),
328 PatternType::LowValues { threshold } => value
329 .parse::<f64>()
330 .map(|v| v < *threshold)
331 .unwrap_or(false),
332 PatternType::ExtremeValues { low, high } => value
333 .parse::<f64>()
334 .map(|v| v < *low || v > *high)
335 .unwrap_or(false),
336 PatternType::SensitivePatterns { patterns } => {
337 patterns.iter().any(|p| value.contains(p))
338 }
339 }
340 }
341
342 pub fn record_processed(&mut self, had_missing: bool) {
344 if self.config.track_statistics {
345 self.stats.total_records += 1;
346 if had_missing {
347 self.stats.records_with_missing += 1;
348 }
349 }
350 }
351
352 pub fn stats(&self) -> &MissingValueStats {
354 &self.stats
355 }
356
357 pub fn reset_stats(&mut self) {
359 self.stats = MissingValueStats::default();
360 }
361}
362
/// The different ways a missing value can be written out.
#[derive(Debug, Clone, PartialEq)]
pub enum MissingValue {
    /// A null value; rendered as an empty string.
    Null,
    /// An empty string.
    Empty,
    /// A custom marker text, rendered verbatim.
    Marker(String),
    /// Rendered as `N/A`.
    NA,
    /// Rendered as `-`.
    Dash,
    /// Rendered as `?`.
    Unknown,
}

impl MissingValue {
    /// Returns the text this representation is written as.
    ///
    /// `Null` and `Empty` both produce an empty string.
    pub fn to_string_value(&self) -> String {
        let text: &str = match self {
            MissingValue::Null | MissingValue::Empty => "",
            MissingValue::Marker(marker) => marker.as_str(),
            MissingValue::NA => "N/A",
            MissingValue::Dash => "-",
            MissingValue::Unknown => "?",
        };
        text.to_owned()
    }

    /// Returns the fixed list of representations used when picking one at
    /// random, in a stable order.
    pub fn common_representations() -> Vec<Self> {
        let marker = |text: &str| MissingValue::Marker(text.to_string());
        vec![
            MissingValue::Null,
            MissingValue::Empty,
            MissingValue::NA,
            marker("NULL"),
            marker("NONE"),
            marker("#N/A"),
            MissingValue::Dash,
            MissingValue::Unknown,
        ]
    }
}
407
408pub fn random_missing_representation<R: Rng>(rng: &mut R) -> MissingValue {
410 let representations = MissingValue::common_representations();
411 representations[rng.gen_range(0..representations.len())].clone()
412}
413
#[cfg(test)]
mod tests {
    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    /// Builds the injector, the fixed-seed RNG and the empty record context
    /// shared by every test.
    fn fixture(
        config: MissingValueConfig,
    ) -> (MissingValueInjector, ChaCha8Rng, HashMap<String, String>) {
        (
            MissingValueInjector::new(config),
            ChaCha8Rng::seed_from_u64(42),
            HashMap::new(),
        )
    }

    #[test]
    fn test_mcar_strategy() {
        // 50% everywhere so the outcome count is easy to bound.
        let (mut injector, mut rng, context) = fixture(MissingValueConfig {
            global_rate: 0.5,
            strategy: MissingValueStrategy::MCAR { probability: 0.5 },
            ..Default::default()
        });

        let missing_count = (0..1000)
            .filter(|_| injector.should_be_missing("description", Some("test"), &context, &mut rng))
            .count();

        // Strictly between 400 and 600 out of 1000 draws.
        assert!((401..600).contains(&missing_count));
    }

    #[test]
    fn test_required_fields() {
        // A rate of 1.0 makes every non-required field missing.
        let (mut injector, mut rng, context) = fixture(MissingValueConfig {
            global_rate: 1.0,
            ..Default::default()
        });

        // Required fields are never missing, whatever the rate.
        let required = injector.should_be_missing("document_number", Some("JE001"), &context, &mut rng);
        assert!(!required);

        // Any other field is.
        let optional = injector.should_be_missing("description", Some("test"), &context, &mut rng);
        assert!(optional);
    }

    #[test]
    fn test_field_specific_rates() {
        let field_rates: HashMap<String, f64> = vec![
            ("description".to_string(), 0.0),
            ("cost_center".to_string(), 1.0),
        ]
        .into_iter()
        .collect();

        let (mut injector, mut rng, context) =
            fixture(MissingValueConfig::default().with_field_rates(field_rates));

        // A 0.0 rate is expected to keep the value.
        let description = injector.should_be_missing("description", Some("test"), &context, &mut rng);
        assert!(!description);

        // A 1.0 rate is expected to drop it.
        let cost_center = injector.should_be_missing("cost_center", Some("CC001"), &context, &mut rng);
        assert!(cost_center);
    }

    #[test]
    fn test_statistics() {
        let (mut injector, mut rng, context) = fixture(MissingValueConfig {
            global_rate: 0.5,
            track_statistics: true,
            ..Default::default()
        });

        for _ in 0..100 {
            injector.should_be_missing("description", Some("test"), &context, &mut rng);
        }

        let stats = injector.stats();
        assert_eq!(stats.total_fields, 100);
        assert!(stats.total_missing > 0);
    }
}