datasynth_generators/data_quality/
missing_values.rs1use rand::Rng;
9use serde::{Deserialize, Serialize};
10use std::collections::{HashMap, HashSet};
11
12#[derive(Debug, Clone)]
14pub enum MissingValueStrategy {
15 MCAR {
17 probability: f64,
19 },
20 MAR {
22 base_probability: f64,
24 conditions: Vec<MissingCondition>,
26 },
27 MNAR {
29 value_patterns: Vec<MissingPattern>,
31 },
32 Systematic {
34 field_groups: Vec<Vec<String>>,
36 probability: f64,
38 },
39}
40
41impl Default for MissingValueStrategy {
42 fn default() -> Self {
43 MissingValueStrategy::MCAR { probability: 0.01 }
44 }
45}
46
47#[derive(Debug, Clone)]
49pub struct MissingCondition {
50 pub field: String,
52 pub condition_type: ConditionType,
54 pub multiplier: f64,
56}
57
/// Kinds of tests that can be applied to a field's string value.
#[derive(Debug, Clone)]
pub enum ConditionType {
    /// The value is exactly this string.
    Equals(String),
    /// The value contains this substring.
    Contains(String),
    /// The value is the empty string.
    IsEmpty,
    /// The value matches this pattern.
    Matches(String),
    /// The value parses as a number strictly greater than this threshold.
    GreaterThan(f64),
    /// The value parses as a number strictly less than this threshold.
    LessThan(f64),
}
74
75#[derive(Debug, Clone)]
77pub struct MissingPattern {
78 pub description: String,
80 pub field: String,
82 pub pattern_type: PatternType,
84 pub probability: f64,
86}
87
/// Kinds of value tests available to [`MissingPattern`].
#[derive(Debug, Clone)]
pub enum PatternType {
    /// Numeric values strictly above `threshold`.
    HighValues { threshold: f64 },
    /// Numeric values strictly below `threshold`.
    LowValues { threshold: f64 },
    /// Numeric values strictly below `low` or strictly above `high`.
    ExtremeValues { low: f64, high: f64 },
    /// Values containing any of the listed substrings.
    SensitivePatterns { patterns: Vec<String> },
}
100
101#[derive(Debug, Clone)]
103pub struct MissingValueConfig {
104 pub global_rate: f64,
106 pub field_rates: HashMap<String, f64>,
108 pub required_fields: HashSet<String>,
110 pub strategy: MissingValueStrategy,
112 pub track_statistics: bool,
114}
115
116impl Default for MissingValueConfig {
117 fn default() -> Self {
118 let mut required_fields = HashSet::new();
119 required_fields.insert("document_number".to_string());
121 required_fields.insert("company_code".to_string());
122 required_fields.insert("posting_date".to_string());
123 required_fields.insert("account_code".to_string());
124
125 Self {
126 global_rate: 0.01,
127 field_rates: HashMap::new(),
128 required_fields,
129 strategy: MissingValueStrategy::default(),
130 track_statistics: true,
131 }
132 }
133}
134
135impl MissingValueConfig {
136 pub fn with_field_rates(mut self, rates: HashMap<String, f64>) -> Self {
138 self.field_rates = rates;
139 self
140 }
141
142 pub fn with_required_field(mut self, field: &str) -> Self {
144 self.required_fields.insert(field.to_string());
145 self
146 }
147
148 pub fn with_strategy(mut self, strategy: MissingValueStrategy) -> Self {
150 self.strategy = strategy;
151 self
152 }
153
154 pub fn get_rate(&self, field: &str) -> f64 {
156 if self.required_fields.contains(field) {
157 return 0.0;
158 }
159 *self.field_rates.get(field).unwrap_or(&self.global_rate)
160 }
161}
162
163#[derive(Debug, Clone, Default, Serialize, Deserialize)]
165pub struct MissingValueStats {
166 pub total_fields: usize,
168 pub total_missing: usize,
170 pub by_field: HashMap<String, usize>,
172 pub records_with_missing: usize,
174 pub total_records: usize,
176}
177
178impl MissingValueStats {
179 pub fn overall_rate(&self) -> f64 {
181 if self.total_fields == 0 {
182 0.0
183 } else {
184 self.total_missing as f64 / self.total_fields as f64
185 }
186 }
187
188 pub fn field_rate(&self, field: &str, total_records: usize) -> f64 {
190 if total_records == 0 {
191 return 0.0;
192 }
193 *self.by_field.get(field).unwrap_or(&0) as f64 / total_records as f64
194 }
195}
196
197pub struct MissingValueInjector {
199 config: MissingValueConfig,
200 stats: MissingValueStats,
201}
202
203impl MissingValueInjector {
204 pub fn new(config: MissingValueConfig) -> Self {
206 Self {
207 config,
208 stats: MissingValueStats::default(),
209 }
210 }
211
212 pub fn should_be_missing<R: Rng>(
214 &mut self,
215 field: &str,
216 value: Option<&str>,
217 context: &HashMap<String, String>,
218 rng: &mut R,
219 ) -> bool {
220 if self.config.required_fields.contains(field) {
222 return false;
223 }
224
225 let probability = self.calculate_probability(field, value, context);
226
227 if self.config.track_statistics {
228 self.stats.total_fields += 1;
229 }
230
231 let is_missing = rng.random::<f64>() < probability;
232
233 if is_missing && self.config.track_statistics {
234 self.stats.total_missing += 1;
235 *self.stats.by_field.entry(field.to_string()).or_insert(0) += 1;
236 }
237
238 is_missing
239 }
240
241 fn calculate_probability(
243 &self,
244 field: &str,
245 value: Option<&str>,
246 context: &HashMap<String, String>,
247 ) -> f64 {
248 match &self.config.strategy {
249 MissingValueStrategy::MCAR { probability } => {
250 let base = self.config.get_rate(field);
252 if base > 0.0 {
253 base
254 } else {
255 *probability
256 }
257 }
258 MissingValueStrategy::MAR {
259 base_probability,
260 conditions,
261 } => {
262 let mut prob = *base_probability;
263
264 for condition in conditions {
265 if let Some(field_value) = context.get(&condition.field) {
266 if self.check_condition(&condition.condition_type, field_value) {
267 prob *= condition.multiplier;
268 }
269 }
270 }
271
272 prob.min(1.0)
273 }
274 MissingValueStrategy::MNAR { value_patterns } => {
275 if let Some(val) = value {
276 for pattern in value_patterns {
277 if pattern.field == field
278 && self.check_value_pattern(&pattern.pattern_type, val)
279 {
280 return pattern.probability;
281 }
282 }
283 }
284 self.config.get_rate(field)
285 }
286 MissingValueStrategy::Systematic {
287 field_groups,
288 probability,
289 } => {
290 for group in field_groups {
292 if group.contains(&field.to_string()) {
293 return *probability;
294 }
295 }
296 self.config.get_rate(field)
297 }
298 }
299 }
300
301 fn check_condition(&self, condition: &ConditionType, value: &str) -> bool {
303 match condition {
304 ConditionType::Equals(expected) => value == expected,
305 ConditionType::Contains(substring) => value.contains(substring),
306 ConditionType::IsEmpty => value.is_empty(),
307 ConditionType::Matches(pattern) => {
308 value.contains(pattern)
310 }
311 ConditionType::GreaterThan(threshold) => value
312 .parse::<f64>()
313 .map(|v| v > *threshold)
314 .unwrap_or(false),
315 ConditionType::LessThan(threshold) => value
316 .parse::<f64>()
317 .map(|v| v < *threshold)
318 .unwrap_or(false),
319 }
320 }
321
322 fn check_value_pattern(&self, pattern: &PatternType, value: &str) -> bool {
324 match pattern {
325 PatternType::HighValues { threshold } => value
326 .parse::<f64>()
327 .map(|v| v > *threshold)
328 .unwrap_or(false),
329 PatternType::LowValues { threshold } => value
330 .parse::<f64>()
331 .map(|v| v < *threshold)
332 .unwrap_or(false),
333 PatternType::ExtremeValues { low, high } => value
334 .parse::<f64>()
335 .map(|v| v < *low || v > *high)
336 .unwrap_or(false),
337 PatternType::SensitivePatterns { patterns } => {
338 patterns.iter().any(|p| value.contains(p))
339 }
340 }
341 }
342
343 pub fn record_processed(&mut self, had_missing: bool) {
345 if self.config.track_statistics {
346 self.stats.total_records += 1;
347 if had_missing {
348 self.stats.records_with_missing += 1;
349 }
350 }
351 }
352
353 pub fn stats(&self) -> &MissingValueStats {
355 &self.stats
356 }
357
358 pub fn reset_stats(&mut self) {
360 self.stats = MissingValueStats::default();
361 }
362}
363
/// The different ways a missing value can be written out.
#[derive(Debug, Clone, PartialEq)]
pub enum MissingValue {
    /// Rendered as an empty string.
    Null,
    /// Rendered as an empty string.
    Empty,
    /// Rendered as the contained marker text.
    Marker(String),
    /// Rendered as `N/A`.
    NA,
    /// Rendered as `-`.
    Dash,
    /// Rendered as `?`.
    Unknown,
}

impl MissingValue {
    /// Returns the textual form of this representation as an owned `String`.
    pub fn to_string_value(&self) -> String {
        let text: &str = match self {
            Self::Null | Self::Empty => "",
            Self::Marker(marker) => marker.as_str(),
            Self::NA => "N/A",
            Self::Dash => "-",
            Self::Unknown => "?",
        };
        text.to_owned()
    }

    /// Returns the fixed list of eight representations, in a stable order.
    pub fn common_representations() -> Vec<Self> {
        let mut all = vec![Self::Null, Self::Empty, Self::NA];
        all.extend(
            ["NULL", "NONE", "#N/A"]
                .iter()
                .map(|marker| Self::Marker(marker.to_string())),
        );
        all.push(Self::Dash);
        all.push(Self::Unknown);
        all
    }
}
408
409pub fn random_missing_representation<R: Rng>(rng: &mut R) -> MissingValue {
411 let representations = MissingValue::common_representations();
412 representations[rng.random_range(0..representations.len())].clone()
413}
414
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    /// Fixed seed so every test sees a reproducible random sequence.
    const SEED: u64 = 42;

    /// With a 50% rate, roughly half of 1000 draws should come out missing.
    #[test]
    fn test_mcar_strategy() {
        let mut injector = MissingValueInjector::new(MissingValueConfig {
            global_rate: 0.5,
            strategy: MissingValueStrategy::MCAR { probability: 0.5 },
            ..Default::default()
        });
        let mut rng = ChaCha8Rng::seed_from_u64(SEED);
        let context = HashMap::new();

        let missing_count = (0..1000)
            .filter(|_| injector.should_be_missing("description", Some("test"), &context, &mut rng))
            .count();

        assert!(missing_count > 400 && missing_count < 600);
    }

    /// Required fields are never missing, even when the global rate is 100%.
    #[test]
    fn test_required_fields() {
        let mut injector = MissingValueInjector::new(MissingValueConfig {
            global_rate: 1.0,
            ..Default::default()
        });
        let mut rng = ChaCha8Rng::seed_from_u64(SEED);
        let context = HashMap::new();

        let required_missing =
            injector.should_be_missing("document_number", Some("JE001"), &context, &mut rng);
        assert!(!required_missing);

        let optional_missing =
            injector.should_be_missing("description", Some("test"), &context, &mut rng);
        assert!(optional_missing);
    }

    /// Per-field rates of 0.0 and 1.0 produce never-missing and always-missing fields.
    #[test]
    fn test_field_specific_rates() {
        let field_rates: HashMap<String, f64> = [
            ("description".to_string(), 0.0),
            ("cost_center".to_string(), 1.0),
        ]
        .into_iter()
        .collect();

        let mut injector =
            MissingValueInjector::new(MissingValueConfig::default().with_field_rates(field_rates));
        let mut rng = ChaCha8Rng::seed_from_u64(SEED);
        let context = HashMap::new();

        let description_missing =
            injector.should_be_missing("description", Some("test"), &context, &mut rng);
        assert!(!description_missing);

        let cost_center_missing =
            injector.should_be_missing("cost_center", Some("CC001"), &context, &mut rng);
        assert!(cost_center_missing);
    }

    /// Statistics count every decision and at least one miss at a 50% rate.
    #[test]
    fn test_statistics() {
        let mut injector = MissingValueInjector::new(MissingValueConfig {
            global_rate: 0.5,
            track_statistics: true,
            ..Default::default()
        });
        let mut rng = ChaCha8Rng::seed_from_u64(SEED);
        let context = HashMap::new();

        (0..100).for_each(|_| {
            injector.should_be_missing("description", Some("test"), &context, &mut rng);
        });

        let stats = injector.stats();
        assert_eq!(stats.total_fields, 100);
        assert!(stats.total_missing > 0);
    }
}