Skip to main content

datasynth_generators/data_quality/
missing_values.rs

//! Missing value injection for data quality simulation.
//!
//! Simulates realistic missing data patterns including:
//! - Random missing values (MCAR - Missing Completely At Random)
//! - Conditional missing values (MAR - Missing At Random)
//! - Value-dependent missing values (MNAR - Missing Not At Random)
//! - Systematic missing values (groups of fields missing for certain records)
7
8use rand::Rng;
9use std::collections::{HashMap, HashSet};
10
11/// Strategy for missing value injection.
12#[derive(Debug, Clone)]
13pub enum MissingValueStrategy {
14    /// Missing Completely At Random - each value has equal probability of being missing.
15    MCAR {
16        /// Probability of a value being missing (0.0 - 1.0).
17        probability: f64,
18    },
19    /// Missing At Random - probability depends on other observed values.
20    MAR {
21        /// Base probability.
22        base_probability: f64,
23        /// Conditions that increase probability.
24        conditions: Vec<MissingCondition>,
25    },
26    /// Missing Not At Random - probability depends on the value itself.
27    MNAR {
28        /// Missing probability for specific value ranges/patterns.
29        value_patterns: Vec<MissingPattern>,
30    },
31    /// Systematic missing - entire fields missing for certain records.
32    Systematic {
33        /// Fields that are systematically missing together.
34        field_groups: Vec<Vec<String>>,
35        /// Probability of group being missing.
36        probability: f64,
37    },
38}
39
40impl Default for MissingValueStrategy {
41    fn default() -> Self {
42        MissingValueStrategy::MCAR { probability: 0.01 }
43    }
44}
45
46/// Condition for MAR missing values.
47#[derive(Debug, Clone)]
48pub struct MissingCondition {
49    /// Field to check.
50    pub field: String,
51    /// Condition type.
52    pub condition_type: ConditionType,
53    /// Probability multiplier when condition is met.
54    pub multiplier: f64,
55}
56
/// Predicate evaluated against a field value for MAR conditions.
#[derive(Debug, Clone)]
pub enum ConditionType {
    /// Holds when the field is exactly this value.
    Equals(String),
    /// Holds when the field contains this substring.
    Contains(String),
    /// Holds when the field is empty.
    IsEmpty,
    /// Holds when the field matches this pattern.
    Matches(String),
    /// Holds when the numeric field is greater than the threshold.
    GreaterThan(f64),
    /// Holds when the numeric field is less than the threshold.
    LessThan(f64),
}
73
74/// Pattern for MNAR missing values.
75#[derive(Debug, Clone)]
76pub struct MissingPattern {
77    /// Description of the pattern.
78    pub description: String,
79    /// Field to check.
80    pub field: String,
81    /// Pattern type.
82    pub pattern_type: PatternType,
83    /// Probability when pattern matches.
84    pub probability: f64,
85}
86
/// Kind of value an MNAR pattern targets.
#[derive(Debug, Clone)]
pub enum PatternType {
    /// Values above `threshold` tend to be missing.
    HighValues { threshold: f64 },
    /// Values below `threshold` tend to be missing.
    LowValues { threshold: f64 },
    /// Outliers, i.e. values below `low` or above `high`, tend to be missing.
    ExtremeValues { low: f64, high: f64 },
    /// Values containing any of the sensitive `patterns` tend to be missing.
    SensitivePatterns { patterns: Vec<String> },
}
99
100/// Configuration for missing values by field.
101#[derive(Debug, Clone)]
102pub struct MissingValueConfig {
103    /// Global missing rate (default for fields not specified).
104    pub global_rate: f64,
105    /// Field-specific missing rates.
106    pub field_rates: HashMap<String, f64>,
107    /// Fields that should never be missing (required fields).
108    pub required_fields: HashSet<String>,
109    /// Strategy for missing value injection.
110    pub strategy: MissingValueStrategy,
111    /// Whether to track missing value statistics.
112    pub track_statistics: bool,
113}
114
115impl Default for MissingValueConfig {
116    fn default() -> Self {
117        let mut required_fields = HashSet::new();
118        // Common required fields in accounting data
119        required_fields.insert("document_number".to_string());
120        required_fields.insert("company_code".to_string());
121        required_fields.insert("posting_date".to_string());
122        required_fields.insert("account_code".to_string());
123
124        Self {
125            global_rate: 0.01,
126            field_rates: HashMap::new(),
127            required_fields,
128            strategy: MissingValueStrategy::default(),
129            track_statistics: true,
130        }
131    }
132}
133
134impl MissingValueConfig {
135    /// Creates a configuration with specific field rates.
136    pub fn with_field_rates(mut self, rates: HashMap<String, f64>) -> Self {
137        self.field_rates = rates;
138        self
139    }
140
141    /// Adds a required field.
142    pub fn with_required_field(mut self, field: &str) -> Self {
143        self.required_fields.insert(field.to_string());
144        self
145    }
146
147    /// Sets the strategy.
148    pub fn with_strategy(mut self, strategy: MissingValueStrategy) -> Self {
149        self.strategy = strategy;
150        self
151    }
152
153    /// Gets the missing rate for a specific field.
154    pub fn get_rate(&self, field: &str) -> f64 {
155        if self.required_fields.contains(field) {
156            return 0.0;
157        }
158        *self.field_rates.get(field).unwrap_or(&self.global_rate)
159    }
160}
161
/// Counters describing the missing values injected so far.
#[derive(Debug, Clone, Default)]
pub struct MissingValueStats {
    /// Number of fields processed in total.
    pub total_fields: usize,
    /// Number of missing values injected in total.
    pub total_missing: usize,
    /// Number of missing values injected, keyed by field name.
    pub by_field: HashMap<String, usize>,
    /// Number of records that had at least one missing value.
    pub records_with_missing: usize,
    /// Number of records processed in total.
    pub total_records: usize,
}
176
177impl MissingValueStats {
178    /// Returns the overall missing rate.
179    pub fn overall_rate(&self) -> f64 {
180        if self.total_fields == 0 {
181            0.0
182        } else {
183            self.total_missing as f64 / self.total_fields as f64
184        }
185    }
186
187    /// Returns the rate for a specific field.
188    pub fn field_rate(&self, field: &str, total_records: usize) -> f64 {
189        if total_records == 0 {
190            return 0.0;
191        }
192        *self.by_field.get(field).unwrap_or(&0) as f64 / total_records as f64
193    }
194}
195
196/// Missing value injector.
197pub struct MissingValueInjector {
198    config: MissingValueConfig,
199    stats: MissingValueStats,
200}
201
202impl MissingValueInjector {
203    /// Creates a new missing value injector.
204    pub fn new(config: MissingValueConfig) -> Self {
205        Self {
206            config,
207            stats: MissingValueStats::default(),
208        }
209    }
210
211    /// Determines if a value should be made missing.
212    pub fn should_be_missing<R: Rng>(
213        &mut self,
214        field: &str,
215        value: Option<&str>,
216        context: &HashMap<String, String>,
217        rng: &mut R,
218    ) -> bool {
219        // Never make required fields missing
220        if self.config.required_fields.contains(field) {
221            return false;
222        }
223
224        let probability = self.calculate_probability(field, value, context);
225
226        if self.config.track_statistics {
227            self.stats.total_fields += 1;
228        }
229
230        let is_missing = rng.gen::<f64>() < probability;
231
232        if is_missing && self.config.track_statistics {
233            self.stats.total_missing += 1;
234            *self.stats.by_field.entry(field.to_string()).or_insert(0) += 1;
235        }
236
237        is_missing
238    }
239
240    /// Calculates the missing probability based on strategy.
241    fn calculate_probability(
242        &self,
243        field: &str,
244        value: Option<&str>,
245        context: &HashMap<String, String>,
246    ) -> f64 {
247        match &self.config.strategy {
248            MissingValueStrategy::MCAR { probability } => {
249                // Use field-specific rate if available
250                let base = self.config.get_rate(field);
251                if base > 0.0 {
252                    base
253                } else {
254                    *probability
255                }
256            }
257            MissingValueStrategy::MAR {
258                base_probability,
259                conditions,
260            } => {
261                let mut prob = *base_probability;
262
263                for condition in conditions {
264                    if let Some(field_value) = context.get(&condition.field) {
265                        if self.check_condition(&condition.condition_type, field_value) {
266                            prob *= condition.multiplier;
267                        }
268                    }
269                }
270
271                prob.min(1.0)
272            }
273            MissingValueStrategy::MNAR { value_patterns } => {
274                if let Some(val) = value {
275                    for pattern in value_patterns {
276                        if pattern.field == field
277                            && self.check_value_pattern(&pattern.pattern_type, val)
278                        {
279                            return pattern.probability;
280                        }
281                    }
282                }
283                self.config.get_rate(field)
284            }
285            MissingValueStrategy::Systematic {
286                field_groups,
287                probability,
288            } => {
289                // Check if field is in any group
290                for group in field_groups {
291                    if group.contains(&field.to_string()) {
292                        return *probability;
293                    }
294                }
295                self.config.get_rate(field)
296            }
297        }
298    }
299
300    /// Checks if a condition is met.
301    fn check_condition(&self, condition: &ConditionType, value: &str) -> bool {
302        match condition {
303            ConditionType::Equals(expected) => value == expected,
304            ConditionType::Contains(substring) => value.contains(substring),
305            ConditionType::IsEmpty => value.is_empty(),
306            ConditionType::Matches(pattern) => {
307                // Simple pattern matching (could use regex)
308                value.contains(pattern)
309            }
310            ConditionType::GreaterThan(threshold) => value
311                .parse::<f64>()
312                .map(|v| v > *threshold)
313                .unwrap_or(false),
314            ConditionType::LessThan(threshold) => value
315                .parse::<f64>()
316                .map(|v| v < *threshold)
317                .unwrap_or(false),
318        }
319    }
320
321    /// Checks if a value matches an MNAR pattern.
322    fn check_value_pattern(&self, pattern: &PatternType, value: &str) -> bool {
323        match pattern {
324            PatternType::HighValues { threshold } => value
325                .parse::<f64>()
326                .map(|v| v > *threshold)
327                .unwrap_or(false),
328            PatternType::LowValues { threshold } => value
329                .parse::<f64>()
330                .map(|v| v < *threshold)
331                .unwrap_or(false),
332            PatternType::ExtremeValues { low, high } => value
333                .parse::<f64>()
334                .map(|v| v < *low || v > *high)
335                .unwrap_or(false),
336            PatternType::SensitivePatterns { patterns } => {
337                patterns.iter().any(|p| value.contains(p))
338            }
339        }
340    }
341
342    /// Records that a record was processed.
343    pub fn record_processed(&mut self, had_missing: bool) {
344        if self.config.track_statistics {
345            self.stats.total_records += 1;
346            if had_missing {
347                self.stats.records_with_missing += 1;
348            }
349        }
350    }
351
352    /// Returns the statistics.
353    pub fn stats(&self) -> &MissingValueStats {
354        &self.stats
355    }
356
357    /// Resets statistics.
358    pub fn reset_stats(&mut self) {
359        self.stats = MissingValueStats::default();
360    }
361}
362
/// Represents a missing value placeholder.
///
/// Derives `Eq` and `Hash` in addition to `PartialEq`, so placeholders can be
/// used as set members or map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum MissingValue {
    /// Standard null/None.
    Null,
    /// Empty string.
    Empty,
    /// Special marker string, carried verbatim.
    Marker(String),
    /// NA string.
    NA,
    /// Dash placeholder.
    Dash,
    /// Question mark.
    Unknown,
}
379
380impl MissingValue {
381    /// Converts to string representation.
382    pub fn to_string_value(&self) -> String {
383        match self {
384            MissingValue::Null => String::new(),
385            MissingValue::Empty => String::new(),
386            MissingValue::Marker(s) => s.clone(),
387            MissingValue::NA => "N/A".to_string(),
388            MissingValue::Dash => "-".to_string(),
389            MissingValue::Unknown => "?".to_string(),
390        }
391    }
392
393    /// Returns common missing value representations.
394    pub fn common_representations() -> Vec<Self> {
395        vec![
396            MissingValue::Null,
397            MissingValue::Empty,
398            MissingValue::NA,
399            MissingValue::Marker("NULL".to_string()),
400            MissingValue::Marker("NONE".to_string()),
401            MissingValue::Marker("#N/A".to_string()),
402            MissingValue::Dash,
403            MissingValue::Unknown,
404        ]
405    }
406}
407
408/// Selects a random missing value representation.
409pub fn random_missing_representation<R: Rng>(rng: &mut R) -> MissingValue {
410    let representations = MissingValue::common_representations();
411    representations[rng.gen_range(0..representations.len())].clone()
412}
413
#[cfg(test)]
mod tests {
    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    /// Deterministic RNG shared by every test.
    fn seeded_rng() -> ChaCha8Rng {
        ChaCha8Rng::seed_from_u64(42)
    }

    /// Record context with no other field values.
    fn empty_context() -> HashMap<String, String> {
        HashMap::new()
    }

    #[test]
    fn test_mcar_strategy() {
        let mut injector = MissingValueInjector::new(MissingValueConfig {
            global_rate: 0.5, // High rate for testing
            strategy: MissingValueStrategy::MCAR { probability: 0.5 },
            ..Default::default()
        });
        let mut rng = seeded_rng();
        let context = empty_context();

        let missing_count = (0..1000)
            .filter(|_| injector.should_be_missing("description", Some("test"), &context, &mut rng))
            .count();

        // Should be roughly 50%
        assert!(missing_count > 400 && missing_count < 600);
    }

    #[test]
    fn test_required_fields() {
        let mut injector = MissingValueInjector::new(MissingValueConfig {
            global_rate: 1.0, // 100% rate
            ..Default::default()
        });
        let mut rng = seeded_rng();
        let context = empty_context();

        // Required field should never be missing
        let required_missing =
            injector.should_be_missing("document_number", Some("JE001"), &context, &mut rng);
        assert!(!required_missing);

        // Non-required field should be missing at 100% rate
        let optional_missing =
            injector.should_be_missing("description", Some("test"), &context, &mut rng);
        assert!(optional_missing);
    }

    #[test]
    fn test_field_specific_rates() {
        let field_rates: HashMap<String, f64> = vec![
            ("description".to_string(), 0.0),
            ("cost_center".to_string(), 1.0),
        ]
        .into_iter()
        .collect();

        let mut injector =
            MissingValueInjector::new(MissingValueConfig::default().with_field_rates(field_rates));
        let mut rng = seeded_rng();
        let context = empty_context();

        // Description should never be missing (0% rate)
        let description_missing =
            injector.should_be_missing("description", Some("test"), &context, &mut rng);
        assert!(!description_missing);

        // Cost center should always be missing (100% rate)
        let cost_center_missing =
            injector.should_be_missing("cost_center", Some("CC001"), &context, &mut rng);
        assert!(cost_center_missing);
    }

    #[test]
    fn test_statistics() {
        let mut injector = MissingValueInjector::new(MissingValueConfig {
            global_rate: 0.5,
            track_statistics: true,
            ..Default::default()
        });
        let mut rng = seeded_rng();
        let context = empty_context();

        for _ in 0..100 {
            injector.should_be_missing("description", Some("test"), &context, &mut rng);
        }

        let stats = injector.stats();
        assert_eq!(stats.total_fields, 100);
        assert!(stats.total_missing > 0);
    }
}