// Source: datasynth_generators/data_quality/missing_values.rs

//! Missing value injection for data quality simulation.
//!
//! Simulates realistic missing data patterns including:
//! - Random missing values (MCAR - Missing Completely At Random)
//! - Conditional missing values (MAR - Missing At Random)
//! - Value-dependent missing values (MNAR - Missing Not At Random)
//! - Systematic missing values (configured groups of fields that go missing)
7
8use rand::Rng;
9use serde::{Deserialize, Serialize};
10use std::collections::{HashMap, HashSet};
11
12/// Strategy for missing value injection.
13#[derive(Debug, Clone)]
14pub enum MissingValueStrategy {
15    /// Missing Completely At Random - each value has equal probability of being missing.
16    MCAR {
17        /// Probability of a value being missing (0.0 - 1.0).
18        probability: f64,
19    },
20    /// Missing At Random - probability depends on other observed values.
21    MAR {
22        /// Base probability.
23        base_probability: f64,
24        /// Conditions that increase probability.
25        conditions: Vec<MissingCondition>,
26    },
27    /// Missing Not At Random - probability depends on the value itself.
28    MNAR {
29        /// Missing probability for specific value ranges/patterns.
30        value_patterns: Vec<MissingPattern>,
31    },
32    /// Systematic missing - entire fields missing for certain records.
33    Systematic {
34        /// Fields that are systematically missing together.
35        field_groups: Vec<Vec<String>>,
36        /// Probability of group being missing.
37        probability: f64,
38    },
39}
40
41impl Default for MissingValueStrategy {
42    fn default() -> Self {
43        MissingValueStrategy::MCAR { probability: 0.01 }
44    }
45}
46
47/// Condition for MAR missing values.
48#[derive(Debug, Clone)]
49pub struct MissingCondition {
50    /// Field to check.
51    pub field: String,
52    /// Condition type.
53    pub condition_type: ConditionType,
54    /// Probability multiplier when condition is met.
55    pub multiplier: f64,
56}
57
/// Kinds of test a [`MissingCondition`] can apply to a field value.
#[derive(Debug, Clone)]
pub enum ConditionType {
    /// The value is exactly the given string.
    Equals(String),
    /// The value contains the given substring.
    Contains(String),
    /// The value is the empty string.
    IsEmpty,
    /// The value matches the given pattern.
    Matches(String),
    /// The value, read as a number, exceeds the threshold.
    GreaterThan(f64),
    /// The value, read as a number, is below the threshold.
    LessThan(f64),
}
74
75/// Pattern for MNAR missing values.
76#[derive(Debug, Clone)]
77pub struct MissingPattern {
78    /// Description of the pattern.
79    pub description: String,
80    /// Field to check.
81    pub field: String,
82    /// Pattern type.
83    pub pattern_type: PatternType,
84    /// Probability when pattern matches.
85    pub probability: f64,
86}
87
/// Kinds of value pattern an MNAR rule can look for.
#[derive(Debug, Clone)]
pub enum PatternType {
    /// Values above `threshold` tend to be missing.
    HighValues { threshold: f64 },
    /// Values below `threshold` tend to be missing.
    LowValues { threshold: f64 },
    /// Outliers, i.e. values outside the `low`..`high` band, tend to be missing.
    ExtremeValues { low: f64, high: f64 },
    /// Values containing any of the listed sensitive patterns tend to be missing.
    SensitivePatterns { patterns: Vec<String> },
}
100
101/// Configuration for missing values by field.
102#[derive(Debug, Clone)]
103pub struct MissingValueConfig {
104    /// Global missing rate (default for fields not specified).
105    pub global_rate: f64,
106    /// Field-specific missing rates.
107    pub field_rates: HashMap<String, f64>,
108    /// Fields that should never be missing (required fields).
109    pub required_fields: HashSet<String>,
110    /// Strategy for missing value injection.
111    pub strategy: MissingValueStrategy,
112    /// Whether to track missing value statistics.
113    pub track_statistics: bool,
114}
115
116impl Default for MissingValueConfig {
117    fn default() -> Self {
118        let mut required_fields = HashSet::new();
119        // Common required fields in accounting data
120        required_fields.insert("document_number".to_string());
121        required_fields.insert("company_code".to_string());
122        required_fields.insert("posting_date".to_string());
123        required_fields.insert("account_code".to_string());
124
125        Self {
126            global_rate: 0.01,
127            field_rates: HashMap::new(),
128            required_fields,
129            strategy: MissingValueStrategy::default(),
130            track_statistics: true,
131        }
132    }
133}
134
135impl MissingValueConfig {
136    /// Creates a configuration with specific field rates.
137    pub fn with_field_rates(mut self, rates: HashMap<String, f64>) -> Self {
138        self.field_rates = rates;
139        self
140    }
141
142    /// Adds a required field.
143    pub fn with_required_field(mut self, field: &str) -> Self {
144        self.required_fields.insert(field.to_string());
145        self
146    }
147
148    /// Sets the strategy.
149    pub fn with_strategy(mut self, strategy: MissingValueStrategy) -> Self {
150        self.strategy = strategy;
151        self
152    }
153
154    /// Gets the missing rate for a specific field.
155    pub fn get_rate(&self, field: &str) -> f64 {
156        if self.required_fields.contains(field) {
157            return 0.0;
158        }
159        *self.field_rates.get(field).unwrap_or(&self.global_rate)
160    }
161}
162
163/// Statistics about missing values.
164#[derive(Debug, Clone, Default, Serialize, Deserialize)]
165pub struct MissingValueStats {
166    /// Total fields processed.
167    pub total_fields: usize,
168    /// Total missing values injected.
169    pub total_missing: usize,
170    /// Missing count by field.
171    pub by_field: HashMap<String, usize>,
172    /// Records with any missing value.
173    pub records_with_missing: usize,
174    /// Total records processed.
175    pub total_records: usize,
176}
177
178impl MissingValueStats {
179    /// Returns the overall missing rate.
180    pub fn overall_rate(&self) -> f64 {
181        if self.total_fields == 0 {
182            0.0
183        } else {
184            self.total_missing as f64 / self.total_fields as f64
185        }
186    }
187
188    /// Returns the rate for a specific field.
189    pub fn field_rate(&self, field: &str, total_records: usize) -> f64 {
190        if total_records == 0 {
191            return 0.0;
192        }
193        *self.by_field.get(field).unwrap_or(&0) as f64 / total_records as f64
194    }
195}
196
197/// Missing value injector.
198pub struct MissingValueInjector {
199    config: MissingValueConfig,
200    stats: MissingValueStats,
201}
202
203impl MissingValueInjector {
204    /// Creates a new missing value injector.
205    pub fn new(config: MissingValueConfig) -> Self {
206        Self {
207            config,
208            stats: MissingValueStats::default(),
209        }
210    }
211
212    /// Determines if a value should be made missing.
213    pub fn should_be_missing<R: Rng>(
214        &mut self,
215        field: &str,
216        value: Option<&str>,
217        context: &HashMap<String, String>,
218        rng: &mut R,
219    ) -> bool {
220        // Never make required fields missing
221        if self.config.required_fields.contains(field) {
222            return false;
223        }
224
225        let probability = self.calculate_probability(field, value, context);
226
227        if self.config.track_statistics {
228            self.stats.total_fields += 1;
229        }
230
231        let is_missing = rng.random::<f64>() < probability;
232
233        if is_missing && self.config.track_statistics {
234            self.stats.total_missing += 1;
235            *self.stats.by_field.entry(field.to_string()).or_insert(0) += 1;
236        }
237
238        is_missing
239    }
240
241    /// Calculates the missing probability based on strategy.
242    fn calculate_probability(
243        &self,
244        field: &str,
245        value: Option<&str>,
246        context: &HashMap<String, String>,
247    ) -> f64 {
248        match &self.config.strategy {
249            MissingValueStrategy::MCAR { probability } => {
250                // Use field-specific rate if available
251                let base = self.config.get_rate(field);
252                if base > 0.0 {
253                    base
254                } else {
255                    *probability
256                }
257            }
258            MissingValueStrategy::MAR {
259                base_probability,
260                conditions,
261            } => {
262                let mut prob = *base_probability;
263
264                for condition in conditions {
265                    if let Some(field_value) = context.get(&condition.field) {
266                        if self.check_condition(&condition.condition_type, field_value) {
267                            prob *= condition.multiplier;
268                        }
269                    }
270                }
271
272                prob.min(1.0)
273            }
274            MissingValueStrategy::MNAR { value_patterns } => {
275                if let Some(val) = value {
276                    for pattern in value_patterns {
277                        if pattern.field == field
278                            && self.check_value_pattern(&pattern.pattern_type, val)
279                        {
280                            return pattern.probability;
281                        }
282                    }
283                }
284                self.config.get_rate(field)
285            }
286            MissingValueStrategy::Systematic {
287                field_groups,
288                probability,
289            } => {
290                // Check if field is in any group
291                for group in field_groups {
292                    if group.contains(&field.to_string()) {
293                        return *probability;
294                    }
295                }
296                self.config.get_rate(field)
297            }
298        }
299    }
300
301    /// Checks if a condition is met.
302    fn check_condition(&self, condition: &ConditionType, value: &str) -> bool {
303        match condition {
304            ConditionType::Equals(expected) => value == expected,
305            ConditionType::Contains(substring) => value.contains(substring),
306            ConditionType::IsEmpty => value.is_empty(),
307            ConditionType::Matches(pattern) => {
308                // Simple pattern matching (could use regex)
309                value.contains(pattern)
310            }
311            ConditionType::GreaterThan(threshold) => value
312                .parse::<f64>()
313                .map(|v| v > *threshold)
314                .unwrap_or(false),
315            ConditionType::LessThan(threshold) => value
316                .parse::<f64>()
317                .map(|v| v < *threshold)
318                .unwrap_or(false),
319        }
320    }
321
322    /// Checks if a value matches an MNAR pattern.
323    fn check_value_pattern(&self, pattern: &PatternType, value: &str) -> bool {
324        match pattern {
325            PatternType::HighValues { threshold } => value
326                .parse::<f64>()
327                .map(|v| v > *threshold)
328                .unwrap_or(false),
329            PatternType::LowValues { threshold } => value
330                .parse::<f64>()
331                .map(|v| v < *threshold)
332                .unwrap_or(false),
333            PatternType::ExtremeValues { low, high } => value
334                .parse::<f64>()
335                .map(|v| v < *low || v > *high)
336                .unwrap_or(false),
337            PatternType::SensitivePatterns { patterns } => {
338                patterns.iter().any(|p| value.contains(p))
339            }
340        }
341    }
342
343    /// Records that a record was processed.
344    pub fn record_processed(&mut self, had_missing: bool) {
345        if self.config.track_statistics {
346            self.stats.total_records += 1;
347            if had_missing {
348                self.stats.records_with_missing += 1;
349            }
350        }
351    }
352
353    /// Returns the statistics.
354    pub fn stats(&self) -> &MissingValueStats {
355        &self.stats
356    }
357
358    /// Resets statistics.
359    pub fn reset_stats(&mut self) {
360        self.stats = MissingValueStats::default();
361    }
362}
363
/// The different ways a missing value can be written out.
#[derive(Debug, Clone, PartialEq)]
pub enum MissingValue {
    /// Standard null/None.
    Null,
    /// An empty string.
    Empty,
    /// A caller-supplied marker string.
    Marker(String),
    /// The NA string.
    NA,
    /// A dash placeholder.
    Dash,
    /// A question mark.
    Unknown,
}

impl MissingValue {
    /// Renders the placeholder as the text that stands in for the value.
    ///
    /// `Null` and `Empty` both render as the empty string.
    pub fn to_string_value(&self) -> String {
        let text = match self {
            Self::Null | Self::Empty => "",
            Self::Marker(marker) => marker.as_str(),
            Self::NA => "N/A",
            Self::Dash => "-",
            Self::Unknown => "?",
        };
        text.to_owned()
    }

    /// Lists the commonly seen representations of a missing value.
    pub fn common_representations() -> Vec<Self> {
        let marker = |text: &str| Self::Marker(text.to_owned());
        vec![
            Self::Null,
            Self::Empty,
            Self::NA,
            marker("NULL"),
            marker("NONE"),
            marker("#N/A"),
            Self::Dash,
            Self::Unknown,
        ]
    }
}
408
409/// Selects a random missing value representation.
410pub fn random_missing_representation<R: Rng>(rng: &mut R) -> MissingValue {
411    let representations = MissingValue::common_representations();
412    representations[rng.random_range(0..representations.len())].clone()
413}
414
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    /// A 50% rate should drop roughly half of 1000 values.
    #[test]
    fn test_mcar_strategy() {
        let mut injector = MissingValueInjector::new(MissingValueConfig {
            global_rate: 0.5, // High rate for testing
            strategy: MissingValueStrategy::MCAR { probability: 0.5 },
            ..Default::default()
        });
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let context = HashMap::new();

        let missing_count = (0..1000)
            .filter(|_| injector.should_be_missing("description", Some("test"), &context, &mut rng))
            .count();

        // Should be roughly 50%
        assert!(missing_count > 400 && missing_count < 600);
    }

    /// Required fields are protected even at a 100% global rate.
    #[test]
    fn test_required_fields() {
        let mut injector = MissingValueInjector::new(MissingValueConfig {
            global_rate: 1.0, // 100% rate
            ..Default::default()
        });
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let context = HashMap::new();

        // Required field should never be missing
        let required_missing =
            injector.should_be_missing("document_number", Some("JE001"), &context, &mut rng);
        assert!(!required_missing);

        // Non-required field should be missing at 100% rate
        let optional_missing =
            injector.should_be_missing("description", Some("test"), &context, &mut rng);
        assert!(optional_missing);
    }

    /// Per-field rates override the global rate.
    #[test]
    fn test_field_specific_rates() {
        let field_rates: HashMap<String, f64> = [("description", 0.0), ("cost_center", 1.0)]
            .iter()
            .map(|(name, rate)| (name.to_string(), *rate))
            .collect();

        let mut injector =
            MissingValueInjector::new(MissingValueConfig::default().with_field_rates(field_rates));
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let context = HashMap::new();

        // Description should never be missing (0% rate)
        let description_missing =
            injector.should_be_missing("description", Some("test"), &context, &mut rng);
        assert!(!description_missing);

        // Cost center should always be missing (100% rate)
        let cost_center_missing =
            injector.should_be_missing("cost_center", Some("CC001"), &context, &mut rng);
        assert!(cost_center_missing);
    }

    /// Every processed field is counted, and some are recorded as missing.
    #[test]
    fn test_statistics() {
        let mut injector = MissingValueInjector::new(MissingValueConfig {
            global_rate: 0.5,
            track_statistics: true,
            ..Default::default()
        });
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let context = HashMap::new();

        for _ in 0..100 {
            let _ = injector.should_be_missing("description", Some("test"), &context, &mut rng);
        }

        let stats = injector.stats();
        assert_eq!(stats.total_fields, 100);
        assert!(stats.total_missing > 0);
    }
}
501}