Skip to main content

datasynth_generators/data_quality/
injector.rs

1//! Main data quality injector coordinating all quality issues.
2//!
3//! This module provides a unified interface for introducing various
4//! data quality issues into synthetic data.
5
6use chrono::NaiveDate;
7use datasynth_core::utils::seeded_rng;
8use datasynth_core::CountryPack;
9use rand::Rng;
10use rand_chacha::ChaCha8Rng;
11use rust_decimal::Decimal;
12use serde::{Deserialize, Serialize};
13use std::collections::HashMap;
14
15use super::duplicates::{DuplicateConfig, DuplicateGenerator, DuplicateStats};
16use super::format_variations::{
17    AmountFormat, DateFormat, FormatVariationConfig, FormatVariationInjector, FormatVariationStats,
18};
19use super::missing_values::{MissingValueConfig, MissingValueInjector, MissingValueStats};
20use super::typos::{introduce_encoding_issue, EncodingIssue, TypoConfig, TypoGenerator, TypoStats};
21
22/// Configuration for the data quality injector.
23#[derive(Debug, Clone)]
24pub struct DataQualityConfig {
25    /// Enable missing value injection.
26    pub enable_missing_values: bool,
27    /// Missing value configuration.
28    pub missing_values: MissingValueConfig,
29    /// Enable format variations.
30    pub enable_format_variations: bool,
31    /// Format variation configuration.
32    pub format_variations: FormatVariationConfig,
33    /// Enable duplicates.
34    pub enable_duplicates: bool,
35    /// Duplicate configuration.
36    pub duplicates: DuplicateConfig,
37    /// Enable typos.
38    pub enable_typos: bool,
39    /// Typo configuration.
40    pub typos: TypoConfig,
41    /// Enable encoding issues.
42    pub enable_encoding_issues: bool,
43    /// Encoding issue rate.
44    pub encoding_issue_rate: f64,
45    /// Random seed for reproducibility.
46    pub seed: u64,
47    /// Track detailed statistics.
48    pub track_statistics: bool,
49}
50
51impl Default for DataQualityConfig {
52    fn default() -> Self {
53        Self {
54            enable_missing_values: true,
55            missing_values: MissingValueConfig::default(),
56            enable_format_variations: true,
57            format_variations: FormatVariationConfig::default(),
58            enable_duplicates: true,
59            duplicates: DuplicateConfig::default(),
60            enable_typos: true,
61            typos: TypoConfig::default(),
62            enable_encoding_issues: false, // Off by default (can cause issues)
63            encoding_issue_rate: 0.001,
64            seed: 42,
65            track_statistics: true,
66        }
67    }
68}
69
70impl DataQualityConfig {
71    /// Creates a minimal configuration (low rates).
72    pub fn minimal() -> Self {
73        Self {
74            missing_values: MissingValueConfig {
75                global_rate: 0.005,
76                ..Default::default()
77            },
78            format_variations: FormatVariationConfig {
79                date_variation_rate: 0.01,
80                amount_variation_rate: 0.01,
81                identifier_variation_rate: 0.005,
82                text_variation_rate: 0.01,
83                ..Default::default()
84            },
85            duplicates: DuplicateConfig {
86                duplicate_rate: 0.001,
87                ..Default::default()
88            },
89            typos: TypoConfig {
90                char_error_rate: 0.001,
91                ..Default::default()
92            },
93            ..Default::default()
94        }
95    }
96
97    /// Creates a high-variation configuration (for stress testing).
98    pub fn high_variation() -> Self {
99        Self {
100            missing_values: MissingValueConfig {
101                global_rate: 0.05,
102                ..Default::default()
103            },
104            format_variations: FormatVariationConfig {
105                date_variation_rate: 0.2,
106                amount_variation_rate: 0.1,
107                identifier_variation_rate: 0.1,
108                text_variation_rate: 0.2,
109                ..Default::default()
110            },
111            duplicates: DuplicateConfig {
112                duplicate_rate: 0.02,
113                ..Default::default()
114            },
115            typos: TypoConfig {
116                char_error_rate: 0.02,
117                ..Default::default()
118            },
119            enable_encoding_issues: true,
120            encoding_issue_rate: 0.01,
121            ..Default::default()
122        }
123    }
124}
125
126/// Combined statistics for all data quality issues.
127#[derive(Debug, Clone, Default, Serialize, Deserialize)]
128pub struct DataQualityStats {
129    /// Missing value statistics.
130    pub missing_values: MissingValueStats,
131    /// Format variation statistics.
132    pub format_variations: FormatVariationStats,
133    /// Duplicate statistics.
134    pub duplicates: DuplicateStats,
135    /// Typo statistics.
136    pub typos: TypoStats,
137    /// Encoding issues injected.
138    pub encoding_issues: usize,
139    /// Total records processed.
140    pub total_records: usize,
141    /// Records with any quality issue.
142    pub records_with_issues: usize,
143}
144
145impl DataQualityStats {
146    /// Returns the overall issue rate.
147    pub fn overall_issue_rate(&self) -> f64 {
148        if self.total_records == 0 {
149            0.0
150        } else {
151            self.records_with_issues as f64 / self.total_records as f64
152        }
153    }
154
155    /// Returns a summary of issues.
156    pub fn summary(&self) -> HashMap<String, usize> {
157        let mut summary = HashMap::new();
158        summary.insert(
159            "missing_values".to_string(),
160            self.missing_values.total_missing,
161        );
162        summary.insert(
163            "format_variations".to_string(),
164            self.format_variations.date_variations
165                + self.format_variations.amount_variations
166                + self.format_variations.identifier_variations
167                + self.format_variations.text_variations,
168        );
169        summary.insert("duplicates".to_string(), self.duplicates.total_duplicates);
170        summary.insert("typos".to_string(), self.typos.total_typos);
171        summary.insert("encoding_issues".to_string(), self.encoding_issues);
172        summary
173    }
174}
175
176/// A data quality issue record.
177#[derive(Debug, Clone)]
178pub struct QualityIssue {
179    /// Unique issue identifier.
180    pub issue_id: String,
181    /// Type of issue.
182    pub issue_type: QualityIssueType,
183    /// Record identifier.
184    pub record_id: String,
185    /// Field affected.
186    pub field: Option<String>,
187    /// Original value (if available).
188    pub original_value: Option<String>,
189    /// Modified value (if available).
190    pub modified_value: Option<String>,
191    /// Description.
192    pub description: String,
193}
194
/// Category of an injected data quality defect.
///
/// The enum is field-less, so it implements `Eq` and `Hash` in addition to
/// `PartialEq`; this lets callers use it as a `HashMap`/`HashSet` key, for
/// example to count issues per category.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum QualityIssueType {
    /// A value was removed from a field.
    MissingValue,
    /// A date was rendered in a non-standard format.
    DateFormatVariation,
    /// An amount was rendered in a non-standard format.
    AmountFormatVariation,
    /// An identifier was rendered in a non-standard format.
    IdentifierFormatVariation,
    /// Free text was rendered in a non-standard format.
    TextFormatVariation,
    /// Exact duplicate.
    ExactDuplicate,
    /// Near duplicate.
    NearDuplicate,
    /// Fuzzy duplicate.
    FuzzyDuplicate,
    /// One or more typos were introduced into a text value.
    Typo,
    /// A text value was corrupted by an encoding issue.
    EncodingIssue,
}
219
220/// Main data quality injector.
221pub struct DataQualityInjector {
222    config: DataQualityConfig,
223    rng: ChaCha8Rng,
224    missing_value_injector: MissingValueInjector,
225    format_injector: FormatVariationInjector,
226    duplicate_generator: DuplicateGenerator,
227    typo_generator: TypoGenerator,
228    stats: DataQualityStats,
229    issues: Vec<QualityIssue>,
230    next_issue_id: u64,
231}
232
233impl DataQualityInjector {
234    /// Creates a new data quality injector.
235    pub fn new(config: DataQualityConfig) -> Self {
236        let rng = seeded_rng(config.seed, 0);
237        let missing_value_injector = MissingValueInjector::new(config.missing_values.clone());
238        let format_injector = FormatVariationInjector::new(config.format_variations.clone());
239        let duplicate_generator = DuplicateGenerator::new(config.duplicates.clone());
240        let typo_generator = TypoGenerator::new(config.typos.clone());
241
242        Self {
243            config,
244            rng,
245            missing_value_injector,
246            format_injector,
247            duplicate_generator,
248            typo_generator,
249            stats: DataQualityStats::default(),
250            issues: Vec::new(),
251            next_issue_id: 1,
252        }
253    }
254
255    /// Set the country pack for locale-aware format variation baselines.
256    ///
257    /// Propagates to the internal `FormatVariationInjector` so that date and
258    /// amount baselines reflect the country's locale conventions.
259    pub fn set_country_pack(&mut self, pack: CountryPack) {
260        self.format_injector.set_country_pack(pack);
261    }
262
263    /// Processes a text field, potentially introducing quality issues.
264    pub fn process_text_field(
265        &mut self,
266        field: &str,
267        value: &str,
268        record_id: &str,
269        context: &HashMap<String, String>,
270    ) -> Option<String> {
271        let mut result = value.to_string();
272        let mut had_issue = false;
273
274        // Check for missing value
275        if self.config.enable_missing_values
276            && self.missing_value_injector.should_be_missing(
277                field,
278                Some(value),
279                context,
280                &mut self.rng,
281            )
282        {
283            let issue_id = self.next_issue_id();
284            self.record_issue(QualityIssue {
285                issue_id,
286                issue_type: QualityIssueType::MissingValue,
287                record_id: record_id.to_string(),
288                field: Some(field.to_string()),
289                original_value: Some(value.to_string()),
290                modified_value: None,
291                description: format!("Field '{}' set to missing", field),
292            });
293            return None;
294        }
295
296        // Apply typos
297        if self.config.enable_typos && !self.typo_generator.is_protected(field) {
298            let with_typos = self.typo_generator.introduce_typos(&result, &mut self.rng);
299            if with_typos != result {
300                let issue_id = self.next_issue_id();
301                self.record_issue(QualityIssue {
302                    issue_id,
303                    issue_type: QualityIssueType::Typo,
304                    record_id: record_id.to_string(),
305                    field: Some(field.to_string()),
306                    original_value: Some(result.clone()),
307                    modified_value: Some(with_typos.clone()),
308                    description: format!("Typo introduced in field '{}'", field),
309                });
310                result = with_typos;
311                had_issue = true;
312            }
313        }
314
315        // Apply format variations
316        if self.config.enable_format_variations {
317            let varied = self.format_injector.vary_text(&result, &mut self.rng);
318            if varied != result {
319                let issue_id = self.next_issue_id();
320                self.record_issue(QualityIssue {
321                    issue_id,
322                    issue_type: QualityIssueType::TextFormatVariation,
323                    record_id: record_id.to_string(),
324                    field: Some(field.to_string()),
325                    original_value: Some(result.clone()),
326                    modified_value: Some(varied.clone()),
327                    description: format!("Format variation in field '{}'", field),
328                });
329                result = varied;
330                had_issue = true;
331            }
332        }
333
334        // Apply encoding issues
335        if self.config.enable_encoding_issues
336            && self.rng.gen::<f64>() < self.config.encoding_issue_rate
337        {
338            let issues = [
339                EncodingIssue::Mojibake,
340                EncodingIssue::MissingChars,
341                EncodingIssue::HTMLEntities,
342            ];
343            let issue = issues[self.rng.gen_range(0..issues.len())];
344            let with_encoding = introduce_encoding_issue(&result, issue, &mut self.rng);
345
346            if with_encoding != result {
347                let issue_id = self.next_issue_id();
348                self.record_issue(QualityIssue {
349                    issue_id,
350                    issue_type: QualityIssueType::EncodingIssue,
351                    record_id: record_id.to_string(),
352                    field: Some(field.to_string()),
353                    original_value: Some(result.clone()),
354                    modified_value: Some(with_encoding.clone()),
355                    description: format!("Encoding issue ({:?}) in field '{}'", issue, field),
356                });
357                result = with_encoding;
358                had_issue = true;
359                self.stats.encoding_issues += 1;
360            }
361        }
362
363        if had_issue {
364            self.stats.records_with_issues += 1;
365        }
366
367        Some(result)
368    }
369
370    /// Processes a date field, potentially introducing format variations.
371    pub fn process_date_field(
372        &mut self,
373        field: &str,
374        date: NaiveDate,
375        record_id: &str,
376        context: &HashMap<String, String>,
377    ) -> Option<String> {
378        // Check for missing value
379        if self.config.enable_missing_values
380            && self.missing_value_injector.should_be_missing(
381                field,
382                Some(&date.to_string()),
383                context,
384                &mut self.rng,
385            )
386        {
387            let issue_id = self.next_issue_id();
388            self.record_issue(QualityIssue {
389                issue_id,
390                issue_type: QualityIssueType::MissingValue,
391                record_id: record_id.to_string(),
392                field: Some(field.to_string()),
393                original_value: Some(date.to_string()),
394                modified_value: None,
395                description: format!("Date field '{}' set to missing", field),
396            });
397            return None;
398        }
399
400        // Apply format variations
401        if self.config.enable_format_variations {
402            let formatted = self.format_injector.vary_date(date, &mut self.rng);
403            let standard = DateFormat::ISO.format(date);
404
405            if formatted != standard {
406                let issue_id = self.next_issue_id();
407                self.record_issue(QualityIssue {
408                    issue_id,
409                    issue_type: QualityIssueType::DateFormatVariation,
410                    record_id: record_id.to_string(),
411                    field: Some(field.to_string()),
412                    original_value: Some(standard),
413                    modified_value: Some(formatted.clone()),
414                    description: format!("Date format variation in field '{}'", field),
415                });
416            }
417
418            return Some(formatted);
419        }
420
421        Some(DateFormat::ISO.format(date))
422    }
423
424    /// Processes an amount field, potentially introducing format variations.
425    pub fn process_amount_field(
426        &mut self,
427        field: &str,
428        amount: Decimal,
429        record_id: &str,
430        context: &HashMap<String, String>,
431    ) -> Option<String> {
432        // Check for missing value
433        if self.config.enable_missing_values
434            && self.missing_value_injector.should_be_missing(
435                field,
436                Some(&amount.to_string()),
437                context,
438                &mut self.rng,
439            )
440        {
441            let issue_id = self.next_issue_id();
442            self.record_issue(QualityIssue {
443                issue_id,
444                issue_type: QualityIssueType::MissingValue,
445                record_id: record_id.to_string(),
446                field: Some(field.to_string()),
447                original_value: Some(amount.to_string()),
448                modified_value: None,
449                description: format!("Amount field '{}' set to missing", field),
450            });
451            return None;
452        }
453
454        // Apply format variations
455        if self.config.enable_format_variations {
456            let formatted = self.format_injector.vary_amount(amount, &mut self.rng);
457            let standard = AmountFormat::Plain.format(amount);
458
459            if formatted != standard {
460                let issue_id = self.next_issue_id();
461                self.record_issue(QualityIssue {
462                    issue_id,
463                    issue_type: QualityIssueType::AmountFormatVariation,
464                    record_id: record_id.to_string(),
465                    field: Some(field.to_string()),
466                    original_value: Some(standard),
467                    modified_value: Some(formatted.clone()),
468                    description: format!("Amount format variation in field '{}'", field),
469                });
470            }
471
472            return Some(formatted);
473        }
474
475        Some(AmountFormat::Plain.format(amount))
476    }
477
478    /// Processes an identifier field, potentially introducing variations.
479    pub fn process_identifier_field(
480        &mut self,
481        field: &str,
482        id: &str,
483        record_id: &str,
484        context: &HashMap<String, String>,
485    ) -> Option<String> {
486        // Check for missing value (rare for identifiers)
487        if self.config.enable_missing_values
488            && self.missing_value_injector.should_be_missing(
489                field,
490                Some(id),
491                context,
492                &mut self.rng,
493            )
494        {
495            let issue_id = self.next_issue_id();
496            self.record_issue(QualityIssue {
497                issue_id,
498                issue_type: QualityIssueType::MissingValue,
499                record_id: record_id.to_string(),
500                field: Some(field.to_string()),
501                original_value: Some(id.to_string()),
502                modified_value: None,
503                description: format!("Identifier field '{}' set to missing", field),
504            });
505            return None;
506        }
507
508        // Apply format variations
509        if self.config.enable_format_variations {
510            let varied = self.format_injector.vary_identifier(id, &mut self.rng);
511
512            if varied != id {
513                let issue_id = self.next_issue_id();
514                self.record_issue(QualityIssue {
515                    issue_id,
516                    issue_type: QualityIssueType::IdentifierFormatVariation,
517                    record_id: record_id.to_string(),
518                    field: Some(field.to_string()),
519                    original_value: Some(id.to_string()),
520                    modified_value: Some(varied.clone()),
521                    description: format!("Identifier format variation in field '{}'", field),
522                });
523            }
524
525            return Some(varied);
526        }
527
528        Some(id.to_string())
529    }
530
531    /// Determines if a record should be duplicated.
532    pub fn should_duplicate(&mut self) -> bool {
533        self.config.enable_duplicates && self.duplicate_generator.should_duplicate(&mut self.rng)
534    }
535
536    /// Records a quality issue.
537    fn record_issue(&mut self, issue: QualityIssue) {
538        if self.config.track_statistics {
539            self.issues.push(issue);
540        }
541    }
542
543    /// Generates the next issue ID.
544    fn next_issue_id(&mut self) -> String {
545        let id = format!("QI{:08}", self.next_issue_id);
546        self.next_issue_id += 1;
547        id
548    }
549
550    /// Returns statistics.
551    pub fn stats(&self) -> &DataQualityStats {
552        &self.stats
553    }
554
555    /// Returns all recorded issues.
556    pub fn issues(&self) -> &[QualityIssue] {
557        &self.issues
558    }
559
560    /// Returns issues for a specific record.
561    pub fn issues_for_record(&self, record_id: &str) -> Vec<&QualityIssue> {
562        self.issues
563            .iter()
564            .filter(|i| i.record_id == record_id)
565            .collect()
566    }
567
568    /// Returns issues of a specific type.
569    pub fn issues_by_type(&self, issue_type: QualityIssueType) -> Vec<&QualityIssue> {
570        self.issues
571            .iter()
572            .filter(|i| i.issue_type == issue_type)
573            .collect()
574    }
575
576    /// Resets all statistics and issues.
577    pub fn reset(&mut self) {
578        self.stats = DataQualityStats::default();
579        self.issues.clear();
580        self.next_issue_id = 1;
581        self.missing_value_injector.reset_stats();
582        self.format_injector.reset_stats();
583        self.duplicate_generator.reset_stats();
584        self.typo_generator.reset_stats();
585    }
586
587    /// Updates aggregate statistics.
588    pub fn update_stats(&mut self) {
589        self.stats.missing_values = self.missing_value_injector.stats().clone();
590        self.stats.format_variations = self.format_injector.stats().clone();
591        self.stats.duplicates = self.duplicate_generator.stats().clone();
592        self.stats.typos = self.typo_generator.stats().clone();
593    }
594}
595
596/// Builder for DataQualityConfig.
597pub struct DataQualityConfigBuilder {
598    config: DataQualityConfig,
599}
600
601impl DataQualityConfigBuilder {
602    /// Creates a new builder with default configuration.
603    pub fn new() -> Self {
604        Self {
605            config: DataQualityConfig::default(),
606        }
607    }
608
609    /// Enables or disables missing values.
610    pub fn with_missing_values(mut self, enable: bool) -> Self {
611        self.config.enable_missing_values = enable;
612        self
613    }
614
615    /// Sets the global missing value rate.
616    pub fn with_missing_rate(mut self, rate: f64) -> Self {
617        self.config.missing_values.global_rate = rate;
618        self
619    }
620
621    /// Enables or disables format variations.
622    pub fn with_format_variations(mut self, enable: bool) -> Self {
623        self.config.enable_format_variations = enable;
624        self
625    }
626
627    /// Enables or disables duplicates.
628    pub fn with_duplicates(mut self, enable: bool) -> Self {
629        self.config.enable_duplicates = enable;
630        self
631    }
632
633    /// Sets the duplicate rate.
634    pub fn with_duplicate_rate(mut self, rate: f64) -> Self {
635        self.config.duplicates.duplicate_rate = rate;
636        self
637    }
638
639    /// Enables or disables typos.
640    pub fn with_typos(mut self, enable: bool) -> Self {
641        self.config.enable_typos = enable;
642        self
643    }
644
645    /// Sets the typo rate.
646    pub fn with_typo_rate(mut self, rate: f64) -> Self {
647        self.config.typos.char_error_rate = rate;
648        self
649    }
650
651    /// Enables or disables encoding issues.
652    pub fn with_encoding_issues(mut self, enable: bool) -> Self {
653        self.config.enable_encoding_issues = enable;
654        self
655    }
656
657    /// Sets the random seed.
658    pub fn with_seed(mut self, seed: u64) -> Self {
659        self.config.seed = seed;
660        self
661    }
662
663    /// Builds the configuration.
664    pub fn build(self) -> DataQualityConfig {
665        self.config
666    }
667}
668
669impl Default for DataQualityConfigBuilder {
670    fn default() -> Self {
671        Self::new()
672    }
673}
674
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use rust_decimal_macros::dec;

    /// Configuration with every injector switched off, so that each
    /// `process_*` call is a deterministic passthrough.
    fn passthrough_config() -> DataQualityConfig {
        DataQualityConfigBuilder::new()
            .with_missing_values(false)
            .with_format_variations(false)
            .with_duplicates(false)
            .with_typos(false)
            .with_encoding_issues(false)
            .with_seed(42)
            .build()
    }

    #[test]
    fn test_data_quality_injector_creation() {
        let config = DataQualityConfig::default();
        let injector = DataQualityInjector::new(config);

        assert_eq!(injector.stats().total_records, 0);
        assert_eq!(injector.stats().records_with_issues, 0);
        assert!(injector.issues().is_empty());
        assert!(injector.stats().overall_issue_rate().abs() < f64::EPSILON);
    }

    #[test]
    fn test_text_field_processing() {
        // Missing values are disabled so that `Some` does not depend on the
        // random stream produced by the seed.
        let config = DataQualityConfigBuilder::new()
            .with_missing_values(false)
            .with_typo_rate(0.5) // High rate for testing
            .with_seed(42)
            .build();

        let mut injector = DataQualityInjector::new(config);
        let context = HashMap::new();
        let input = "Test Entry Description";

        let result = injector.process_text_field("description", input, "JE001", &context);

        assert!(result.is_some());
        // Any change to the value must be reflected in the audit trail.
        if result.unwrap() != input {
            assert!(!injector.issues_for_record("JE001").is_empty());
        }
    }

    #[test]
    fn test_text_field_passthrough() {
        let mut injector = DataQualityInjector::new(passthrough_config());
        let context = HashMap::new();

        let result =
            injector.process_text_field("description", "Test Entry Description", "JE001", &context);

        assert_eq!(result, Some("Test Entry Description".to_string()));
        assert!(injector.issues().is_empty());
    }

    #[test]
    fn test_date_field_processing() {
        let config = DataQualityConfigBuilder::new()
            .with_missing_values(false)
            .with_format_variations(true)
            .with_seed(42)
            .build();

        let mut injector = DataQualityInjector::new(config);
        let context = HashMap::new();

        let date = NaiveDate::from_ymd_opt(2024, 1, 15).unwrap();
        let result = injector.process_date_field("posting_date", date, "JE001", &context);

        assert!(result.is_some());
    }

    #[test]
    fn test_date_field_passthrough() {
        let mut injector = DataQualityInjector::new(passthrough_config());
        let context = HashMap::new();

        let date = NaiveDate::from_ymd_opt(2024, 1, 15).unwrap();
        let result = injector.process_date_field("posting_date", date, "JE001", &context);

        assert_eq!(result, Some(DateFormat::ISO.format(date)));
        assert!(injector.issues().is_empty());
    }

    #[test]
    fn test_amount_field_processing() {
        let config = DataQualityConfigBuilder::new()
            .with_missing_values(false)
            .with_format_variations(true)
            .with_seed(42)
            .build();

        let mut injector = DataQualityInjector::new(config);
        let context = HashMap::new();

        let amount = dec!(1234.56);
        let result = injector.process_amount_field("debit_amount", amount, "JE001", &context);

        assert!(result.is_some());
    }

    #[test]
    fn test_amount_field_passthrough() {
        let mut injector = DataQualityInjector::new(passthrough_config());
        let context = HashMap::new();

        let amount = dec!(1234.56);
        let result = injector.process_amount_field("debit_amount", amount, "JE001", &context);

        assert_eq!(result, Some(AmountFormat::Plain.format(amount)));
        assert!(injector.issues().is_empty());
    }

    #[test]
    fn test_identifier_field_passthrough() {
        let mut injector = DataQualityInjector::new(passthrough_config());
        let context = HashMap::new();

        let result =
            injector.process_identifier_field("vendor_id", "VENDOR-001", "JE001", &context);

        assert_eq!(result, Some("VENDOR-001".to_string()));
        assert!(injector.issues().is_empty());
    }

    #[test]
    fn test_duplicates_disabled() {
        let mut injector = DataQualityInjector::new(passthrough_config());

        assert!(!injector.should_duplicate());
    }

    #[test]
    fn test_stats_rate_and_summary() {
        let empty = DataQualityStats::default();
        assert!(empty.overall_issue_rate().abs() < f64::EPSILON);

        let summary = empty.summary();
        assert_eq!(summary.len(), 5);
        for key in [
            "missing_values",
            "format_variations",
            "duplicates",
            "typos",
            "encoding_issues",
        ] {
            assert!(summary.contains_key(key));
        }

        let stats = DataQualityStats {
            total_records: 4,
            records_with_issues: 1,
            ..Default::default()
        };
        assert!((stats.overall_issue_rate() - 0.25).abs() < f64::EPSILON);
    }

    #[test]
    fn test_builder_applies_settings() {
        let config = DataQualityConfigBuilder::new()
            .with_typos(false)
            .with_duplicates(false)
            .with_encoding_issues(true)
            .with_missing_rate(0.25)
            .with_seed(7)
            .build();

        assert!(!config.enable_typos);
        assert!(!config.enable_duplicates);
        assert!(config.enable_encoding_issues);
        assert!((config.missing_values.global_rate - 0.25).abs() < f64::EPSILON);
        assert_eq!(config.seed, 7);
    }

    #[test]
    fn test_minimal_config() {
        let config = DataQualityConfig::minimal();
        assert!(config.missing_values.global_rate < 0.01);
        assert!(config.typos.char_error_rate < 0.01);
        assert!(config.duplicates.duplicate_rate < 0.01);
        assert!(!config.enable_encoding_issues);
    }

    #[test]
    fn test_high_variation_config() {
        let config = DataQualityConfig::high_variation();
        assert!(config.missing_values.global_rate > 0.01);
        assert!(config.enable_encoding_issues);
        assert!(config.encoding_issue_rate > 0.001);
    }
}