1use chrono::NaiveDate;
7use rand::Rng;
8use rand::SeedableRng;
9use rand_chacha::ChaCha8Rng;
10use rust_decimal::Decimal;
11use std::collections::HashMap;
12
13use super::duplicates::{DuplicateConfig, DuplicateGenerator, DuplicateStats};
14use super::format_variations::{
15 AmountFormat, DateFormat, FormatVariationConfig, FormatVariationInjector, FormatVariationStats,
16};
17use super::missing_values::{MissingValueConfig, MissingValueInjector, MissingValueStats};
18use super::typos::{introduce_encoding_issue, EncodingIssue, TypoConfig, TypoGenerator, TypoStats};
19
20#[derive(Debug, Clone)]
22pub struct DataQualityConfig {
23 pub enable_missing_values: bool,
25 pub missing_values: MissingValueConfig,
27 pub enable_format_variations: bool,
29 pub format_variations: FormatVariationConfig,
31 pub enable_duplicates: bool,
33 pub duplicates: DuplicateConfig,
35 pub enable_typos: bool,
37 pub typos: TypoConfig,
39 pub enable_encoding_issues: bool,
41 pub encoding_issue_rate: f64,
43 pub seed: u64,
45 pub track_statistics: bool,
47}
48
49impl Default for DataQualityConfig {
50 fn default() -> Self {
51 Self {
52 enable_missing_values: true,
53 missing_values: MissingValueConfig::default(),
54 enable_format_variations: true,
55 format_variations: FormatVariationConfig::default(),
56 enable_duplicates: true,
57 duplicates: DuplicateConfig::default(),
58 enable_typos: true,
59 typos: TypoConfig::default(),
60 enable_encoding_issues: false, encoding_issue_rate: 0.001,
62 seed: 42,
63 track_statistics: true,
64 }
65 }
66}
67
68impl DataQualityConfig {
69 pub fn minimal() -> Self {
71 Self {
72 missing_values: MissingValueConfig {
73 global_rate: 0.005,
74 ..Default::default()
75 },
76 format_variations: FormatVariationConfig {
77 date_variation_rate: 0.01,
78 amount_variation_rate: 0.01,
79 identifier_variation_rate: 0.005,
80 text_variation_rate: 0.01,
81 ..Default::default()
82 },
83 duplicates: DuplicateConfig {
84 duplicate_rate: 0.001,
85 ..Default::default()
86 },
87 typos: TypoConfig {
88 char_error_rate: 0.001,
89 ..Default::default()
90 },
91 ..Default::default()
92 }
93 }
94
95 pub fn high_variation() -> Self {
97 Self {
98 missing_values: MissingValueConfig {
99 global_rate: 0.05,
100 ..Default::default()
101 },
102 format_variations: FormatVariationConfig {
103 date_variation_rate: 0.2,
104 amount_variation_rate: 0.1,
105 identifier_variation_rate: 0.1,
106 text_variation_rate: 0.2,
107 ..Default::default()
108 },
109 duplicates: DuplicateConfig {
110 duplicate_rate: 0.02,
111 ..Default::default()
112 },
113 typos: TypoConfig {
114 char_error_rate: 0.02,
115 ..Default::default()
116 },
117 enable_encoding_issues: true,
118 encoding_issue_rate: 0.01,
119 ..Default::default()
120 }
121 }
122}
123
124#[derive(Debug, Clone, Default)]
126pub struct DataQualityStats {
127 pub missing_values: MissingValueStats,
129 pub format_variations: FormatVariationStats,
131 pub duplicates: DuplicateStats,
133 pub typos: TypoStats,
135 pub encoding_issues: usize,
137 pub total_records: usize,
139 pub records_with_issues: usize,
141}
142
143impl DataQualityStats {
144 pub fn overall_issue_rate(&self) -> f64 {
146 if self.total_records == 0 {
147 0.0
148 } else {
149 self.records_with_issues as f64 / self.total_records as f64
150 }
151 }
152
153 pub fn summary(&self) -> HashMap<String, usize> {
155 let mut summary = HashMap::new();
156 summary.insert(
157 "missing_values".to_string(),
158 self.missing_values.total_missing,
159 );
160 summary.insert(
161 "format_variations".to_string(),
162 self.format_variations.date_variations
163 + self.format_variations.amount_variations
164 + self.format_variations.identifier_variations
165 + self.format_variations.text_variations,
166 );
167 summary.insert("duplicates".to_string(), self.duplicates.total_duplicates);
168 summary.insert("typos".to_string(), self.typos.total_typos);
169 summary.insert("encoding_issues".to_string(), self.encoding_issues);
170 summary
171 }
172}
173
174#[derive(Debug, Clone)]
176pub struct QualityIssue {
177 pub issue_id: String,
179 pub issue_type: QualityIssueType,
181 pub record_id: String,
183 pub field: Option<String>,
185 pub original_value: Option<String>,
187 pub modified_value: Option<String>,
189 pub description: String,
191}
192
/// Category of an injected data quality defect.
///
/// The enum is fieldless, so it is `Copy` and implements `Eq` and `Hash`;
/// values can be passed by value and used as keys in hashed collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum QualityIssueType {
    /// A field value was removed.
    MissingValue,
    /// A date was rendered in a non-standard format.
    DateFormatVariation,
    /// An amount was rendered in a non-standard format.
    AmountFormatVariation,
    /// An identifier was rendered in a non-standard format.
    IdentifierFormatVariation,
    /// A text value was re-formatted.
    TextFormatVariation,
    /// A record was duplicated exactly.
    ExactDuplicate,
    /// A record was duplicated with small differences.
    NearDuplicate,
    /// A record was duplicated with fuzzy differences.
    FuzzyDuplicate,
    /// A typo was introduced into a text value.
    Typo,
    /// An encoding defect was introduced into a text value.
    EncodingIssue,
}
217
218pub struct DataQualityInjector {
220 config: DataQualityConfig,
221 rng: ChaCha8Rng,
222 missing_value_injector: MissingValueInjector,
223 format_injector: FormatVariationInjector,
224 duplicate_generator: DuplicateGenerator,
225 typo_generator: TypoGenerator,
226 stats: DataQualityStats,
227 issues: Vec<QualityIssue>,
228 next_issue_id: u64,
229}
230
231impl DataQualityInjector {
232 pub fn new(config: DataQualityConfig) -> Self {
234 let rng = ChaCha8Rng::seed_from_u64(config.seed);
235 let missing_value_injector = MissingValueInjector::new(config.missing_values.clone());
236 let format_injector = FormatVariationInjector::new(config.format_variations.clone());
237 let duplicate_generator = DuplicateGenerator::new(config.duplicates.clone());
238 let typo_generator = TypoGenerator::new(config.typos.clone());
239
240 Self {
241 config,
242 rng,
243 missing_value_injector,
244 format_injector,
245 duplicate_generator,
246 typo_generator,
247 stats: DataQualityStats::default(),
248 issues: Vec::new(),
249 next_issue_id: 1,
250 }
251 }
252
253 pub fn process_text_field(
255 &mut self,
256 field: &str,
257 value: &str,
258 record_id: &str,
259 context: &HashMap<String, String>,
260 ) -> Option<String> {
261 let mut result = value.to_string();
262 let mut had_issue = false;
263
264 if self.config.enable_missing_values
266 && self.missing_value_injector.should_be_missing(
267 field,
268 Some(value),
269 context,
270 &mut self.rng,
271 )
272 {
273 let issue_id = self.next_issue_id();
274 self.record_issue(QualityIssue {
275 issue_id,
276 issue_type: QualityIssueType::MissingValue,
277 record_id: record_id.to_string(),
278 field: Some(field.to_string()),
279 original_value: Some(value.to_string()),
280 modified_value: None,
281 description: format!("Field '{}' set to missing", field),
282 });
283 return None;
284 }
285
286 if self.config.enable_typos && !self.typo_generator.is_protected(field) {
288 let with_typos = self.typo_generator.introduce_typos(&result, &mut self.rng);
289 if with_typos != result {
290 let issue_id = self.next_issue_id();
291 self.record_issue(QualityIssue {
292 issue_id,
293 issue_type: QualityIssueType::Typo,
294 record_id: record_id.to_string(),
295 field: Some(field.to_string()),
296 original_value: Some(result.clone()),
297 modified_value: Some(with_typos.clone()),
298 description: format!("Typo introduced in field '{}'", field),
299 });
300 result = with_typos;
301 had_issue = true;
302 }
303 }
304
305 if self.config.enable_format_variations {
307 let varied = self.format_injector.vary_text(&result, &mut self.rng);
308 if varied != result {
309 let issue_id = self.next_issue_id();
310 self.record_issue(QualityIssue {
311 issue_id,
312 issue_type: QualityIssueType::TextFormatVariation,
313 record_id: record_id.to_string(),
314 field: Some(field.to_string()),
315 original_value: Some(result.clone()),
316 modified_value: Some(varied.clone()),
317 description: format!("Format variation in field '{}'", field),
318 });
319 result = varied;
320 had_issue = true;
321 }
322 }
323
324 if self.config.enable_encoding_issues
326 && self.rng.gen::<f64>() < self.config.encoding_issue_rate
327 {
328 let issues = [
329 EncodingIssue::Mojibake,
330 EncodingIssue::MissingChars,
331 EncodingIssue::HTMLEntities,
332 ];
333 let issue = issues[self.rng.gen_range(0..issues.len())];
334 let with_encoding = introduce_encoding_issue(&result, issue, &mut self.rng);
335
336 if with_encoding != result {
337 let issue_id = self.next_issue_id();
338 self.record_issue(QualityIssue {
339 issue_id,
340 issue_type: QualityIssueType::EncodingIssue,
341 record_id: record_id.to_string(),
342 field: Some(field.to_string()),
343 original_value: Some(result.clone()),
344 modified_value: Some(with_encoding.clone()),
345 description: format!("Encoding issue ({:?}) in field '{}'", issue, field),
346 });
347 result = with_encoding;
348 had_issue = true;
349 self.stats.encoding_issues += 1;
350 }
351 }
352
353 if had_issue {
354 self.stats.records_with_issues += 1;
355 }
356
357 Some(result)
358 }
359
360 pub fn process_date_field(
362 &mut self,
363 field: &str,
364 date: NaiveDate,
365 record_id: &str,
366 context: &HashMap<String, String>,
367 ) -> Option<String> {
368 if self.config.enable_missing_values
370 && self.missing_value_injector.should_be_missing(
371 field,
372 Some(&date.to_string()),
373 context,
374 &mut self.rng,
375 )
376 {
377 let issue_id = self.next_issue_id();
378 self.record_issue(QualityIssue {
379 issue_id,
380 issue_type: QualityIssueType::MissingValue,
381 record_id: record_id.to_string(),
382 field: Some(field.to_string()),
383 original_value: Some(date.to_string()),
384 modified_value: None,
385 description: format!("Date field '{}' set to missing", field),
386 });
387 return None;
388 }
389
390 if self.config.enable_format_variations {
392 let formatted = self.format_injector.vary_date(date, &mut self.rng);
393 let standard = DateFormat::ISO.format(date);
394
395 if formatted != standard {
396 let issue_id = self.next_issue_id();
397 self.record_issue(QualityIssue {
398 issue_id,
399 issue_type: QualityIssueType::DateFormatVariation,
400 record_id: record_id.to_string(),
401 field: Some(field.to_string()),
402 original_value: Some(standard),
403 modified_value: Some(formatted.clone()),
404 description: format!("Date format variation in field '{}'", field),
405 });
406 }
407
408 return Some(formatted);
409 }
410
411 Some(DateFormat::ISO.format(date))
412 }
413
414 pub fn process_amount_field(
416 &mut self,
417 field: &str,
418 amount: Decimal,
419 record_id: &str,
420 context: &HashMap<String, String>,
421 ) -> Option<String> {
422 if self.config.enable_missing_values
424 && self.missing_value_injector.should_be_missing(
425 field,
426 Some(&amount.to_string()),
427 context,
428 &mut self.rng,
429 )
430 {
431 let issue_id = self.next_issue_id();
432 self.record_issue(QualityIssue {
433 issue_id,
434 issue_type: QualityIssueType::MissingValue,
435 record_id: record_id.to_string(),
436 field: Some(field.to_string()),
437 original_value: Some(amount.to_string()),
438 modified_value: None,
439 description: format!("Amount field '{}' set to missing", field),
440 });
441 return None;
442 }
443
444 if self.config.enable_format_variations {
446 let formatted = self.format_injector.vary_amount(amount, &mut self.rng);
447 let standard = AmountFormat::Plain.format(amount);
448
449 if formatted != standard {
450 let issue_id = self.next_issue_id();
451 self.record_issue(QualityIssue {
452 issue_id,
453 issue_type: QualityIssueType::AmountFormatVariation,
454 record_id: record_id.to_string(),
455 field: Some(field.to_string()),
456 original_value: Some(standard),
457 modified_value: Some(formatted.clone()),
458 description: format!("Amount format variation in field '{}'", field),
459 });
460 }
461
462 return Some(formatted);
463 }
464
465 Some(AmountFormat::Plain.format(amount))
466 }
467
468 pub fn process_identifier_field(
470 &mut self,
471 field: &str,
472 id: &str,
473 record_id: &str,
474 context: &HashMap<String, String>,
475 ) -> Option<String> {
476 if self.config.enable_missing_values
478 && self.missing_value_injector.should_be_missing(
479 field,
480 Some(id),
481 context,
482 &mut self.rng,
483 )
484 {
485 let issue_id = self.next_issue_id();
486 self.record_issue(QualityIssue {
487 issue_id,
488 issue_type: QualityIssueType::MissingValue,
489 record_id: record_id.to_string(),
490 field: Some(field.to_string()),
491 original_value: Some(id.to_string()),
492 modified_value: None,
493 description: format!("Identifier field '{}' set to missing", field),
494 });
495 return None;
496 }
497
498 if self.config.enable_format_variations {
500 let varied = self.format_injector.vary_identifier(id, &mut self.rng);
501
502 if varied != id {
503 let issue_id = self.next_issue_id();
504 self.record_issue(QualityIssue {
505 issue_id,
506 issue_type: QualityIssueType::IdentifierFormatVariation,
507 record_id: record_id.to_string(),
508 field: Some(field.to_string()),
509 original_value: Some(id.to_string()),
510 modified_value: Some(varied.clone()),
511 description: format!("Identifier format variation in field '{}'", field),
512 });
513 }
514
515 return Some(varied);
516 }
517
518 Some(id.to_string())
519 }
520
521 pub fn should_duplicate(&mut self) -> bool {
523 self.config.enable_duplicates && self.duplicate_generator.should_duplicate(&mut self.rng)
524 }
525
526 fn record_issue(&mut self, issue: QualityIssue) {
528 if self.config.track_statistics {
529 self.issues.push(issue);
530 }
531 }
532
533 fn next_issue_id(&mut self) -> String {
535 let id = format!("QI{:08}", self.next_issue_id);
536 self.next_issue_id += 1;
537 id
538 }
539
540 pub fn stats(&self) -> &DataQualityStats {
542 &self.stats
543 }
544
545 pub fn issues(&self) -> &[QualityIssue] {
547 &self.issues
548 }
549
550 pub fn issues_for_record(&self, record_id: &str) -> Vec<&QualityIssue> {
552 self.issues
553 .iter()
554 .filter(|i| i.record_id == record_id)
555 .collect()
556 }
557
558 pub fn issues_by_type(&self, issue_type: QualityIssueType) -> Vec<&QualityIssue> {
560 self.issues
561 .iter()
562 .filter(|i| i.issue_type == issue_type)
563 .collect()
564 }
565
566 pub fn reset(&mut self) {
568 self.stats = DataQualityStats::default();
569 self.issues.clear();
570 self.next_issue_id = 1;
571 self.missing_value_injector.reset_stats();
572 self.format_injector.reset_stats();
573 self.duplicate_generator.reset_stats();
574 self.typo_generator.reset_stats();
575 }
576
577 pub fn update_stats(&mut self) {
579 self.stats.missing_values = self.missing_value_injector.stats().clone();
580 self.stats.format_variations = self.format_injector.stats().clone();
581 self.stats.duplicates = self.duplicate_generator.stats().clone();
582 self.stats.typos = self.typo_generator.stats().clone();
583 }
584}
585
586pub struct DataQualityConfigBuilder {
588 config: DataQualityConfig,
589}
590
591impl DataQualityConfigBuilder {
592 pub fn new() -> Self {
594 Self {
595 config: DataQualityConfig::default(),
596 }
597 }
598
599 pub fn with_missing_values(mut self, enable: bool) -> Self {
601 self.config.enable_missing_values = enable;
602 self
603 }
604
605 pub fn with_missing_rate(mut self, rate: f64) -> Self {
607 self.config.missing_values.global_rate = rate;
608 self
609 }
610
611 pub fn with_format_variations(mut self, enable: bool) -> Self {
613 self.config.enable_format_variations = enable;
614 self
615 }
616
617 pub fn with_duplicates(mut self, enable: bool) -> Self {
619 self.config.enable_duplicates = enable;
620 self
621 }
622
623 pub fn with_duplicate_rate(mut self, rate: f64) -> Self {
625 self.config.duplicates.duplicate_rate = rate;
626 self
627 }
628
629 pub fn with_typos(mut self, enable: bool) -> Self {
631 self.config.enable_typos = enable;
632 self
633 }
634
635 pub fn with_typo_rate(mut self, rate: f64) -> Self {
637 self.config.typos.char_error_rate = rate;
638 self
639 }
640
641 pub fn with_encoding_issues(mut self, enable: bool) -> Self {
643 self.config.enable_encoding_issues = enable;
644 self
645 }
646
647 pub fn with_seed(mut self, seed: u64) -> Self {
649 self.config.seed = seed;
650 self
651 }
652
653 pub fn build(self) -> DataQualityConfig {
655 self.config
656 }
657}
658
659impl Default for DataQualityConfigBuilder {
660 fn default() -> Self {
661 Self::new()
662 }
663}
664
#[cfg(test)]
mod tests {
    use super::*;
    use rust_decimal_macros::dec;

    /// Finishes `builder` with the fixed seed 42 and wraps the resulting
    /// configuration in an injector.
    fn seeded_injector(builder: DataQualityConfigBuilder) -> DataQualityInjector {
        DataQualityInjector::new(builder.with_seed(42).build())
    }

    #[test]
    fn test_data_quality_injector_creation() {
        let injector = DataQualityInjector::new(DataQualityConfig::default());

        assert_eq!(injector.stats().total_records, 0);
    }

    #[test]
    fn test_text_field_processing() {
        // A high typo rate makes it likely that the typo stage is exercised.
        let mut injector = seeded_injector(DataQualityConfigBuilder::new().with_typo_rate(0.5));
        let context = HashMap::new();

        let processed =
            injector.process_text_field("description", "Test Entry Description", "JE001", &context);

        assert!(processed.is_some());
    }

    #[test]
    fn test_date_field_processing() {
        let mut injector =
            seeded_injector(DataQualityConfigBuilder::new().with_format_variations(true));
        let context = HashMap::new();
        let posting_date =
            NaiveDate::from_ymd_opt(2024, 1, 15).expect("2024-01-15 is a valid calendar date");

        let processed =
            injector.process_date_field("posting_date", posting_date, "JE001", &context);

        assert!(processed.is_some());
    }

    #[test]
    fn test_amount_field_processing() {
        let mut injector =
            seeded_injector(DataQualityConfigBuilder::new().with_format_variations(true));
        let context = HashMap::new();

        let processed =
            injector.process_amount_field("debit_amount", dec!(1234.56), "JE001", &context);

        assert!(processed.is_some());
    }

    #[test]
    fn test_minimal_config() {
        let minimal = DataQualityConfig::minimal();

        assert!(minimal.missing_values.global_rate < 0.01);
        assert!(minimal.typos.char_error_rate < 0.01);
    }

    #[test]
    fn test_high_variation_config() {
        let noisy = DataQualityConfig::high_variation();

        assert!(noisy.missing_values.global_rate > 0.01);
        assert!(noisy.enable_encoding_issues);
    }
}