1use chrono::NaiveDate;
7use datasynth_core::utils::seeded_rng;
8use datasynth_core::CountryPack;
9use rand::Rng;
10use rand_chacha::ChaCha8Rng;
11use rust_decimal::Decimal;
12use serde::{Deserialize, Serialize};
13use std::collections::HashMap;
14
15use super::duplicates::{DuplicateConfig, DuplicateGenerator, DuplicateStats};
16use super::format_variations::{
17 AmountFormat, DateFormat, FormatVariationConfig, FormatVariationInjector, FormatVariationStats,
18};
19use super::missing_values::{MissingValueConfig, MissingValueInjector, MissingValueStats};
20use super::typos::{introduce_encoding_issue, EncodingIssue, TypoConfig, TypoGenerator, TypoStats};
21
22#[derive(Debug, Clone)]
24pub struct DataQualityConfig {
25 pub enable_missing_values: bool,
27 pub missing_values: MissingValueConfig,
29 pub enable_format_variations: bool,
31 pub format_variations: FormatVariationConfig,
33 pub enable_duplicates: bool,
35 pub duplicates: DuplicateConfig,
37 pub enable_typos: bool,
39 pub typos: TypoConfig,
41 pub enable_encoding_issues: bool,
43 pub encoding_issue_rate: f64,
45 pub seed: u64,
47 pub track_statistics: bool,
49}
50
51impl Default for DataQualityConfig {
52 fn default() -> Self {
53 Self {
54 enable_missing_values: true,
55 missing_values: MissingValueConfig::default(),
56 enable_format_variations: true,
57 format_variations: FormatVariationConfig::default(),
58 enable_duplicates: true,
59 duplicates: DuplicateConfig::default(),
60 enable_typos: true,
61 typos: TypoConfig::default(),
62 enable_encoding_issues: false, encoding_issue_rate: 0.001,
64 seed: 42,
65 track_statistics: true,
66 }
67 }
68}
69
70impl DataQualityConfig {
71 pub fn minimal() -> Self {
73 Self {
74 missing_values: MissingValueConfig {
75 global_rate: 0.005,
76 ..Default::default()
77 },
78 format_variations: FormatVariationConfig {
79 date_variation_rate: 0.01,
80 amount_variation_rate: 0.01,
81 identifier_variation_rate: 0.005,
82 text_variation_rate: 0.01,
83 ..Default::default()
84 },
85 duplicates: DuplicateConfig {
86 duplicate_rate: 0.001,
87 ..Default::default()
88 },
89 typos: TypoConfig {
90 char_error_rate: 0.001,
91 ..Default::default()
92 },
93 ..Default::default()
94 }
95 }
96
97 pub fn high_variation() -> Self {
99 Self {
100 missing_values: MissingValueConfig {
101 global_rate: 0.05,
102 ..Default::default()
103 },
104 format_variations: FormatVariationConfig {
105 date_variation_rate: 0.2,
106 amount_variation_rate: 0.1,
107 identifier_variation_rate: 0.1,
108 text_variation_rate: 0.2,
109 ..Default::default()
110 },
111 duplicates: DuplicateConfig {
112 duplicate_rate: 0.02,
113 ..Default::default()
114 },
115 typos: TypoConfig {
116 char_error_rate: 0.02,
117 ..Default::default()
118 },
119 enable_encoding_issues: true,
120 encoding_issue_rate: 0.01,
121 ..Default::default()
122 }
123 }
124}
125
126#[derive(Debug, Clone, Default, Serialize, Deserialize)]
128pub struct DataQualityStats {
129 pub missing_values: MissingValueStats,
131 pub format_variations: FormatVariationStats,
133 pub duplicates: DuplicateStats,
135 pub typos: TypoStats,
137 pub encoding_issues: usize,
139 pub total_records: usize,
141 pub records_with_issues: usize,
143}
144
145impl DataQualityStats {
146 pub fn overall_issue_rate(&self) -> f64 {
148 if self.total_records == 0 {
149 0.0
150 } else {
151 self.records_with_issues as f64 / self.total_records as f64
152 }
153 }
154
155 pub fn summary(&self) -> HashMap<String, usize> {
157 let mut summary = HashMap::new();
158 summary.insert(
159 "missing_values".to_string(),
160 self.missing_values.total_missing,
161 );
162 summary.insert(
163 "format_variations".to_string(),
164 self.format_variations.date_variations
165 + self.format_variations.amount_variations
166 + self.format_variations.identifier_variations
167 + self.format_variations.text_variations,
168 );
169 summary.insert("duplicates".to_string(), self.duplicates.total_duplicates);
170 summary.insert("typos".to_string(), self.typos.total_typos);
171 summary.insert("encoding_issues".to_string(), self.encoding_issues);
172 summary
173 }
174}
175
176#[derive(Debug, Clone)]
178pub struct QualityIssue {
179 pub issue_id: String,
181 pub issue_type: QualityIssueType,
183 pub record_id: String,
185 pub field: Option<String>,
187 pub original_value: Option<String>,
189 pub modified_value: Option<String>,
191 pub description: String,
193}
194
/// Category of a [`QualityIssue`].
///
/// Derives `Eq` and `Hash` in addition to `PartialEq` so the type can be used
/// as a `HashMap`/`HashSet` key, for example to group issues by category.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum QualityIssueType {
    /// A field value was replaced by a missing value.
    MissingValue,
    /// A date was rendered in a non-ISO format.
    DateFormatVariation,
    /// An amount was rendered in a non-plain format.
    AmountFormatVariation,
    /// An identifier was rendered differently from its input.
    IdentifierFormatVariation,
    /// A text value was rendered differently from its input.
    TextFormatVariation,
    /// Exact-duplicate category; not emitted by the code visible in this file.
    ExactDuplicate,
    /// Near-duplicate category; not emitted by the code visible in this file.
    NearDuplicate,
    /// Fuzzy-duplicate category; not emitted by the code visible in this file.
    FuzzyDuplicate,
    /// A typo was introduced into a text value.
    Typo,
    /// An encoding corruption was introduced into a text value.
    EncodingIssue,
}
219
220pub struct DataQualityInjector {
222 config: DataQualityConfig,
223 rng: ChaCha8Rng,
224 missing_value_injector: MissingValueInjector,
225 format_injector: FormatVariationInjector,
226 duplicate_generator: DuplicateGenerator,
227 typo_generator: TypoGenerator,
228 stats: DataQualityStats,
229 issues: Vec<QualityIssue>,
230 next_issue_id: u64,
231}
232
233impl DataQualityInjector {
234 pub fn new(config: DataQualityConfig) -> Self {
236 let rng = seeded_rng(config.seed, 0);
237 let missing_value_injector = MissingValueInjector::new(config.missing_values.clone());
238 let format_injector = FormatVariationInjector::new(config.format_variations.clone());
239 let duplicate_generator = DuplicateGenerator::new(config.duplicates.clone());
240 let typo_generator = TypoGenerator::new(config.typos.clone());
241
242 Self {
243 config,
244 rng,
245 missing_value_injector,
246 format_injector,
247 duplicate_generator,
248 typo_generator,
249 stats: DataQualityStats::default(),
250 issues: Vec::new(),
251 next_issue_id: 1,
252 }
253 }
254
255 pub fn set_country_pack(&mut self, pack: CountryPack) {
260 self.format_injector.set_country_pack(pack);
261 }
262
263 pub fn process_text_field(
265 &mut self,
266 field: &str,
267 value: &str,
268 record_id: &str,
269 context: &HashMap<String, String>,
270 ) -> Option<String> {
271 let mut result = value.to_string();
272 let mut had_issue = false;
273
274 if self.config.enable_missing_values
276 && self.missing_value_injector.should_be_missing(
277 field,
278 Some(value),
279 context,
280 &mut self.rng,
281 )
282 {
283 let issue_id = self.next_issue_id();
284 self.record_issue(QualityIssue {
285 issue_id,
286 issue_type: QualityIssueType::MissingValue,
287 record_id: record_id.to_string(),
288 field: Some(field.to_string()),
289 original_value: Some(value.to_string()),
290 modified_value: None,
291 description: format!("Field '{}' set to missing", field),
292 });
293 return None;
294 }
295
296 if self.config.enable_typos && !self.typo_generator.is_protected(field) {
298 let with_typos = self.typo_generator.introduce_typos(&result, &mut self.rng);
299 if with_typos != result {
300 let issue_id = self.next_issue_id();
301 self.record_issue(QualityIssue {
302 issue_id,
303 issue_type: QualityIssueType::Typo,
304 record_id: record_id.to_string(),
305 field: Some(field.to_string()),
306 original_value: Some(result.clone()),
307 modified_value: Some(with_typos.clone()),
308 description: format!("Typo introduced in field '{}'", field),
309 });
310 result = with_typos;
311 had_issue = true;
312 }
313 }
314
315 if self.config.enable_format_variations {
317 let varied = self.format_injector.vary_text(&result, &mut self.rng);
318 if varied != result {
319 let issue_id = self.next_issue_id();
320 self.record_issue(QualityIssue {
321 issue_id,
322 issue_type: QualityIssueType::TextFormatVariation,
323 record_id: record_id.to_string(),
324 field: Some(field.to_string()),
325 original_value: Some(result.clone()),
326 modified_value: Some(varied.clone()),
327 description: format!("Format variation in field '{}'", field),
328 });
329 result = varied;
330 had_issue = true;
331 }
332 }
333
334 if self.config.enable_encoding_issues
336 && self.rng.random::<f64>() < self.config.encoding_issue_rate
337 {
338 let issues = [
339 EncodingIssue::Mojibake,
340 EncodingIssue::MissingChars,
341 EncodingIssue::HTMLEntities,
342 ];
343 let issue = issues[self.rng.random_range(0..issues.len())];
344 let with_encoding = introduce_encoding_issue(&result, issue, &mut self.rng);
345
346 if with_encoding != result {
347 let issue_id = self.next_issue_id();
348 self.record_issue(QualityIssue {
349 issue_id,
350 issue_type: QualityIssueType::EncodingIssue,
351 record_id: record_id.to_string(),
352 field: Some(field.to_string()),
353 original_value: Some(result.clone()),
354 modified_value: Some(with_encoding.clone()),
355 description: format!("Encoding issue ({:?}) in field '{}'", issue, field),
356 });
357 result = with_encoding;
358 had_issue = true;
359 self.stats.encoding_issues += 1;
360 }
361 }
362
363 if had_issue {
364 self.stats.records_with_issues += 1;
365 }
366
367 Some(result)
368 }
369
370 pub fn process_date_field(
372 &mut self,
373 field: &str,
374 date: NaiveDate,
375 record_id: &str,
376 context: &HashMap<String, String>,
377 ) -> Option<String> {
378 if self.config.enable_missing_values
380 && self.missing_value_injector.should_be_missing(
381 field,
382 Some(&date.to_string()),
383 context,
384 &mut self.rng,
385 )
386 {
387 let issue_id = self.next_issue_id();
388 self.record_issue(QualityIssue {
389 issue_id,
390 issue_type: QualityIssueType::MissingValue,
391 record_id: record_id.to_string(),
392 field: Some(field.to_string()),
393 original_value: Some(date.to_string()),
394 modified_value: None,
395 description: format!("Date field '{}' set to missing", field),
396 });
397 return None;
398 }
399
400 if self.config.enable_format_variations {
402 let formatted = self.format_injector.vary_date(date, &mut self.rng);
403 let standard = DateFormat::ISO.format(date);
404
405 if formatted != standard {
406 let issue_id = self.next_issue_id();
407 self.record_issue(QualityIssue {
408 issue_id,
409 issue_type: QualityIssueType::DateFormatVariation,
410 record_id: record_id.to_string(),
411 field: Some(field.to_string()),
412 original_value: Some(standard),
413 modified_value: Some(formatted.clone()),
414 description: format!("Date format variation in field '{}'", field),
415 });
416 }
417
418 return Some(formatted);
419 }
420
421 Some(DateFormat::ISO.format(date))
422 }
423
424 pub fn process_amount_field(
426 &mut self,
427 field: &str,
428 amount: Decimal,
429 record_id: &str,
430 context: &HashMap<String, String>,
431 ) -> Option<String> {
432 if self.config.enable_missing_values
434 && self.missing_value_injector.should_be_missing(
435 field,
436 Some(&amount.to_string()),
437 context,
438 &mut self.rng,
439 )
440 {
441 let issue_id = self.next_issue_id();
442 self.record_issue(QualityIssue {
443 issue_id,
444 issue_type: QualityIssueType::MissingValue,
445 record_id: record_id.to_string(),
446 field: Some(field.to_string()),
447 original_value: Some(amount.to_string()),
448 modified_value: None,
449 description: format!("Amount field '{}' set to missing", field),
450 });
451 return None;
452 }
453
454 if self.config.enable_format_variations {
456 let formatted = self.format_injector.vary_amount(amount, &mut self.rng);
457 let standard = AmountFormat::Plain.format(amount);
458
459 if formatted != standard {
460 let issue_id = self.next_issue_id();
461 self.record_issue(QualityIssue {
462 issue_id,
463 issue_type: QualityIssueType::AmountFormatVariation,
464 record_id: record_id.to_string(),
465 field: Some(field.to_string()),
466 original_value: Some(standard),
467 modified_value: Some(formatted.clone()),
468 description: format!("Amount format variation in field '{}'", field),
469 });
470 }
471
472 return Some(formatted);
473 }
474
475 Some(AmountFormat::Plain.format(amount))
476 }
477
478 pub fn process_identifier_field(
480 &mut self,
481 field: &str,
482 id: &str,
483 record_id: &str,
484 context: &HashMap<String, String>,
485 ) -> Option<String> {
486 if self.config.enable_missing_values
488 && self.missing_value_injector.should_be_missing(
489 field,
490 Some(id),
491 context,
492 &mut self.rng,
493 )
494 {
495 let issue_id = self.next_issue_id();
496 self.record_issue(QualityIssue {
497 issue_id,
498 issue_type: QualityIssueType::MissingValue,
499 record_id: record_id.to_string(),
500 field: Some(field.to_string()),
501 original_value: Some(id.to_string()),
502 modified_value: None,
503 description: format!("Identifier field '{}' set to missing", field),
504 });
505 return None;
506 }
507
508 if self.config.enable_format_variations {
510 let varied = self.format_injector.vary_identifier(id, &mut self.rng);
511
512 if varied != id {
513 let issue_id = self.next_issue_id();
514 self.record_issue(QualityIssue {
515 issue_id,
516 issue_type: QualityIssueType::IdentifierFormatVariation,
517 record_id: record_id.to_string(),
518 field: Some(field.to_string()),
519 original_value: Some(id.to_string()),
520 modified_value: Some(varied.clone()),
521 description: format!("Identifier format variation in field '{}'", field),
522 });
523 }
524
525 return Some(varied);
526 }
527
528 Some(id.to_string())
529 }
530
531 pub fn should_duplicate(&mut self) -> bool {
533 self.config.enable_duplicates && self.duplicate_generator.should_duplicate(&mut self.rng)
534 }
535
536 fn record_issue(&mut self, issue: QualityIssue) {
538 if self.config.track_statistics {
539 self.issues.push(issue);
540 }
541 }
542
543 fn next_issue_id(&mut self) -> String {
545 let id = format!("QI{:08}", self.next_issue_id);
546 self.next_issue_id += 1;
547 id
548 }
549
550 pub fn stats(&self) -> &DataQualityStats {
552 &self.stats
553 }
554
555 pub fn issues(&self) -> &[QualityIssue] {
557 &self.issues
558 }
559
560 pub fn issues_for_record(&self, record_id: &str) -> Vec<&QualityIssue> {
562 self.issues
563 .iter()
564 .filter(|i| i.record_id == record_id)
565 .collect()
566 }
567
568 pub fn issues_by_type(&self, issue_type: QualityIssueType) -> Vec<&QualityIssue> {
570 self.issues
571 .iter()
572 .filter(|i| i.issue_type == issue_type)
573 .collect()
574 }
575
576 pub fn reset(&mut self) {
578 self.stats = DataQualityStats::default();
579 self.issues.clear();
580 self.next_issue_id = 1;
581 self.missing_value_injector.reset_stats();
582 self.format_injector.reset_stats();
583 self.duplicate_generator.reset_stats();
584 self.typo_generator.reset_stats();
585 }
586
587 pub fn update_stats(&mut self) {
589 self.stats.missing_values = self.missing_value_injector.stats().clone();
590 self.stats.format_variations = self.format_injector.stats().clone();
591 self.stats.duplicates = self.duplicate_generator.stats().clone();
592 self.stats.typos = self.typo_generator.stats().clone();
593 }
594}
595
596pub struct DataQualityConfigBuilder {
598 config: DataQualityConfig,
599}
600
601impl DataQualityConfigBuilder {
602 pub fn new() -> Self {
604 Self {
605 config: DataQualityConfig::default(),
606 }
607 }
608
609 pub fn with_missing_values(mut self, enable: bool) -> Self {
611 self.config.enable_missing_values = enable;
612 self
613 }
614
615 pub fn with_missing_rate(mut self, rate: f64) -> Self {
617 self.config.missing_values.global_rate = rate;
618 self
619 }
620
621 pub fn with_format_variations(mut self, enable: bool) -> Self {
623 self.config.enable_format_variations = enable;
624 self
625 }
626
627 pub fn with_duplicates(mut self, enable: bool) -> Self {
629 self.config.enable_duplicates = enable;
630 self
631 }
632
633 pub fn with_duplicate_rate(mut self, rate: f64) -> Self {
635 self.config.duplicates.duplicate_rate = rate;
636 self
637 }
638
639 pub fn with_typos(mut self, enable: bool) -> Self {
641 self.config.enable_typos = enable;
642 self
643 }
644
645 pub fn with_typo_rate(mut self, rate: f64) -> Self {
647 self.config.typos.char_error_rate = rate;
648 self
649 }
650
651 pub fn with_encoding_issues(mut self, enable: bool) -> Self {
653 self.config.enable_encoding_issues = enable;
654 self
655 }
656
657 pub fn with_seed(mut self, seed: u64) -> Self {
659 self.config.seed = seed;
660 self
661 }
662
663 pub fn build(self) -> DataQualityConfig {
665 self.config
666 }
667}
668
669impl Default for DataQualityConfigBuilder {
670 fn default() -> Self {
671 Self::new()
672 }
673}
674
#[cfg(test)]
// Tests unwrap known-good fixtures; a panic there is a test failure by design.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use rust_decimal_macros::dec;

    #[test]
    fn test_data_quality_injector_creation() {
        let config = DataQualityConfig::default();
        let injector = DataQualityInjector::new(config);

        assert_eq!(injector.stats().total_records, 0);
        assert!(injector.issues().is_empty());
    }

    #[test]
    fn test_text_field_processing() {
        // Missing values are disabled so that `Some` is guaranteed by
        // construction rather than by the particular seed.
        let config = DataQualityConfigBuilder::new()
            .with_missing_values(false)
            .with_typo_rate(0.5)
            .with_seed(42)
            .build();

        let mut injector = DataQualityInjector::new(config);
        let context = HashMap::new();

        let result =
            injector.process_text_field("description", "Test Entry Description", "JE001", &context);

        assert!(result.is_some());
    }

    #[test]
    fn test_date_field_processing() {
        let config = DataQualityConfigBuilder::new()
            .with_missing_values(false)
            .with_format_variations(true)
            .with_seed(42)
            .build();

        let mut injector = DataQualityInjector::new(config);
        let context = HashMap::new();

        let date = NaiveDate::from_ymd_opt(2024, 1, 15).unwrap();
        let result = injector.process_date_field("posting_date", date, "JE001", &context);

        assert!(result.is_some());
    }

    #[test]
    fn test_amount_field_processing() {
        let config = DataQualityConfigBuilder::new()
            .with_missing_values(false)
            .with_format_variations(true)
            .with_seed(42)
            .build();

        let mut injector = DataQualityInjector::new(config);
        let context = HashMap::new();

        let amount = dec!(1234.56);
        let result = injector.process_amount_field("debit_amount", amount, "JE001", &context);

        assert!(result.is_some());
    }

    #[test]
    fn test_disabled_injections_pass_values_through() {
        // With every stage switched off, values must come back unchanged and
        // nothing may be logged or counted.
        let config = DataQualityConfigBuilder::new()
            .with_missing_values(false)
            .with_typos(false)
            .with_format_variations(false)
            .with_encoding_issues(false)
            .build();

        let mut injector = DataQualityInjector::new(config);
        let context = HashMap::new();

        let text = injector.process_text_field("description", "Test Entry", "JE001", &context);
        assert_eq!(text.as_deref(), Some("Test Entry"));

        let id = injector.process_identifier_field("account", "ACC-001", "JE001", &context);
        assert_eq!(id.as_deref(), Some("ACC-001"));

        let date = NaiveDate::from_ymd_opt(2024, 1, 15).unwrap();
        let rendered = injector.process_date_field("posting_date", date, "JE001", &context);
        assert_eq!(rendered, Some(DateFormat::ISO.format(date)));

        assert!(injector.issues().is_empty());
        assert_eq!(injector.stats().records_with_issues, 0);
        assert_eq!(injector.stats().encoding_issues, 0);
    }

    #[test]
    fn test_minimal_config() {
        let config = DataQualityConfig::minimal();
        assert!(config.missing_values.global_rate < 0.01);
        assert!(config.typos.char_error_rate < 0.01);
    }

    #[test]
    fn test_high_variation_config() {
        let config = DataQualityConfig::high_variation();
        assert!(config.missing_values.global_rate > 0.01);
        assert!(config.enable_encoding_issues);
    }
}