datasynth_generators/data_quality/
format_variations.rs

1//! Format variations for data quality simulation.
2//!
3//! Simulates realistic format inconsistencies including:
4//! - Date formats (ISO, US, EU, various separators)
5//! - Amount formats (decimal separators, thousand separators, currency symbols)
6//! - Identifier formats (padding, prefixes, case variations)
7//! - Text formats (case, whitespace, encoding)
8
9use chrono::NaiveDate;
10use rand::Rng;
11use rust_decimal::Decimal;
12
13/// Date format variations.
14#[derive(Debug, Clone, Copy, PartialEq)]
15pub enum DateFormat {
16    /// ISO 8601: 2024-01-15
17    ISO,
18    /// US format: 01/15/2024
19    US,
20    /// US with dashes: 01-15-2024
21    USDash,
22    /// European: 15/01/2024
23    EU,
24    /// European with dashes: 15-01-2024
25    EUDash,
26    /// European with dots: 15.01.2024
27    EUDot,
28    /// Long format: January 15, 2024
29    Long,
30    /// Short year: 01/15/24
31    ShortYear,
32    /// Compact: 20240115
33    Compact,
34    /// Unix timestamp
35    Unix,
36    /// Excel serial number
37    ExcelSerial,
38}
39
40impl DateFormat {
41    /// Returns all date formats.
42    pub fn all() -> Vec<Self> {
43        vec![
44            DateFormat::ISO,
45            DateFormat::US,
46            DateFormat::USDash,
47            DateFormat::EU,
48            DateFormat::EUDash,
49            DateFormat::EUDot,
50            DateFormat::Long,
51            DateFormat::ShortYear,
52            DateFormat::Compact,
53        ]
54    }
55
56    /// Formats a date using this format.
57    pub fn format(&self, date: NaiveDate) -> String {
58        match self {
59            DateFormat::ISO => date.format("%Y-%m-%d").to_string(),
60            DateFormat::US => date.format("%m/%d/%Y").to_string(),
61            DateFormat::USDash => date.format("%m-%d-%Y").to_string(),
62            DateFormat::EU => date.format("%d/%m/%Y").to_string(),
63            DateFormat::EUDash => date.format("%d-%m-%Y").to_string(),
64            DateFormat::EUDot => date.format("%d.%m.%Y").to_string(),
65            DateFormat::Long => date.format("%B %d, %Y").to_string(),
66            DateFormat::ShortYear => date.format("%m/%d/%y").to_string(),
67            DateFormat::Compact => date.format("%Y%m%d").to_string(),
68            DateFormat::Unix => {
69                let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
70                let days = (date - epoch).num_days();
71                (days * 86400).to_string()
72            }
73            DateFormat::ExcelSerial => {
74                // Excel epoch is December 30, 1899
75                let epoch = NaiveDate::from_ymd_opt(1899, 12, 30).unwrap();
76                let days = (date - epoch).num_days();
77                days.to_string()
78            }
79        }
80    }
81}
82
83/// Amount format variations.
84#[derive(Debug, Clone, PartialEq)]
85pub enum AmountFormat {
86    /// Plain number: 1234.56
87    Plain,
88    /// With thousand separator (comma): 1,234.56
89    USComma,
90    /// European (dot thousand, comma decimal): 1.234,56
91    EUFormat,
92    /// Space thousand separator: 1 234.56
93    SpaceSeparator,
94    /// With currency prefix: $1,234.56
95    CurrencyPrefix(String),
96    /// With currency suffix: 1,234.56 USD
97    CurrencySuffix(String),
98    /// Accounting format (parentheses for negative): (1,234.56)
99    Accounting,
100    /// Scientific notation: 1.23456E+03
101    Scientific,
102    /// No decimal places: 1235
103    NoDecimals,
104    /// Four decimal places: 1234.5600
105    FourDecimals,
106}
107
108impl AmountFormat {
109    /// Returns common amount formats.
110    pub fn common() -> Vec<Self> {
111        vec![
112            AmountFormat::Plain,
113            AmountFormat::USComma,
114            AmountFormat::EUFormat,
115            AmountFormat::SpaceSeparator,
116            AmountFormat::CurrencyPrefix("$".to_string()),
117            AmountFormat::CurrencySuffix("USD".to_string()),
118            AmountFormat::Accounting,
119            AmountFormat::NoDecimals,
120        ]
121    }
122
123    /// Formats a decimal using this format.
124    pub fn format(&self, amount: Decimal) -> String {
125        let is_negative = amount < Decimal::ZERO;
126        let abs_amount = amount.abs();
127        let amount_f64: f64 = abs_amount.try_into().unwrap_or(0.0);
128
129        match self {
130            AmountFormat::Plain => {
131                if is_negative {
132                    format!("-{:.2}", amount_f64)
133                } else {
134                    format!("{:.2}", amount_f64)
135                }
136            }
137            AmountFormat::USComma => {
138                let formatted = format_with_thousands(amount_f64, ',', '.');
139                if is_negative {
140                    format!("-{}", formatted)
141                } else {
142                    formatted
143                }
144            }
145            AmountFormat::EUFormat => {
146                let formatted = format_with_thousands(amount_f64, '.', ',');
147                if is_negative {
148                    format!("-{}", formatted)
149                } else {
150                    formatted
151                }
152            }
153            AmountFormat::SpaceSeparator => {
154                let formatted = format_with_thousands(amount_f64, ' ', '.');
155                if is_negative {
156                    format!("-{}", formatted)
157                } else {
158                    formatted
159                }
160            }
161            AmountFormat::CurrencyPrefix(symbol) => {
162                let formatted = format_with_thousands(amount_f64, ',', '.');
163                if is_negative {
164                    format!("-{}{}", symbol, formatted)
165                } else {
166                    format!("{}{}", symbol, formatted)
167                }
168            }
169            AmountFormat::CurrencySuffix(code) => {
170                let formatted = format_with_thousands(amount_f64, ',', '.');
171                if is_negative {
172                    format!("-{} {}", formatted, code)
173                } else {
174                    format!("{} {}", formatted, code)
175                }
176            }
177            AmountFormat::Accounting => {
178                let formatted = format_with_thousands(amount_f64, ',', '.');
179                if is_negative {
180                    format!("({})", formatted)
181                } else {
182                    formatted
183                }
184            }
185            AmountFormat::Scientific => {
186                if is_negative {
187                    format!("-{:.5E}", amount_f64)
188                } else {
189                    format!("{:.5E}", amount_f64)
190                }
191            }
192            AmountFormat::NoDecimals => {
193                let rounded = amount_f64.round() as i64;
194                if is_negative {
195                    format!("-{}", rounded.abs())
196                } else {
197                    rounded.to_string()
198                }
199            }
200            AmountFormat::FourDecimals => {
201                if is_negative {
202                    format!("-{:.4}", amount_f64)
203                } else {
204                    format!("{:.4}", amount_f64)
205                }
206            }
207        }
208    }
209}
210
/// Formats the magnitude of `value` with thousand separators and two decimals.
///
/// The sign of `value` is ignored; callers add their own sign decoration.
/// The integer digits are grouped in threes with `thousand_sep`, followed by
/// `decimal_sep` and exactly two fractional digits.
///
/// The fraction is rounded to the nearest cent. When that rounding reaches a
/// full unit (for example `1.999`), the carry is propagated into the integer
/// part so the result is `2.00` rather than an invalid three-digit fraction.
fn format_with_thousands(value: f64, thousand_sep: char, decimal_sep: char) -> String {
    // Work on the magnitude so no integer `abs()` (which can overflow) is needed.
    let magnitude = value.abs();

    // `u128` holds integer parts well beyond the `i64` range; float-to-int
    // casts saturate and map NaN to zero.
    let mut whole = magnitude.trunc() as u128;
    let mut cents = (magnitude.fract() * 100.0).round() as u32;
    if cents >= 100 {
        // Rounding pushed the fraction up to a whole unit: carry it over.
        whole = whole.saturating_add(1);
        cents -= 100;
    }

    let digits = whole.to_string();
    let mut grouped = String::with_capacity(digits.len() + digits.len() / 3 + 3);
    for (idx, digit) in digits.chars().enumerate() {
        // A separator goes wherever the count of remaining digits is a multiple of three.
        if idx > 0 && (digits.len() - idx) % 3 == 0 {
            grouped.push(thousand_sep);
        }
        grouped.push(digit);
    }

    format!("{}{}{:02}", grouped, decimal_sep, cents)
}
229
/// Identifier format variations.
#[derive(Debug, Clone)]
pub enum IdentifierFormat {
    /// Identifier left exactly as given.
    Original,
    /// Converted to uppercase.
    Upper,
    /// Converted to lowercase.
    Lower,
    /// The given string prepended.
    WithPrefix(String),
    /// The given string appended.
    WithSuffix(String),
    /// Left-padded with `0` up to the given number of characters.
    ZeroPadded(usize),
    /// Left-padded with spaces up to the given number of characters.
    SpacePadded(usize),
    /// `separator` inserted after every `interval` characters.
    ///
    /// An `interval` of zero inserts no separators.
    WithSeparator { separator: char, interval: usize },
}

impl IdentifierFormat {
    /// Formats an identifier using this format.
    ///
    /// Padding widths are measured in characters; identifiers that already
    /// reach the requested width are returned unchanged and never truncated.
    pub fn format(&self, id: &str) -> String {
        match self {
            IdentifierFormat::Original => id.to_string(),
            IdentifierFormat::Upper => id.to_uppercase(),
            IdentifierFormat::Lower => id.to_lowercase(),
            IdentifierFormat::WithPrefix(prefix) => format!("{}{}", prefix, id),
            IdentifierFormat::WithSuffix(suffix) => format!("{}{}", id, suffix),
            // `format!` pads by character count and leaves longer input
            // untouched, so no separate (byte-based) length check is needed.
            IdentifierFormat::ZeroPadded(len) => format!("{:0>width$}", id, width = *len),
            IdentifierFormat::SpacePadded(len) => format!("{:>width$}", id, width = *len),
            IdentifierFormat::WithSeparator {
                separator,
                interval,
            } => {
                // A zero interval has no meaningful grouping and would make
                // the remainder below divide by zero.
                if *interval == 0 {
                    return id.to_string();
                }

                let mut result = String::with_capacity(id.len());
                for (i, c) in id.chars().enumerate() {
                    if i > 0 && i % *interval == 0 {
                        result.push(*separator);
                    }
                    result.push(c);
                }
                result
            }
        }
    }
}
290
/// Ways a free-text value can be re-rendered.
#[derive(Debug, Clone)]
pub enum TextFormat {
    /// Text left exactly as given.
    Original,
    /// Converted to uppercase.
    Upper,
    /// Converted to lowercase.
    Lower,
    /// Each whitespace-separated word capitalised, words joined by one space.
    Title,
    /// The given number of spaces prepended.
    LeadingWhitespace(usize),
    /// The given number of spaces appended.
    TrailingWhitespace(usize),
    /// Whitespace-separated words re-joined with two spaces.
    ExtraSpaces,
    /// Leading and trailing whitespace removed.
    Trimmed,
    /// Every space replaced by a non-breaking space (U+00A0).
    NonBreakingSpaces,
}

impl TextFormat {
    /// Renders `text` according to this format.
    pub fn format(&self, text: &str) -> String {
        match self {
            Self::Original => text.to_string(),
            Self::Upper => text.to_uppercase(),
            Self::Lower => text.to_lowercase(),
            Self::Title => {
                let mut words: Vec<String> = Vec::new();
                for word in text.split_whitespace() {
                    let mut rest = word.chars();
                    let capitalised = match rest.next() {
                        Some(initial) => {
                            // Uppercase the first character, lowercase the remainder.
                            let mut built: String = initial.to_uppercase().collect();
                            built.push_str(&rest.as_str().to_lowercase());
                            built
                        }
                        None => String::new(),
                    };
                    words.push(capitalised);
                }
                words.join(" ")
            }
            Self::LeadingWhitespace(n) => {
                let mut padded = " ".repeat(*n);
                padded.push_str(text);
                padded
            }
            Self::TrailingWhitespace(n) => {
                let mut padded = String::with_capacity(text.len() + *n);
                padded.push_str(text);
                padded.push_str(&" ".repeat(*n));
                padded
            }
            Self::ExtraSpaces => {
                let words: Vec<&str> = text.split_whitespace().collect();
                words.join("  ")
            }
            Self::Trimmed => text.trim().to_string(),
            Self::NonBreakingSpaces => text.replace(' ', "\u{00A0}"),
        }
    }
}
347
348/// Configuration for format variations.
349#[derive(Debug, Clone)]
350pub struct FormatVariationConfig {
351    /// Probability of applying date format variation.
352    pub date_variation_rate: f64,
353    /// Probability of applying amount format variation.
354    pub amount_variation_rate: f64,
355    /// Probability of applying identifier format variation.
356    pub identifier_variation_rate: f64,
357    /// Probability of applying text format variation.
358    pub text_variation_rate: f64,
359    /// Allowed date formats.
360    pub allowed_date_formats: Vec<DateFormat>,
361    /// Allowed amount formats.
362    pub allowed_amount_formats: Vec<AmountFormat>,
363}
364
365impl Default for FormatVariationConfig {
366    fn default() -> Self {
367        Self {
368            date_variation_rate: 0.05,
369            amount_variation_rate: 0.03,
370            identifier_variation_rate: 0.02,
371            text_variation_rate: 0.05,
372            allowed_date_formats: DateFormat::all(),
373            allowed_amount_formats: AmountFormat::common(),
374        }
375    }
376}
377
378/// Format variation injector.
379pub struct FormatVariationInjector {
380    config: FormatVariationConfig,
381    stats: FormatVariationStats,
382}
383
/// Statistics for format variations.
///
/// All counters start at zero (see the derived `Default`).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct FormatVariationStats {
    /// Number of dates that received a format variation.
    pub date_variations: usize,
    /// Number of amounts that received a format variation.
    pub amount_variations: usize,
    /// Number of identifiers that received a format variation.
    pub identifier_variations: usize,
    /// Number of text values that received a format variation.
    pub text_variations: usize,
    /// Number of values processed overall, varied or not.
    pub total_processed: usize,
}
393
394impl FormatVariationInjector {
395    /// Creates a new format variation injector.
396    pub fn new(config: FormatVariationConfig) -> Self {
397        Self {
398            config,
399            stats: FormatVariationStats::default(),
400        }
401    }
402
403    /// Potentially applies a date format variation.
404    pub fn vary_date<R: Rng>(&mut self, date: NaiveDate, rng: &mut R) -> String {
405        self.stats.total_processed += 1;
406
407        if rng.gen::<f64>() < self.config.date_variation_rate {
408            self.stats.date_variations += 1;
409            let format = &self.config.allowed_date_formats
410                [rng.gen_range(0..self.config.allowed_date_formats.len())];
411            format.format(date)
412        } else {
413            DateFormat::ISO.format(date)
414        }
415    }
416
417    /// Potentially applies an amount format variation.
418    pub fn vary_amount<R: Rng>(&mut self, amount: Decimal, rng: &mut R) -> String {
419        self.stats.total_processed += 1;
420
421        if rng.gen::<f64>() < self.config.amount_variation_rate {
422            self.stats.amount_variations += 1;
423            let format = &self.config.allowed_amount_formats
424                [rng.gen_range(0..self.config.allowed_amount_formats.len())];
425            format.format(amount)
426        } else {
427            AmountFormat::Plain.format(amount)
428        }
429    }
430
431    /// Potentially applies an identifier format variation.
432    pub fn vary_identifier<R: Rng>(&mut self, id: &str, rng: &mut R) -> String {
433        self.stats.total_processed += 1;
434
435        if rng.gen::<f64>() < self.config.identifier_variation_rate {
436            self.stats.identifier_variations += 1;
437
438            let variations = [
439                IdentifierFormat::Upper,
440                IdentifierFormat::Lower,
441                IdentifierFormat::ZeroPadded(10),
442                IdentifierFormat::WithPrefix(" ".to_string()),
443                IdentifierFormat::WithSuffix(" ".to_string()),
444            ];
445
446            let format = &variations[rng.gen_range(0..variations.len())];
447            format.format(id)
448        } else {
449            id.to_string()
450        }
451    }
452
453    /// Potentially applies a text format variation.
454    pub fn vary_text<R: Rng>(&mut self, text: &str, rng: &mut R) -> String {
455        self.stats.total_processed += 1;
456
457        if rng.gen::<f64>() < self.config.text_variation_rate {
458            self.stats.text_variations += 1;
459
460            let variations = [
461                TextFormat::Upper,
462                TextFormat::Lower,
463                TextFormat::Title,
464                TextFormat::LeadingWhitespace(1),
465                TextFormat::TrailingWhitespace(1),
466                TextFormat::ExtraSpaces,
467            ];
468
469            let format = &variations[rng.gen_range(0..variations.len())];
470            format.format(text)
471        } else {
472            text.to_string()
473        }
474    }
475
476    /// Returns statistics.
477    pub fn stats(&self) -> &FormatVariationStats {
478        &self.stats
479    }
480
481    /// Resets statistics.
482    pub fn reset_stats(&mut self) {
483        self.stats = FormatVariationStats::default();
484    }
485}
486
#[cfg(test)]
mod tests {
    use super::*;
    use rust_decimal_macros::dec;

    /// Pattern-based date formats render a fixed date as expected.
    #[test]
    fn test_date_formats() {
        let date = NaiveDate::from_ymd_opt(2024, 1, 15).unwrap();
        let expectations = [
            (DateFormat::ISO, "2024-01-15"),
            (DateFormat::US, "01/15/2024"),
            (DateFormat::EU, "15/01/2024"),
            (DateFormat::Compact, "20240115"),
        ];

        for (format, expected) in expectations.iter() {
            assert_eq!(format.format(date), *expected);
        }
    }

    /// Positive amounts are grouped and rounded per format.
    #[test]
    fn test_amount_formats() {
        let amount = dec!(1234567.89);
        let expectations = [
            (AmountFormat::Plain, "1234567.89"),
            (AmountFormat::USComma, "1,234,567.89"),
            (AmountFormat::EUFormat, "1.234.567,89"),
            (AmountFormat::NoDecimals, "1234568"),
        ];

        for (format, expected) in expectations.iter() {
            assert_eq!(format.format(amount), *expected);
        }
    }

    /// Negative amounts get a minus sign or accounting parentheses.
    #[test]
    fn test_negative_amounts() {
        let amount = dec!(-1234.56);
        let expectations = [
            (AmountFormat::Plain, "-1234.56"),
            (AmountFormat::Accounting, "(1,234.56)"),
        ];

        for (format, expected) in expectations.iter() {
            assert_eq!(format.format(amount), *expected);
        }
    }

    /// Case conversion and zero padding of identifiers.
    #[test]
    fn test_identifier_formats() {
        let id = "abc123";
        let expectations = [
            (IdentifierFormat::Upper, "ABC123"),
            (IdentifierFormat::ZeroPadded(10), "0000abc123"),
        ];

        for (format, expected) in expectations.iter() {
            assert_eq!(format.format(id), *expected);
        }
    }

    /// Case conversion and spacing variations of free text.
    #[test]
    fn test_text_formats() {
        let text = "hello world";
        let expectations = [
            (TextFormat::Upper, "HELLO WORLD"),
            (TextFormat::Title, "Hello World"),
            (TextFormat::ExtraSpaces, "hello  world"),
        ];

        for (format, expected) in expectations.iter() {
            assert_eq!(format.format(text), *expected);
        }
    }

    /// A variation rate of 1.0 always varies and is reflected in the stats.
    #[test]
    fn test_format_injector() {
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;

        // Force the variation branch so the counter must move.
        let mut injector = FormatVariationInjector::new(FormatVariationConfig {
            date_variation_rate: 1.0,
            ..Default::default()
        });
        let mut rng = ChaCha8Rng::seed_from_u64(42);

        let formatted = injector.vary_date(NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(), &mut rng);

        // Formatted date should not be empty and stats should be updated
        assert!(!formatted.is_empty());
        assert_eq!(injector.stats().date_variations, 1);
    }
}
557}