datasynth_generators/data_quality/
typos.rs

//! Typo and character error injection for data quality simulation.
//!
//! Simulates realistic typing errors:
//! - Character substitution (nearby keys)
//! - Character transposition (adjacent swaps)
//! - Character insertion (stray nearby key) and doubling (double-typing)
//! - Character deletion (missed keys)
//! - Case errors, homophone swaps and OCR look-alike confusions
//! - Encoding issues (character corruption)
9
10use rand::Rng;
11use std::collections::HashMap;
12
/// Kind of error that can be injected into text.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TypoType {
    /// A character is replaced by a key next to it on the keyboard.
    Substitution,
    /// Two neighbouring characters swap places.
    Transposition,
    /// A stray extra character is added.
    Insertion,
    /// A character is dropped.
    Deletion,
    /// A character is typed twice in a row.
    DoubleChar,
    /// A character is given the wrong case.
    CaseError,
    /// A whole word is replaced by a sound-alike word.
    Homophone,
    /// A character is replaced by a look-alike, as OCR software might do.
    OCRError,
}

impl TypoType {
    /// Lists every variant, in declaration order.
    pub fn all() -> Vec<Self> {
        const EVERY_VARIANT: [TypoType; 8] = [
            TypoType::Substitution,
            TypoType::Transposition,
            TypoType::Insertion,
            TypoType::Deletion,
            TypoType::DoubleChar,
            TypoType::CaseError,
            TypoType::Homophone,
            TypoType::OCRError,
        ];
        EVERY_VARIANT.to_vec()
    }
}
49
50/// Configuration for typo generation.
51#[derive(Debug, Clone)]
52pub struct TypoConfig {
53    /// Overall typo rate (per character).
54    pub char_error_rate: f64,
55    /// Weights for each typo type.
56    pub type_weights: HashMap<TypoType, f64>,
57    /// Whether to preserve word boundaries.
58    pub preserve_word_boundaries: bool,
59    /// Maximum typos per word.
60    pub max_typos_per_word: usize,
61    /// Fields that should not have typos (identifiers, codes).
62    pub protected_fields: Vec<String>,
63}
64
65impl Default for TypoConfig {
66    fn default() -> Self {
67        let mut type_weights = HashMap::new();
68        type_weights.insert(TypoType::Substitution, 0.25);
69        type_weights.insert(TypoType::Transposition, 0.20);
70        type_weights.insert(TypoType::Insertion, 0.10);
71        type_weights.insert(TypoType::Deletion, 0.20);
72        type_weights.insert(TypoType::DoubleChar, 0.10);
73        type_weights.insert(TypoType::CaseError, 0.10);
74        type_weights.insert(TypoType::OCRError, 0.05);
75
76        Self {
77            char_error_rate: 0.005, // 0.5% per character
78            type_weights,
79            preserve_word_boundaries: true,
80            max_typos_per_word: 2,
81            protected_fields: vec![
82                "document_number".to_string(),
83                "account_code".to_string(),
84                "company_code".to_string(),
85                "vendor_id".to_string(),
86                "customer_id".to_string(),
87            ],
88        }
89    }
90}
91
/// QWERTY key adjacency table used to pick "fat-finger" replacement keys.
pub struct KeyboardLayout {
    /// Adjacent keys, indexed by lowercase letter or digit.
    nearby_keys: HashMap<char, Vec<char>>,
}

impl Default for KeyboardLayout {
    /// Same as [`KeyboardLayout::qwerty`].
    fn default() -> Self {
        Self::qwerty()
    }
}

impl KeyboardLayout {
    /// Builds the adjacency table for a QWERTY keyboard.
    ///
    /// Only the letters `a`-`z` and the digits `0`-`9` have entries.
    pub fn qwerty() -> Self {
        // Each entry is (key, neighbours); the neighbour order is the order
        // in which they end up in the lookup vector.
        const ADJACENCY: &[(char, &str)] = &[
            // Row 1: qwertyuiop
            ('q', "wa12"),
            ('w', "qeas23"),
            ('e', "wrsd34"),
            ('r', "etdf45"),
            ('t', "ryfg56"),
            ('y', "tugh67"),
            ('u', "yihj78"),
            ('i', "uojk89"),
            ('o', "ipkl90"),
            ('p', "ol0"),
            // Row 2: asdfghjkl
            ('a', "qwsz"),
            ('s', "awedzx"),
            ('d', "serfxc"),
            ('f', "drtgcv"),
            ('g', "ftyhvb"),
            ('h', "gyujbn"),
            ('j', "huiknm"),
            ('k', "jiolm"),
            ('l', "kop"),
            // Row 3: zxcvbnm
            ('z', "asx"),
            ('x', "zsdc"),
            ('c', "xdfv"),
            ('v', "cfgb"),
            ('b', "vghn"),
            ('n', "bhjm"),
            ('m', "njk"),
            // Numbers
            ('1', "2q"),
            ('2', "13qw"),
            ('3', "24we"),
            ('4', "35er"),
            ('5', "46rt"),
            ('6', "57ty"),
            ('7', "68yu"),
            ('8', "79ui"),
            ('9', "80io"),
            ('0', "9op"),
        ];

        let nearby_keys: HashMap<char, Vec<char>> = ADJACENCY
            .iter()
            .map(|&(key, neighbours)| (key, neighbours.chars().collect()))
            .collect();

        Self { nearby_keys }
    }

    /// Returns the keys adjacent to `c`.
    ///
    /// The lookup is done on the ASCII-lowercased character, so the result
    /// for an uppercase letter is the lowercase neighbour list. A character
    /// with no entry yields a one-element vector holding `c` itself.
    pub fn get_nearby(&self, c: char) -> Vec<char> {
        match self.nearby_keys.get(&c.to_ascii_lowercase()) {
            Some(neighbours) => neighbours.clone(),
            None => vec![c],
        }
    }
}
164
/// Table of characters that look alike and are commonly mixed up by OCR.
pub struct OCRConfusions {
    /// Look-alike characters, indexed by the (case-sensitive) source character.
    confusions: HashMap<char, Vec<char>>,
}

impl Default for OCRConfusions {
    /// Same as [`OCRConfusions::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl OCRConfusions {
    /// Builds the look-alike table.
    pub fn new() -> Self {
        // Each entry is (character, look-alikes); the look-alike order is the
        // order in which they end up in the lookup vector.
        const LOOKALIKES: &[(char, &str)] = &[
            ('0', "OoQD"),
            ('O', "0QDo"),
            ('o', "0Oae"),
            ('1', "lIi|7"),
            ('l', "1Ii|"),
            ('I', "1li|"),
            ('i', "1lIj"),
            ('5', "Ss"),
            ('S', "5s8"),
            ('s', "5Sz"),
            ('8', "B&S"),
            ('B', "8RD"),
            ('6', "Gb"),
            ('G', "6CO"),
            ('2', "Zz"),
            ('Z', "2z7"),
            ('z', "2Zs"),
            ('n', "mhr"),
            ('m', "nr"),
            ('h', "nbk"),
            ('c', "eo("),
            ('e', "cao"),
            ('a', "eod"),
            ('d', "aoc"),
            ('g', "q9a"),
            ('q', "g9p"),
            ('9', "gq"),
            ('v', "uwy"),
            ('u', "vnw"),
            ('w', "vux"),
            ('y', "vuj"),
            ('f', "tr"),
            ('t', "fl+"),
            ('r', "nf"),
        ];

        let confusions: HashMap<char, Vec<char>> = LOOKALIKES
            .iter()
            .map(|&(source, similar)| (source, similar.chars().collect()))
            .collect();

        Self { confusions }
    }

    /// Returns the characters that `c` may be confused with.
    ///
    /// The lookup is case-sensitive. A character with no entry yields a
    /// one-element vector holding `c` itself.
    pub fn get_confusions(&self, c: char) -> Vec<char> {
        match self.confusions.get(&c) {
            Some(similar) => similar.clone(),
            None => vec![c],
        }
    }
}
226
/// Table of words that are easily swapped for a similar-sounding word.
pub struct Homophones {
    /// Replacement candidates, indexed by lowercase word.
    homophones: HashMap<String, Vec<String>>,
}

impl Default for Homophones {
    /// Same as [`Homophones::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl Homophones {
    /// Builds the sound-alike table.
    ///
    /// The table is directional: a word only has an entry when it is listed
    /// as a key, even if it appears as a replacement for another word.
    pub fn new() -> Self {
        // Common business/accounting confusions: (word, replacements).
        const SOUND_ALIKES: &[(&str, &[&str])] = &[
            ("to", &["two", "too"]),
            ("two", &["to", "too"]),
            ("their", &["there", "they're"]),
            ("there", &["their", "they're"]),
            ("its", &["it's"]),
            ("your", &["you're"]),
            ("than", &["then"]),
            ("then", &["than"]),
            ("accept", &["except"]),
            ("affect", &["effect"]),
            ("effect", &["affect"]),
            ("capital", &["capitol"]),
            ("principal", &["principle"]),
            ("compliment", &["complement"]),
            ("stationary", &["stationery"]),
            ("advice", &["advise"]),
            ("loss", &["lost", "lose"]),
        ];

        let homophones: HashMap<String, Vec<String>> = SOUND_ALIKES
            .iter()
            .map(|&(word, replacements)| {
                let owned = replacements.iter().map(|&alt| alt.to_string()).collect();
                (word.to_string(), owned)
            })
            .collect();

        Self { homophones }
    }

    /// Looks up the replacement candidates for `word`, ignoring case.
    ///
    /// Returns `None` when the word has no entry.
    pub fn get_homophones(&self, word: &str) -> Option<&Vec<String>> {
        let key = word.to_lowercase();
        self.homophones.get(&key)
    }
}
280
281/// Typo generator.
282pub struct TypoGenerator {
283    config: TypoConfig,
284    keyboard: KeyboardLayout,
285    ocr: OCRConfusions,
286    homophones: Homophones,
287    stats: TypoStats,
288}
289
290/// Statistics for typo generation.
291#[derive(Debug, Clone, Default)]
292pub struct TypoStats {
293    pub total_characters: usize,
294    pub total_typos: usize,
295    pub by_type: HashMap<TypoType, usize>,
296    pub total_words: usize,
297    pub words_with_typos: usize,
298}
299
300impl TypoGenerator {
301    /// Creates a new typo generator.
302    pub fn new(config: TypoConfig) -> Self {
303        Self {
304            config,
305            keyboard: KeyboardLayout::default(),
306            ocr: OCRConfusions::default(),
307            homophones: Homophones::default(),
308            stats: TypoStats::default(),
309        }
310    }
311
312    /// Introduces typos into text.
313    pub fn introduce_typos<R: Rng>(&mut self, text: &str, rng: &mut R) -> String {
314        if self.config.preserve_word_boundaries {
315            self.introduce_typos_by_word(text, rng)
316        } else {
317            self.introduce_typos_by_char(text, rng)
318        }
319    }
320
321    /// Introduces typos word by word.
322    fn introduce_typos_by_word<R: Rng>(&mut self, text: &str, rng: &mut R) -> String {
323        let mut result = String::new();
324        let chars = text.chars().peekable();
325        let mut current_word = String::new();
326
327        for c in chars {
328            if c.is_alphanumeric() {
329                current_word.push(c);
330            } else {
331                // Process the word
332                if !current_word.is_empty() {
333                    self.stats.total_words += 1;
334                    let processed = self.process_word(&current_word, rng);
335                    if processed != current_word {
336                        self.stats.words_with_typos += 1;
337                    }
338                    result.push_str(&processed);
339                    current_word.clear();
340                }
341                result.push(c);
342            }
343        }
344
345        // Process remaining word
346        if !current_word.is_empty() {
347            self.stats.total_words += 1;
348            let processed = self.process_word(&current_word, rng);
349            if processed != current_word {
350                self.stats.words_with_typos += 1;
351            }
352            result.push_str(&processed);
353        }
354
355        result
356    }
357
358    /// Processes a single word for typos.
359    fn process_word<R: Rng>(&mut self, word: &str, rng: &mut R) -> String {
360        // Check for homophone substitution first
361        if let Some(homophones) = self.homophones.get_homophones(word) {
362            if rng.gen::<f64>() < self.config.char_error_rate * 10.0 {
363                // Higher probability for whole-word substitution
364                self.stats.total_typos += 1;
365                *self.stats.by_type.entry(TypoType::Homophone).or_insert(0) += 1;
366                return homophones[rng.gen_range(0..homophones.len())].clone();
367            }
368        }
369
370        let mut result: Vec<char> = word.chars().collect();
371        let mut typos_in_word = 0;
372        let mut i = 0;
373
374        while i < result.len() {
375            if typos_in_word >= self.config.max_typos_per_word {
376                break;
377            }
378
379            self.stats.total_characters += 1;
380
381            if rng.gen::<f64>() < self.config.char_error_rate {
382                let typo_type = self.select_typo_type(rng);
383                let c = result[i];
384
385                match typo_type {
386                    TypoType::Substitution => {
387                        let nearby = self.keyboard.get_nearby(c);
388                        if !nearby.is_empty() {
389                            result[i] = nearby[rng.gen_range(0..nearby.len())];
390                        }
391                    }
392                    TypoType::Transposition => {
393                        if i + 1 < result.len() {
394                            result.swap(i, i + 1);
395                        }
396                    }
397                    TypoType::Deletion => {
398                        if result.len() > 1 {
399                            result.remove(i);
400                            // Don't increment i since we removed the current element
401                            // Stats are tracked below, just continue to avoid index issues
402                            self.stats.total_typos += 1;
403                            *self.stats.by_type.entry(typo_type).or_insert(0) += 1;
404                            typos_in_word += 1;
405                            continue;
406                        }
407                    }
408                    TypoType::Insertion => {
409                        let nearby = self.keyboard.get_nearby(c);
410                        if !nearby.is_empty() {
411                            result.insert(i, nearby[rng.gen_range(0..nearby.len())]);
412                            // Skip the inserted character
413                            i += 1;
414                        }
415                    }
416                    TypoType::DoubleChar => {
417                        result.insert(i, c);
418                        // Skip the duplicated character
419                        i += 1;
420                    }
421                    TypoType::CaseError => {
422                        if c.is_uppercase() {
423                            result[i] = c.to_ascii_lowercase();
424                        } else {
425                            result[i] = c.to_ascii_uppercase();
426                        }
427                    }
428                    TypoType::OCRError => {
429                        let confusions = self.ocr.get_confusions(c);
430                        if !confusions.is_empty() {
431                            result[i] = confusions[rng.gen_range(0..confusions.len())];
432                        }
433                    }
434                    TypoType::Homophone => {
435                        // Already handled above
436                    }
437                }
438
439                self.stats.total_typos += 1;
440                *self.stats.by_type.entry(typo_type).or_insert(0) += 1;
441                typos_in_word += 1;
442            }
443            i += 1;
444        }
445
446        result.into_iter().collect()
447    }
448
449    /// Introduces typos character by character.
450    fn introduce_typos_by_char<R: Rng>(&mut self, text: &str, rng: &mut R) -> String {
451        let mut result = String::new();
452
453        for c in text.chars() {
454            self.stats.total_characters += 1;
455
456            if c.is_alphanumeric() && rng.gen::<f64>() < self.config.char_error_rate {
457                let typo_type = self.select_typo_type(rng);
458
459                match typo_type {
460                    TypoType::Substitution => {
461                        let nearby = self.keyboard.get_nearby(c);
462                        if !nearby.is_empty() {
463                            result.push(nearby[rng.gen_range(0..nearby.len())]);
464                        } else {
465                            result.push(c);
466                        }
467                    }
468                    TypoType::Deletion => {
469                        // Skip character (deletion)
470                    }
471                    TypoType::Insertion => {
472                        result.push(c);
473                        let nearby = self.keyboard.get_nearby(c);
474                        if !nearby.is_empty() {
475                            result.push(nearby[rng.gen_range(0..nearby.len())]);
476                        }
477                    }
478                    TypoType::DoubleChar => {
479                        result.push(c);
480                        result.push(c);
481                    }
482                    TypoType::CaseError => {
483                        if c.is_uppercase() {
484                            result.push(c.to_ascii_lowercase());
485                        } else {
486                            result.push(c.to_ascii_uppercase());
487                        }
488                    }
489                    _ => {
490                        result.push(c);
491                    }
492                }
493
494                self.stats.total_typos += 1;
495                *self.stats.by_type.entry(typo_type).or_insert(0) += 1;
496            } else {
497                result.push(c);
498            }
499        }
500
501        result
502    }
503
504    /// Selects a typo type based on weights.
505    fn select_typo_type<R: Rng>(&self, rng: &mut R) -> TypoType {
506        let total_weight: f64 = self.config.type_weights.values().sum();
507        let mut random_weight = rng.gen::<f64>() * total_weight;
508
509        for (typo_type, weight) in &self.config.type_weights {
510            random_weight -= weight;
511            if random_weight <= 0.0 {
512                return *typo_type;
513            }
514        }
515
516        TypoType::Substitution // Default fallback
517    }
518
519    /// Checks if a field is protected.
520    pub fn is_protected(&self, field: &str) -> bool {
521        self.config.protected_fields.contains(&field.to_string())
522    }
523
524    /// Returns statistics.
525    pub fn stats(&self) -> &TypoStats {
526        &self.stats
527    }
528
529    /// Resets statistics.
530    pub fn reset_stats(&mut self) {
531        self.stats = TypoStats::default();
532    }
533}
534
/// Kind of text-encoding defect that can be simulated.
///
/// Derives `Eq` and `Hash` in addition to `PartialEq`, matching [`TypoType`],
/// so values can be used as keys in hashed collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EncodingIssue {
    /// Mojibake (text decoded with the wrong character encoding).
    Mojibake,
    /// Missing characters (replaced with `?`).
    MissingChars,
    /// UTF-8 byte-order mark inserted.
    BOM,
    /// Control characters inserted.
    ControlChars,
    /// Characters replaced by HTML entities.
    HTMLEntities,
}
549
550/// Introduces encoding issues.
551pub fn introduce_encoding_issue<R: Rng>(text: &str, issue: EncodingIssue, rng: &mut R) -> String {
552    match issue {
553        EncodingIssue::Mojibake => {
554            // Simulate common Mojibake patterns
555            text.replace('é', "é")
556                .replace('ñ', "ñ")
557                .replace('ü', "ü")
558                .replace('ö', "ö")
559                .replace('ä', "ä")
560                .replace('€', "€")
561        }
562        EncodingIssue::MissingChars => text
563            .chars()
564            .map(|c| {
565                if !c.is_ascii() && rng.gen::<f64>() < 0.5 {
566                    '?'
567                } else {
568                    c
569                }
570            })
571            .collect(),
572        EncodingIssue::BOM => {
573            format!("\u{FEFF}{}", text)
574        }
575        EncodingIssue::ControlChars => {
576            let mut result = String::new();
577            for c in text.chars() {
578                result.push(c);
579                if rng.gen::<f64>() < 0.01 {
580                    // Insert random control character
581                    result.push('\u{0000}');
582                }
583            }
584            result
585        }
586        EncodingIssue::HTMLEntities => text
587            .replace('&', "&amp;")
588            .replace('<', "&lt;")
589            .replace('>', "&gt;")
590            .replace('"', "&quot;")
591            .replace(' ', "&nbsp;"),
592    }
593}
594
595#[cfg(test)]
596mod tests {
597    use super::*;
598    use rand::SeedableRng;
599    use rand_chacha::ChaCha8Rng;
600
601    #[test]
602    fn test_keyboard_nearby_keys() {
603        let keyboard = KeyboardLayout::qwerty();
604        let nearby = keyboard.get_nearby('e');
605        assert!(nearby.contains(&'w'));
606        assert!(nearby.contains(&'r'));
607        assert!(nearby.contains(&'s'));
608        assert!(nearby.contains(&'d'));
609    }
610
611    #[test]
612    fn test_typo_generation() {
613        let config = TypoConfig {
614            char_error_rate: 0.5, // High rate for testing
615            ..Default::default()
616        };
617
618        let mut generator = TypoGenerator::new(config);
619        let mut rng = ChaCha8Rng::seed_from_u64(42);
620
621        let text = "Hello World";
622        let _with_typos = generator.introduce_typos(text, &mut rng);
623
624        // With high error rate, should have some typos
625        assert!(generator.stats().total_typos > 0);
626    }
627
628    #[test]
629    fn test_encoding_issues() {
630        let mut rng = ChaCha8Rng::seed_from_u64(42);
631
632        let text = "Héllo & Wörld";
633        let mojibake = introduce_encoding_issue(text, EncodingIssue::Mojibake, &mut rng);
634        assert!(mojibake.contains("é"));
635
636        let html = introduce_encoding_issue("A & B", EncodingIssue::HTMLEntities, &mut rng);
637        assert!(html.contains("&amp;"));
638    }
639
640    #[test]
641    fn test_homophones() {
642        let homophones = Homophones::new();
643        let alternatives = homophones.get_homophones("their");
644        assert!(alternatives.is_some());
645        assert!(alternatives.unwrap().contains(&"there".to_string()));
646    }
647}