Skip to main content

datasynth_generators/data_quality/
typos.rs

1//! Typo and character error injection for data quality simulation.
2//!
3//! Simulates realistic typing errors:
4//! - Character substitution (nearby keys)
5//! - Character transposition (adjacent swaps)
6//! - Character insertion (double-typing)
7//! - Character deletion (missed keys)
8//! - Encoding issues (character corruption)
9
10use datasynth_core::utils::weighted_select;
11use rand::Rng;
12use serde::{Deserialize, Serialize};
13use std::collections::HashMap;
14
15/// Type of typo/error.
16#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
17pub enum TypoType {
18    /// Substitution with nearby key.
19    Substitution,
20    /// Adjacent character transposition.
21    Transposition,
22    /// Extra character insertion.
23    Insertion,
24    /// Missing character deletion.
25    Deletion,
26    /// Double character (repetition).
27    DoubleChar,
28    /// Case error (wrong case).
29    CaseError,
30    /// Homophone substitution.
31    Homophone,
32    /// OCR-style error (similar looking characters).
33    OCRError,
34}
35
36impl TypoType {
37    /// Returns all typo types.
38    pub fn all() -> Vec<Self> {
39        vec![
40            TypoType::Substitution,
41            TypoType::Transposition,
42            TypoType::Insertion,
43            TypoType::Deletion,
44            TypoType::DoubleChar,
45            TypoType::CaseError,
46            TypoType::Homophone,
47            TypoType::OCRError,
48        ]
49    }
50}
51
52/// Configuration for typo generation.
53#[derive(Debug, Clone)]
54pub struct TypoConfig {
55    /// Overall typo rate (per character).
56    pub char_error_rate: f64,
57    /// Weights for each typo type.
58    pub type_weights: HashMap<TypoType, f64>,
59    /// Whether to preserve word boundaries.
60    pub preserve_word_boundaries: bool,
61    /// Maximum typos per word.
62    pub max_typos_per_word: usize,
63    /// Fields that should not have typos (identifiers, codes).
64    pub protected_fields: Vec<String>,
65}
66
67impl Default for TypoConfig {
68    fn default() -> Self {
69        let mut type_weights = HashMap::new();
70        type_weights.insert(TypoType::Substitution, 0.25);
71        type_weights.insert(TypoType::Transposition, 0.20);
72        type_weights.insert(TypoType::Insertion, 0.10);
73        type_weights.insert(TypoType::Deletion, 0.20);
74        type_weights.insert(TypoType::DoubleChar, 0.10);
75        type_weights.insert(TypoType::CaseError, 0.10);
76        type_weights.insert(TypoType::OCRError, 0.05);
77
78        Self {
79            char_error_rate: 0.005, // 0.5% per character
80            type_weights,
81            preserve_word_boundaries: true,
82            max_typos_per_word: 2,
83            protected_fields: vec![
84                "document_number".to_string(),
85                "account_code".to_string(),
86                "company_code".to_string(),
87                "vendor_id".to_string(),
88                "customer_id".to_string(),
89            ],
90        }
91    }
92}
93
/// QWERTY keyboard layout for nearby key substitution.
#[derive(Debug, Clone)]
pub struct KeyboardLayout {
    /// Map from a lowercase letter or digit to the keys physically next to it.
    nearby_keys: HashMap<char, Vec<char>>,
}

impl Default for KeyboardLayout {
    /// The default layout is QWERTY.
    fn default() -> Self {
        Self::qwerty()
    }
}

impl KeyboardLayout {
    /// Creates a QWERTY keyboard layout covering the letters `a`-`z` and the
    /// digits `0`-`9`.
    pub fn qwerty() -> Self {
        // Each entry is (key, neighbouring keys); neighbour order is preserved
        // because callers pick a neighbour by index.
        const ADJACENCY: &[(char, &str)] = &[
            // Row 1: qwertyuiop
            ('q', "wa12"),
            ('w', "qeas23"),
            ('e', "wrsd34"),
            ('r', "etdf45"),
            ('t', "ryfg56"),
            ('y', "tugh67"),
            ('u', "yihj78"),
            ('i', "uojk89"),
            ('o', "ipkl90"),
            ('p', "ol0"),
            // Row 2: asdfghjkl
            ('a', "qwsz"),
            ('s', "awedzx"),
            ('d', "serfxc"),
            ('f', "drtgcv"),
            ('g', "ftyhvb"),
            ('h', "gyujbn"),
            ('j', "huiknm"),
            ('k', "jiolm"),
            ('l', "kop"),
            // Row 3: zxcvbnm
            ('z', "asx"),
            ('x', "zsdc"),
            ('c', "xdfv"),
            ('v', "cfgb"),
            ('b', "vghn"),
            ('n', "bhjm"),
            ('m', "njk"),
            // Numbers
            ('1', "2q"),
            ('2', "13qw"),
            ('3', "24we"),
            ('4', "35er"),
            ('5', "46rt"),
            ('6', "57ty"),
            ('7', "68yu"),
            ('8', "79ui"),
            ('9', "80io"),
            ('0', "9op"),
        ];

        let nearby_keys: HashMap<char, Vec<char>> = ADJACENCY
            .iter()
            .map(|&(key, neighbours)| (key, neighbours.chars().collect::<Vec<char>>()))
            .collect();

        Self { nearby_keys }
    }

    /// Gets nearby keys for a character.
    ///
    /// The lookup is case-insensitive for ASCII letters, and the case of `c`
    /// is carried over to the result: an uppercase letter yields uppercased
    /// neighbours (digits are unaffected), so substituting a capital does not
    /// silently turn it into a lowercase letter.
    ///
    /// A character that is not in the layout is returned as its own single
    /// "neighbour", so the result is never empty.
    pub fn get_nearby(&self, c: char) -> Vec<char> {
        match self.nearby_keys.get(&c.to_ascii_lowercase()) {
            Some(neighbours) if c.is_ascii_uppercase() => neighbours
                .iter()
                .map(|neighbour| neighbour.to_ascii_uppercase())
                .collect(),
            Some(neighbours) => neighbours.clone(),
            None => vec![c],
        }
    }
}
166
/// Table of characters that OCR commonly mistakes for one another.
pub struct OCRConfusions {
    /// Each character mapped to the characters it can be confused with.
    confusions: HashMap<char, Vec<char>>,
}

impl Default for OCRConfusions {
    /// Equivalent to [`OCRConfusions::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl OCRConfusions {
    /// Builds the confusion table.
    ///
    /// Keys are case-sensitive; the order of each look-alike list is fixed
    /// by the table below.
    pub fn new() -> Self {
        // (character, look-alike characters)
        const LOOK_ALIKES: &[(char, &str)] = &[
            ('0', "OoQD"),
            ('O', "0QDo"),
            ('o', "0Oae"),
            ('1', "lIi|7"),
            ('l', "1Ii|"),
            ('I', "1li|"),
            ('i', "1lIj"),
            ('5', "Ss"),
            ('S', "5s8"),
            ('s', "5Sz"),
            ('8', "B&S"),
            ('B', "8RD"),
            ('6', "Gb"),
            ('G', "6CO"),
            ('2', "Zz"),
            ('Z', "2z7"),
            ('z', "2Zs"),
            ('n', "mhr"),
            ('m', "nr"),
            ('h', "nbk"),
            ('c', "eo("),
            ('e', "cao"),
            ('a', "eod"),
            ('d', "aoc"),
            ('g', "q9a"),
            ('q', "g9p"),
            ('9', "gq"),
            ('v', "uwy"),
            ('u', "vnw"),
            ('w', "vux"),
            ('y', "vuj"),
            ('f', "tr"),
            ('t', "fl+"),
            ('r', "nf"),
        ];

        let confusions: HashMap<char, Vec<char>> = LOOK_ALIKES
            .iter()
            .map(|&(glyph, similar)| (glyph, similar.chars().collect::<Vec<char>>()))
            .collect();

        Self { confusions }
    }

    /// Returns the characters `c` can be confused with.
    ///
    /// A character without an entry is returned as a one-element list
    /// containing itself.
    pub fn get_confusions(&self, c: char) -> Vec<char> {
        match self.confusions.get(&c) {
            Some(similar) => similar.clone(),
            None => vec![c],
        }
    }
}
228
/// Lookup table of commonly confused, similar-sounding words.
pub struct Homophones {
    /// Lowercase word mapped to the words it may be replaced with.
    homophones: HashMap<String, Vec<String>>,
}

impl Default for Homophones {
    /// Equivalent to [`Homophones::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl Homophones {
    /// Builds the homophone table.
    pub fn new() -> Self {
        // Common business/accounting homophones: (word, replacements)
        const PAIRS: &[(&str, &[&str])] = &[
            ("to", &["two", "too"]),
            ("two", &["to", "too"]),
            ("their", &["there", "they're"]),
            ("there", &["their", "they're"]),
            ("its", &["it's"]),
            ("your", &["you're"]),
            ("than", &["then"]),
            ("then", &["than"]),
            ("accept", &["except"]),
            ("affect", &["effect"]),
            ("effect", &["affect"]),
            ("capital", &["capitol"]),
            ("principal", &["principle"]),
            ("compliment", &["complement"]),
            ("stationary", &["stationery"]),
            ("advice", &["advise"]),
            ("loss", &["lost", "lose"]),
        ];

        let homophones: HashMap<String, Vec<String>> = PAIRS
            .iter()
            .map(|&(word, alternatives)| {
                let alternatives: Vec<String> =
                    alternatives.iter().map(|alt| alt.to_string()).collect();
                (word.to_string(), alternatives)
            })
            .collect();

        Self { homophones }
    }

    /// Looks up the replacements for `word`, ignoring its case.
    ///
    /// Returns `None` when the word has no entry.
    pub fn get_homophones(&self, word: &str) -> Option<&Vec<String>> {
        let key = word.to_lowercase();
        self.homophones.get(key.as_str())
    }
}
282
283/// Typo generator.
284pub struct TypoGenerator {
285    config: TypoConfig,
286    keyboard: KeyboardLayout,
287    ocr: OCRConfusions,
288    homophones: Homophones,
289    stats: TypoStats,
290}
291
292/// Statistics for typo generation.
293#[derive(Debug, Clone, Default, Serialize, Deserialize)]
294pub struct TypoStats {
295    pub total_characters: usize,
296    pub total_typos: usize,
297    pub by_type: HashMap<TypoType, usize>,
298    pub total_words: usize,
299    pub words_with_typos: usize,
300}
301
302impl TypoGenerator {
303    /// Creates a new typo generator.
304    pub fn new(config: TypoConfig) -> Self {
305        Self {
306            config,
307            keyboard: KeyboardLayout::default(),
308            ocr: OCRConfusions::default(),
309            homophones: Homophones::default(),
310            stats: TypoStats::default(),
311        }
312    }
313
314    /// Introduces typos into text.
315    pub fn introduce_typos<R: Rng>(&mut self, text: &str, rng: &mut R) -> String {
316        if self.config.preserve_word_boundaries {
317            self.introduce_typos_by_word(text, rng)
318        } else {
319            self.introduce_typos_by_char(text, rng)
320        }
321    }
322
323    /// Introduces typos word by word.
324    fn introduce_typos_by_word<R: Rng>(&mut self, text: &str, rng: &mut R) -> String {
325        let mut result = String::new();
326        let chars = text.chars().peekable();
327        let mut current_word = String::new();
328
329        for c in chars {
330            if c.is_alphanumeric() {
331                current_word.push(c);
332            } else {
333                // Process the word
334                if !current_word.is_empty() {
335                    self.stats.total_words += 1;
336                    let processed = self.process_word(&current_word, rng);
337                    if processed != current_word {
338                        self.stats.words_with_typos += 1;
339                    }
340                    result.push_str(&processed);
341                    current_word.clear();
342                }
343                result.push(c);
344            }
345        }
346
347        // Process remaining word
348        if !current_word.is_empty() {
349            self.stats.total_words += 1;
350            let processed = self.process_word(&current_word, rng);
351            if processed != current_word {
352                self.stats.words_with_typos += 1;
353            }
354            result.push_str(&processed);
355        }
356
357        result
358    }
359
360    /// Processes a single word for typos.
361    fn process_word<R: Rng>(&mut self, word: &str, rng: &mut R) -> String {
362        // Check for homophone substitution first
363        if let Some(homophones) = self.homophones.get_homophones(word) {
364            if rng.gen::<f64>() < self.config.char_error_rate * 10.0 {
365                // Higher probability for whole-word substitution
366                self.stats.total_typos += 1;
367                *self.stats.by_type.entry(TypoType::Homophone).or_insert(0) += 1;
368                return homophones[rng.gen_range(0..homophones.len())].clone();
369            }
370        }
371
372        let mut result: Vec<char> = word.chars().collect();
373        let mut typos_in_word = 0;
374        let mut i = 0;
375
376        while i < result.len() {
377            if typos_in_word >= self.config.max_typos_per_word {
378                break;
379            }
380
381            self.stats.total_characters += 1;
382
383            if rng.gen::<f64>() < self.config.char_error_rate {
384                let typo_type = self.select_typo_type(rng);
385                let c = result[i];
386
387                match typo_type {
388                    TypoType::Substitution => {
389                        let nearby = self.keyboard.get_nearby(c);
390                        if !nearby.is_empty() {
391                            result[i] = nearby[rng.gen_range(0..nearby.len())];
392                        }
393                    }
394                    TypoType::Transposition => {
395                        if i + 1 < result.len() {
396                            result.swap(i, i + 1);
397                        }
398                    }
399                    TypoType::Deletion => {
400                        if result.len() > 1 {
401                            result.remove(i);
402                            // Don't increment i since we removed the current element
403                            // Stats are tracked below, just continue to avoid index issues
404                            self.stats.total_typos += 1;
405                            *self.stats.by_type.entry(typo_type).or_insert(0) += 1;
406                            typos_in_word += 1;
407                            continue;
408                        }
409                    }
410                    TypoType::Insertion => {
411                        let nearby = self.keyboard.get_nearby(c);
412                        if !nearby.is_empty() {
413                            result.insert(i, nearby[rng.gen_range(0..nearby.len())]);
414                            // Skip the inserted character
415                            i += 1;
416                        }
417                    }
418                    TypoType::DoubleChar => {
419                        result.insert(i, c);
420                        // Skip the duplicated character
421                        i += 1;
422                    }
423                    TypoType::CaseError => {
424                        if c.is_uppercase() {
425                            result[i] = c.to_ascii_lowercase();
426                        } else {
427                            result[i] = c.to_ascii_uppercase();
428                        }
429                    }
430                    TypoType::OCRError => {
431                        let confusions = self.ocr.get_confusions(c);
432                        if !confusions.is_empty() {
433                            result[i] = confusions[rng.gen_range(0..confusions.len())];
434                        }
435                    }
436                    TypoType::Homophone => {
437                        // Already handled above
438                    }
439                }
440
441                self.stats.total_typos += 1;
442                *self.stats.by_type.entry(typo_type).or_insert(0) += 1;
443                typos_in_word += 1;
444            }
445            i += 1;
446        }
447
448        result.into_iter().collect()
449    }
450
451    /// Introduces typos character by character.
452    fn introduce_typos_by_char<R: Rng>(&mut self, text: &str, rng: &mut R) -> String {
453        let mut result = String::new();
454
455        for c in text.chars() {
456            self.stats.total_characters += 1;
457
458            if c.is_alphanumeric() && rng.gen::<f64>() < self.config.char_error_rate {
459                let typo_type = self.select_typo_type(rng);
460
461                match typo_type {
462                    TypoType::Substitution => {
463                        let nearby = self.keyboard.get_nearby(c);
464                        if !nearby.is_empty() {
465                            result.push(nearby[rng.gen_range(0..nearby.len())]);
466                        } else {
467                            result.push(c);
468                        }
469                    }
470                    TypoType::Deletion => {
471                        // Skip character (deletion)
472                    }
473                    TypoType::Insertion => {
474                        result.push(c);
475                        let nearby = self.keyboard.get_nearby(c);
476                        if !nearby.is_empty() {
477                            result.push(nearby[rng.gen_range(0..nearby.len())]);
478                        }
479                    }
480                    TypoType::DoubleChar => {
481                        result.push(c);
482                        result.push(c);
483                    }
484                    TypoType::CaseError => {
485                        if c.is_uppercase() {
486                            result.push(c.to_ascii_lowercase());
487                        } else {
488                            result.push(c.to_ascii_uppercase());
489                        }
490                    }
491                    _ => {
492                        result.push(c);
493                    }
494                }
495
496                self.stats.total_typos += 1;
497                *self.stats.by_type.entry(typo_type).or_insert(0) += 1;
498            } else {
499                result.push(c);
500            }
501        }
502
503        result
504    }
505
506    /// Selects a typo type based on weights.
507    fn select_typo_type<R: Rng>(&self, rng: &mut R) -> TypoType {
508        let options: Vec<(TypoType, f64)> = self
509            .config
510            .type_weights
511            .iter()
512            .map(|(&typo_type, &weight)| (typo_type, weight))
513            .collect();
514
515        if options.is_empty() {
516            return TypoType::Substitution;
517        }
518
519        *weighted_select(rng, &options)
520    }
521
522    /// Checks if a field is protected.
523    pub fn is_protected(&self, field: &str) -> bool {
524        self.config.protected_fields.contains(&field.to_string())
525    }
526
527    /// Returns statistics.
528    pub fn stats(&self) -> &TypoStats {
529        &self.stats
530    }
531
532    /// Resets statistics.
533    pub fn reset_stats(&mut self) {
534        self.stats = TypoStats::default();
535    }
536}
537
/// Kinds of text-encoding corruption that can be simulated.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum EncodingIssue {
    /// Text decoded with the wrong character encoding.
    Mojibake,
    /// Characters lost and replaced with `?`.
    MissingChars,
    /// A UTF-8 byte order mark is inserted.
    BOM,
    /// Control characters are inserted.
    ControlChars,
    /// Characters are replaced by HTML entities.
    HTMLEntities,
}
552
553/// Introduces encoding issues.
554pub fn introduce_encoding_issue<R: Rng>(text: &str, issue: EncodingIssue, rng: &mut R) -> String {
555    match issue {
556        EncodingIssue::Mojibake => {
557            // Simulate common Mojibake patterns
558            text.replace('é', "é")
559                .replace('ñ', "ñ")
560                .replace('ü', "ü")
561                .replace('ö', "ö")
562                .replace('ä', "ä")
563                .replace('€', "€")
564        }
565        EncodingIssue::MissingChars => text
566            .chars()
567            .map(|c| {
568                if !c.is_ascii() && rng.gen::<f64>() < 0.5 {
569                    '?'
570                } else {
571                    c
572                }
573            })
574            .collect(),
575        EncodingIssue::BOM => {
576            format!("\u{FEFF}{}", text)
577        }
578        EncodingIssue::ControlChars => {
579            let mut result = String::new();
580            for c in text.chars() {
581                result.push(c);
582                if rng.gen::<f64>() < 0.01 {
583                    // Insert random control character
584                    result.push('\u{0000}');
585                }
586            }
587            result
588        }
589        EncodingIssue::HTMLEntities => text
590            .replace('&', "&amp;")
591            .replace('<', "&lt;")
592            .replace('>', "&gt;")
593            .replace('"', "&quot;")
594            .replace(' ', "&nbsp;"),
595    }
596}
597
598#[cfg(test)]
599#[allow(clippy::unwrap_used)]
600mod tests {
601    use super::*;
602    use datasynth_core::utils::seeded_rng;
603
604    #[test]
605    fn test_keyboard_nearby_keys() {
606        let keyboard = KeyboardLayout::qwerty();
607        let nearby = keyboard.get_nearby('e');
608        assert!(nearby.contains(&'w'));
609        assert!(nearby.contains(&'r'));
610        assert!(nearby.contains(&'s'));
611        assert!(nearby.contains(&'d'));
612    }
613
614    #[test]
615    fn test_typo_generation() {
616        let config = TypoConfig {
617            char_error_rate: 0.5, // High rate for testing
618            ..Default::default()
619        };
620
621        let mut generator = TypoGenerator::new(config);
622        let mut rng = seeded_rng(42, 0);
623
624        let text = "Hello World";
625        let _with_typos = generator.introduce_typos(text, &mut rng);
626
627        // With high error rate, should have some typos
628        assert!(generator.stats().total_typos > 0);
629    }
630
631    #[test]
632    fn test_encoding_issues() {
633        let mut rng = seeded_rng(42, 0);
634
635        let text = "Héllo & Wörld";
636        let mojibake = introduce_encoding_issue(text, EncodingIssue::Mojibake, &mut rng);
637        assert!(mojibake.contains("é"));
638
639        let html = introduce_encoding_issue("A & B", EncodingIssue::HTMLEntities, &mut rng);
640        assert!(html.contains("&amp;"));
641    }
642
643    #[test]
644    fn test_homophones() {
645        let homophones = Homophones::new();
646        let alternatives = homophones.get_homophones("their");
647        assert!(alternatives.is_some());
648        assert!(alternatives.unwrap().contains(&"there".to_string()));
649    }
650}