1use datasynth_core::utils::weighted_select;
11use rand::Rng;
12use serde::{Deserialize, Serialize};
13use std::collections::HashMap;
14
/// Kinds of typing or transcription error that the typo generator can inject.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum TypoType {
    /// A character is replaced by one adjacent to it on the keyboard layout.
    Substitution,
    /// A character is swapped with the character that follows it.
    Transposition,
    /// An extra keyboard-adjacent character is inserted next to a character.
    Insertion,
    /// A character is dropped.
    Deletion,
    /// A character is typed twice in a row.
    DoubleChar,
    /// The case of a character is flipped.
    CaseError,
    /// A whole word is replaced by an entry from the homophone table.
    Homophone,
    /// A character is replaced by a look-alike from the OCR confusion table.
    OCRError,
}
35
36impl TypoType {
37 pub fn all() -> Vec<Self> {
39 vec![
40 TypoType::Substitution,
41 TypoType::Transposition,
42 TypoType::Insertion,
43 TypoType::Deletion,
44 TypoType::DoubleChar,
45 TypoType::CaseError,
46 TypoType::Homophone,
47 TypoType::OCRError,
48 ]
49 }
50}
51
/// Settings that control how `TypoGenerator` injects typos.
#[derive(Debug, Clone)]
pub struct TypoConfig {
    /// Probability that a typo is attempted on any single character.
    /// The whole-word homophone check uses ten times this value.
    pub char_error_rate: f64,
    /// Relative weight of each typo type when one is picked for a character.
    /// If the map is empty, `TypoType::Substitution` is used.
    pub type_weights: HashMap<TypoType, f64>,
    /// When `true`, text is split into alphanumeric words that are processed one
    /// at a time; when `false`, every character is processed independently.
    pub preserve_word_boundaries: bool,
    /// Upper bound on typos recorded for one word (word-preserving mode only).
    pub max_typos_per_word: usize,
    /// Field names that `TypoGenerator::is_protected` reports as protected.
    pub protected_fields: Vec<String>,
}
66
67impl Default for TypoConfig {
68 fn default() -> Self {
69 let mut type_weights = HashMap::new();
70 type_weights.insert(TypoType::Substitution, 0.25);
71 type_weights.insert(TypoType::Transposition, 0.20);
72 type_weights.insert(TypoType::Insertion, 0.10);
73 type_weights.insert(TypoType::Deletion, 0.20);
74 type_weights.insert(TypoType::DoubleChar, 0.10);
75 type_weights.insert(TypoType::CaseError, 0.10);
76 type_weights.insert(TypoType::OCRError, 0.05);
77
78 Self {
79 char_error_rate: 0.005, type_weights,
81 preserve_word_boundaries: true,
82 max_typos_per_word: 2,
83 protected_fields: vec![
84 "document_number".to_string(),
85 "account_code".to_string(),
86 "company_code".to_string(),
87 "vendor_id".to_string(),
88 "customer_id".to_string(),
89 ],
90 }
91 }
92}
93
/// Adjacency map of a keyboard, used to pick plausible mistyped characters.
pub struct KeyboardLayout {
    /// For each lowercase letter or digit, the keys located next to it.
    nearby_keys: HashMap<char, Vec<char>>,
}

impl Default for KeyboardLayout {
    /// The default layout is QWERTY.
    fn default() -> Self {
        Self::qwerty()
    }
}

impl KeyboardLayout {
    /// Builds the adjacency map for a QWERTY keyboard (letters and the digit row).
    pub fn qwerty() -> Self {
        // Each entry is (key, neighbours); the neighbour order is significant
        // because callers index into the resulting vector.
        const ADJACENCY: &[(char, &str)] = &[
            // Top letter row.
            ('q', "wa12"),
            ('w', "qeas23"),
            ('e', "wrsd34"),
            ('r', "etdf45"),
            ('t', "ryfg56"),
            ('y', "tugh67"),
            ('u', "yihj78"),
            ('i', "uojk89"),
            ('o', "ipkl90"),
            ('p', "ol0"),
            // Home row.
            ('a', "qwsz"),
            ('s', "awedzx"),
            ('d', "serfxc"),
            ('f', "drtgcv"),
            ('g', "ftyhvb"),
            ('h', "gyujbn"),
            ('j', "huiknm"),
            ('k', "jiolm"),
            ('l', "kop"),
            // Bottom row.
            ('z', "asx"),
            ('x', "zsdc"),
            ('c', "xdfv"),
            ('v', "cfgb"),
            ('b', "vghn"),
            ('n', "bhjm"),
            ('m', "njk"),
            // Digit row.
            ('1', "2q"),
            ('2', "13qw"),
            ('3', "24we"),
            ('4', "35er"),
            ('5', "46rt"),
            ('6', "57ty"),
            ('7', "68yu"),
            ('8', "79ui"),
            ('9', "80io"),
            ('0', "9op"),
        ];

        let nearby_keys: HashMap<char, Vec<char>> = ADJACENCY
            .iter()
            .map(|&(key, neighbours)| (key, neighbours.chars().collect()))
            .collect();

        Self { nearby_keys }
    }

    /// Returns the keys adjacent to `c`.
    ///
    /// The lookup is done on the ASCII-lowercased character, so the neighbours
    /// of an uppercase letter come back in lowercase. A character that is not
    /// in the layout yields a one-element vector containing `c` itself.
    pub fn get_nearby(&self, c: char) -> Vec<char> {
        match self.nearby_keys.get(&c.to_ascii_lowercase()) {
            Some(neighbours) => neighbours.clone(),
            None => vec![c],
        }
    }
}
166
/// Table of characters that are commonly mistaken for one another by OCR.
pub struct OCRConfusions {
    /// For each known character, the look-alikes it may be misread as.
    confusions: HashMap<char, Vec<char>>,
}

impl Default for OCRConfusions {
    /// Same as [`OCRConfusions::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl OCRConfusions {
    /// Builds the built-in confusion table.
    pub fn new() -> Self {
        // Each entry is (character, look-alikes); the look-alike order is
        // significant because callers index into the resulting vector.
        const LOOKALIKES: &[(char, &str)] = &[
            ('0', "OoQD"),
            ('O', "0QDo"),
            ('o', "0Oae"),
            ('1', "lIi|7"),
            ('l', "1Ii|"),
            ('I', "1li|"),
            ('i', "1lIj"),
            ('5', "Ss"),
            ('S', "5s8"),
            ('s', "5Sz"),
            ('8', "B&S"),
            ('B', "8RD"),
            ('6', "Gb"),
            ('G', "6CO"),
            ('2', "Zz"),
            ('Z', "2z7"),
            ('z', "2Zs"),
            ('n', "mhr"),
            ('m', "nr"),
            ('h', "nbk"),
            ('c', "eo("),
            ('e', "cao"),
            ('a', "eod"),
            ('d', "aoc"),
            ('g', "q9a"),
            ('q', "g9p"),
            ('9', "gq"),
            ('v', "uwy"),
            ('u', "vnw"),
            ('w', "vux"),
            ('y', "vuj"),
            ('f', "tr"),
            ('t', "fl+"),
            ('r', "nf"),
        ];

        let confusions: HashMap<char, Vec<char>> = LOOKALIKES
            .iter()
            .map(|&(glyph, similar)| (glyph, similar.chars().collect()))
            .collect();

        Self { confusions }
    }

    /// Returns the characters `c` may be misread as.
    ///
    /// The lookup is case-sensitive. A character without an entry yields a
    /// one-element vector containing `c` itself.
    pub fn get_confusions(&self, c: char) -> Vec<char> {
        match self.confusions.get(&c) {
            Some(similar) => similar.clone(),
            None => vec![c],
        }
    }
}
228
/// Table of words that are commonly confused with similar-sounding words.
pub struct Homophones {
    /// Lowercase word mapped to the words it may be replaced with.
    homophones: HashMap<String, Vec<String>>,
}

impl Default for Homophones {
    /// Same as [`Homophones::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl Homophones {
    /// Builds the built-in homophone table.
    pub fn new() -> Self {
        // Each entry is (word, replacements). Entries are one-directional:
        // a replacement is only a key if it is listed as one.
        const TABLE: &[(&str, &[&str])] = &[
            ("to", &["two", "too"]),
            ("two", &["to", "too"]),
            ("their", &["there", "they're"]),
            ("there", &["their", "they're"]),
            ("its", &["it's"]),
            ("your", &["you're"]),
            ("than", &["then"]),
            ("then", &["than"]),
            ("accept", &["except"]),
            ("affect", &["effect"]),
            ("effect", &["affect"]),
            ("capital", &["capitol"]),
            ("principal", &["principle"]),
            ("compliment", &["complement"]),
            ("stationary", &["stationery"]),
            ("advice", &["advise"]),
            ("loss", &["lost", "lose"]),
        ];

        let homophones: HashMap<String, Vec<String>> = TABLE
            .iter()
            .map(|&(word, replacements)| {
                let replacements = replacements.iter().map(|&alt| alt.to_string()).collect();
                (word.to_string(), replacements)
            })
            .collect();

        Self { homophones }
    }

    /// Looks up the replacements for `word`, ignoring its case.
    ///
    /// Returns `None` when the word has no entry in the table.
    pub fn get_homophones(&self, word: &str) -> Option<&Vec<String>> {
        let lowered = word.to_lowercase();
        self.homophones.get(lowered.as_str())
    }
}
282
/// Injects typos into text and keeps running statistics about what it injected.
pub struct TypoGenerator {
    /// Error rate, typo-type weights and mode switches.
    config: TypoConfig,
    /// Source of keyboard-adjacent characters for substitutions and insertions.
    keyboard: KeyboardLayout,
    /// Source of look-alike characters for OCR errors.
    ocr: OCRConfusions,
    /// Source of whole-word homophone replacements.
    homophones: Homophones,
    /// Counters accumulated across calls until `reset_stats` is called.
    stats: TypoStats,
}
291
/// Counters describing the work done by a `TypoGenerator`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct TypoStats {
    /// Number of characters the generator has visited.
    pub total_characters: usize,
    /// Number of typos recorded, across all types.
    pub total_typos: usize,
    /// Number of typos recorded, broken down by type.
    pub by_type: HashMap<TypoType, usize>,
    /// Number of alphanumeric words processed (word-preserving mode only).
    pub total_words: usize,
    /// Number of processed words whose output differs from their input.
    pub words_with_typos: usize,
}
301
302impl TypoGenerator {
303 pub fn new(config: TypoConfig) -> Self {
305 Self {
306 config,
307 keyboard: KeyboardLayout::default(),
308 ocr: OCRConfusions::default(),
309 homophones: Homophones::default(),
310 stats: TypoStats::default(),
311 }
312 }
313
314 pub fn introduce_typos<R: Rng>(&mut self, text: &str, rng: &mut R) -> String {
316 if self.config.preserve_word_boundaries {
317 self.introduce_typos_by_word(text, rng)
318 } else {
319 self.introduce_typos_by_char(text, rng)
320 }
321 }
322
323 fn introduce_typos_by_word<R: Rng>(&mut self, text: &str, rng: &mut R) -> String {
325 let mut result = String::new();
326 let chars = text.chars().peekable();
327 let mut current_word = String::new();
328
329 for c in chars {
330 if c.is_alphanumeric() {
331 current_word.push(c);
332 } else {
333 if !current_word.is_empty() {
335 self.stats.total_words += 1;
336 let processed = self.process_word(¤t_word, rng);
337 if processed != current_word {
338 self.stats.words_with_typos += 1;
339 }
340 result.push_str(&processed);
341 current_word.clear();
342 }
343 result.push(c);
344 }
345 }
346
347 if !current_word.is_empty() {
349 self.stats.total_words += 1;
350 let processed = self.process_word(¤t_word, rng);
351 if processed != current_word {
352 self.stats.words_with_typos += 1;
353 }
354 result.push_str(&processed);
355 }
356
357 result
358 }
359
360 fn process_word<R: Rng>(&mut self, word: &str, rng: &mut R) -> String {
362 if let Some(homophones) = self.homophones.get_homophones(word) {
364 if rng.random::<f64>() < self.config.char_error_rate * 10.0 {
365 self.stats.total_typos += 1;
367 *self.stats.by_type.entry(TypoType::Homophone).or_insert(0) += 1;
368 return homophones[rng.random_range(0..homophones.len())].clone();
369 }
370 }
371
372 let mut result: Vec<char> = word.chars().collect();
373 let mut typos_in_word = 0;
374 let mut i = 0;
375
376 while i < result.len() {
377 if typos_in_word >= self.config.max_typos_per_word {
378 break;
379 }
380
381 self.stats.total_characters += 1;
382
383 if rng.random::<f64>() < self.config.char_error_rate {
384 let typo_type = self.select_typo_type(rng);
385 let c = result[i];
386
387 match typo_type {
388 TypoType::Substitution => {
389 let nearby = self.keyboard.get_nearby(c);
390 if !nearby.is_empty() {
391 result[i] = nearby[rng.random_range(0..nearby.len())];
392 }
393 }
394 TypoType::Transposition => {
395 if i + 1 < result.len() {
396 result.swap(i, i + 1);
397 }
398 }
399 TypoType::Deletion => {
400 if result.len() > 1 {
401 result.remove(i);
402 self.stats.total_typos += 1;
405 *self.stats.by_type.entry(typo_type).or_insert(0) += 1;
406 typos_in_word += 1;
407 continue;
408 }
409 }
410 TypoType::Insertion => {
411 let nearby = self.keyboard.get_nearby(c);
412 if !nearby.is_empty() {
413 result.insert(i, nearby[rng.random_range(0..nearby.len())]);
414 i += 1;
416 }
417 }
418 TypoType::DoubleChar => {
419 result.insert(i, c);
420 i += 1;
422 }
423 TypoType::CaseError => {
424 if c.is_uppercase() {
425 result[i] = c.to_ascii_lowercase();
426 } else {
427 result[i] = c.to_ascii_uppercase();
428 }
429 }
430 TypoType::OCRError => {
431 let confusions = self.ocr.get_confusions(c);
432 if !confusions.is_empty() {
433 result[i] = confusions[rng.random_range(0..confusions.len())];
434 }
435 }
436 TypoType::Homophone => {
437 }
439 }
440
441 self.stats.total_typos += 1;
442 *self.stats.by_type.entry(typo_type).or_insert(0) += 1;
443 typos_in_word += 1;
444 }
445 i += 1;
446 }
447
448 result.into_iter().collect()
449 }
450
451 fn introduce_typos_by_char<R: Rng>(&mut self, text: &str, rng: &mut R) -> String {
453 let mut result = String::new();
454
455 for c in text.chars() {
456 self.stats.total_characters += 1;
457
458 if c.is_alphanumeric() && rng.random::<f64>() < self.config.char_error_rate {
459 let typo_type = self.select_typo_type(rng);
460
461 match typo_type {
462 TypoType::Substitution => {
463 let nearby = self.keyboard.get_nearby(c);
464 if !nearby.is_empty() {
465 result.push(nearby[rng.random_range(0..nearby.len())]);
466 } else {
467 result.push(c);
468 }
469 }
470 TypoType::Deletion => {
471 }
473 TypoType::Insertion => {
474 result.push(c);
475 let nearby = self.keyboard.get_nearby(c);
476 if !nearby.is_empty() {
477 result.push(nearby[rng.random_range(0..nearby.len())]);
478 }
479 }
480 TypoType::DoubleChar => {
481 result.push(c);
482 result.push(c);
483 }
484 TypoType::CaseError => {
485 if c.is_uppercase() {
486 result.push(c.to_ascii_lowercase());
487 } else {
488 result.push(c.to_ascii_uppercase());
489 }
490 }
491 _ => {
492 result.push(c);
493 }
494 }
495
496 self.stats.total_typos += 1;
497 *self.stats.by_type.entry(typo_type).or_insert(0) += 1;
498 } else {
499 result.push(c);
500 }
501 }
502
503 result
504 }
505
506 fn select_typo_type<R: Rng>(&self, rng: &mut R) -> TypoType {
508 let options: Vec<(TypoType, f64)> = self
509 .config
510 .type_weights
511 .iter()
512 .map(|(&typo_type, &weight)| (typo_type, weight))
513 .collect();
514
515 if options.is_empty() {
516 return TypoType::Substitution;
517 }
518
519 *weighted_select(rng, &options)
520 }
521
522 pub fn is_protected(&self, field: &str) -> bool {
524 self.config.protected_fields.contains(&field.to_string())
525 }
526
527 pub fn stats(&self) -> &TypoStats {
529 &self.stats
530 }
531
532 pub fn reset_stats(&mut self) {
534 self.stats = TypoStats::default();
535 }
536}
537
/// Kinds of text-encoding defect that `introduce_encoding_issue` can simulate.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum EncodingIssue {
    /// Selects the mojibake transformation, which targets the characters
    /// `é`, `ñ`, `ü`, `ö`, `ä` and `€`.
    Mojibake,
    /// Each non-ASCII character is replaced by `?` with probability 0.5.
    MissingChars,
    /// A byte-order mark (U+FEFF) is prepended to the text.
    BOM,
    /// After each character, a NUL (U+0000) is appended with probability 0.01.
    ControlChars,
    /// Selects the HTML-entity transformation, which targets `&`, `<`, `>`,
    /// `"` and the space character.
    HTMLEntities,
}
552
553pub fn introduce_encoding_issue<R: Rng>(text: &str, issue: EncodingIssue, rng: &mut R) -> String {
555 match issue {
556 EncodingIssue::Mojibake => {
557 text.replace('é', "é")
559 .replace('ñ', "ñ")
560 .replace('ü', "ü")
561 .replace('ö', "ö")
562 .replace('ä', "ä")
563 .replace('€', "€")
564 }
565 EncodingIssue::MissingChars => text
566 .chars()
567 .map(|c| {
568 if !c.is_ascii() && rng.random::<f64>() < 0.5 {
569 '?'
570 } else {
571 c
572 }
573 })
574 .collect(),
575 EncodingIssue::BOM => {
576 format!("\u{FEFF}{}", text)
577 }
578 EncodingIssue::ControlChars => {
579 let mut result = String::new();
580 for c in text.chars() {
581 result.push(c);
582 if rng.random::<f64>() < 0.01 {
583 result.push('\u{0000}');
585 }
586 }
587 result
588 }
589 EncodingIssue::HTMLEntities => text
590 .replace('&', "&")
591 .replace('<', "<")
592 .replace('>', ">")
593 .replace('"', """)
594 .replace(' ', " "),
595 }
596}
597
598#[cfg(test)]
599#[allow(clippy::unwrap_used)]
600mod tests {
601 use super::*;
602 use datasynth_core::utils::seeded_rng;
603
604 #[test]
605 fn test_keyboard_nearby_keys() {
606 let keyboard = KeyboardLayout::qwerty();
607 let nearby = keyboard.get_nearby('e');
608 assert!(nearby.contains(&'w'));
609 assert!(nearby.contains(&'r'));
610 assert!(nearby.contains(&'s'));
611 assert!(nearby.contains(&'d'));
612 }
613
614 #[test]
615 fn test_typo_generation() {
616 let config = TypoConfig {
617 char_error_rate: 0.5, ..Default::default()
619 };
620
621 let mut generator = TypoGenerator::new(config);
622 let mut rng = seeded_rng(42, 0);
623
624 let text = "Hello World";
625 let _with_typos = generator.introduce_typos(text, &mut rng);
626
627 assert!(generator.stats().total_typos > 0);
629 }
630
631 #[test]
632 fn test_encoding_issues() {
633 let mut rng = seeded_rng(42, 0);
634
635 let text = "Héllo & Wörld";
636 let mojibake = introduce_encoding_issue(text, EncodingIssue::Mojibake, &mut rng);
637 assert!(mojibake.contains("é"));
638
639 let html = introduce_encoding_issue("A & B", EncodingIssue::HTMLEntities, &mut rng);
640 assert!(html.contains("&"));
641 }
642
643 #[test]
644 fn test_homophones() {
645 let homophones = Homophones::new();
646 let alternatives = homophones.get_homophones("their");
647 assert!(alternatives.is_some());
648 assert!(alternatives.unwrap().contains(&"there".to_string()));
649 }
650}