1use rand::Rng;
11use std::collections::HashMap;
12
/// The kinds of typo the generator knows how to inject.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TypoType {
    /// A character is replaced by a neighbouring key.
    Substitution,
    /// A character is swapped with the one that follows it.
    Transposition,
    /// An extra neighbouring-key character is added.
    Insertion,
    /// A character is dropped.
    Deletion,
    /// A character is typed twice.
    DoubleChar,
    /// The case of a character is flipped.
    CaseError,
    /// A whole word is replaced by a similar-sounding word.
    Homophone,
    /// A character is replaced by a visually similar one.
    OCRError,
}

impl TypoType {
    /// Returns every variant, in declaration order.
    pub fn all() -> Vec<Self> {
        [
            Self::Substitution,
            Self::Transposition,
            Self::Insertion,
            Self::Deletion,
            Self::DoubleChar,
            Self::CaseError,
            Self::Homophone,
            Self::OCRError,
        ]
        .to_vec()
    }
}
49
50#[derive(Debug, Clone)]
52pub struct TypoConfig {
53 pub char_error_rate: f64,
55 pub type_weights: HashMap<TypoType, f64>,
57 pub preserve_word_boundaries: bool,
59 pub max_typos_per_word: usize,
61 pub protected_fields: Vec<String>,
63}
64
65impl Default for TypoConfig {
66 fn default() -> Self {
67 let mut type_weights = HashMap::new();
68 type_weights.insert(TypoType::Substitution, 0.25);
69 type_weights.insert(TypoType::Transposition, 0.20);
70 type_weights.insert(TypoType::Insertion, 0.10);
71 type_weights.insert(TypoType::Deletion, 0.20);
72 type_weights.insert(TypoType::DoubleChar, 0.10);
73 type_weights.insert(TypoType::CaseError, 0.10);
74 type_weights.insert(TypoType::OCRError, 0.05);
75
76 Self {
77 char_error_rate: 0.005, type_weights,
79 preserve_word_boundaries: true,
80 max_typos_per_word: 2,
81 protected_fields: vec![
82 "document_number".to_string(),
83 "account_code".to_string(),
84 "company_code".to_string(),
85 "vendor_id".to_string(),
86 "customer_id".to_string(),
87 ],
88 }
89 }
90}
91
/// Key adjacency map used to choose plausible mistyped characters.
pub struct KeyboardLayout {
    /// Maps a lowercase key to the keys listed as adjacent to it.
    nearby_keys: HashMap<char, Vec<char>>,
}

impl Default for KeyboardLayout {
    /// The default layout is QWERTY.
    fn default() -> Self {
        Self::qwerty()
    }
}

impl KeyboardLayout {
    /// Builds the adjacency map for the letter and digit keys of a QWERTY keyboard.
    pub fn qwerty() -> Self {
        // Each entry is (key, adjacent keys); the order of the adjacent keys is preserved.
        const ADJACENCY: &[(char, &str)] = &[
            // Top letter row.
            ('q', "wa12"),
            ('w', "qeas23"),
            ('e', "wrsd34"),
            ('r', "etdf45"),
            ('t', "ryfg56"),
            ('y', "tugh67"),
            ('u', "yihj78"),
            ('i', "uojk89"),
            ('o', "ipkl90"),
            ('p', "ol0"),
            // Home row.
            ('a', "qwsz"),
            ('s', "awedzx"),
            ('d', "serfxc"),
            ('f', "drtgcv"),
            ('g', "ftyhvb"),
            ('h', "gyujbn"),
            ('j', "huiknm"),
            ('k', "jiolm"),
            ('l', "kop"),
            // Bottom row.
            ('z', "asx"),
            ('x', "zsdc"),
            ('c', "xdfv"),
            ('v', "cfgb"),
            ('b', "vghn"),
            ('n', "bhjm"),
            ('m', "njk"),
            // Digit row.
            ('1', "2q"),
            ('2', "13qw"),
            ('3', "24we"),
            ('4', "35er"),
            ('5', "46rt"),
            ('6', "57ty"),
            ('7', "68yu"),
            ('8', "79ui"),
            ('9', "80io"),
            ('0', "9op"),
        ];

        let nearby_keys: HashMap<char, Vec<char>> = ADJACENCY
            .iter()
            .map(|&(key, neighbours)| (key, neighbours.chars().collect()))
            .collect();

        Self { nearby_keys }
    }

    /// Returns the keys adjacent to `c`.
    ///
    /// The lookup is done on the ASCII-lowercased character, so the result for an uppercase
    /// letter is the (lowercase) neighbour list of its lowercase form. A character that is
    /// not in the map yields a one-element list containing `c` itself.
    pub fn get_nearby(&self, c: char) -> Vec<char> {
        match self.nearby_keys.get(&c.to_ascii_lowercase()) {
            Some(neighbours) => neighbours.clone(),
            None => vec![c],
        }
    }
}
164
/// Table of characters that are listed as look-alikes of one another.
pub struct OCRConfusions {
    /// Maps a character to the characters it may be replaced with.
    confusions: HashMap<char, Vec<char>>,
}

impl Default for OCRConfusions {
    /// Equivalent to [`OCRConfusions::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl OCRConfusions {
    /// Builds the built-in confusion table.
    pub fn new() -> Self {
        // Each entry is (character, look-alikes); the order of the look-alikes is preserved.
        const LOOK_ALIKES: &[(char, &str)] = &[
            ('0', "OoQD"),
            ('O', "0QDo"),
            ('o', "0Oae"),
            ('1', "lIi|7"),
            ('l', "1Ii|"),
            ('I', "1li|"),
            ('i', "1lIj"),
            ('5', "Ss"),
            ('S', "5s8"),
            ('s', "5Sz"),
            ('8', "B&S"),
            ('B', "8RD"),
            ('6', "Gb"),
            ('G', "6CO"),
            ('2', "Zz"),
            ('Z', "2z7"),
            ('z', "2Zs"),
            ('n', "mhr"),
            ('m', "nr"),
            ('h', "nbk"),
            ('c', "eo("),
            ('e', "cao"),
            ('a', "eod"),
            ('d', "aoc"),
            ('g', "q9a"),
            ('q', "g9p"),
            ('9', "gq"),
            ('v', "uwy"),
            ('u', "vnw"),
            ('w', "vux"),
            ('y', "vuj"),
            ('f', "tr"),
            ('t', "fl+"),
            ('r', "nf"),
        ];

        let confusions: HashMap<char, Vec<char>> = LOOK_ALIKES
            .iter()
            .map(|&(glyph, alikes)| (glyph, alikes.chars().collect()))
            .collect();

        Self { confusions }
    }

    /// Returns the look-alikes of `c`.
    ///
    /// The lookup is case-sensitive. A character that is not in the table yields a
    /// one-element list containing `c` itself.
    pub fn get_confusions(&self, c: char) -> Vec<char> {
        match self.confusions.get(&c) {
            Some(alikes) => alikes.clone(),
            None => vec![c],
        }
    }
}
226
/// Dictionary of words and the similar words they may be confused with.
pub struct Homophones {
    /// Maps a lowercase word to its alternatives.
    homophones: HashMap<String, Vec<String>>,
}

impl Default for Homophones {
    /// Equivalent to [`Homophones::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl Homophones {
    /// Builds the built-in dictionary.
    pub fn new() -> Self {
        // Each entry is (word, alternatives); the order of the alternatives is preserved.
        const ENTRIES: &[(&str, &[&str])] = &[
            ("to", &["two", "too"]),
            ("two", &["to", "too"]),
            ("their", &["there", "they're"]),
            ("there", &["their", "they're"]),
            ("its", &["it's"]),
            ("your", &["you're"]),
            ("than", &["then"]),
            ("then", &["than"]),
            ("accept", &["except"]),
            ("affect", &["effect"]),
            ("effect", &["affect"]),
            ("capital", &["capitol"]),
            ("principal", &["principle"]),
            ("compliment", &["complement"]),
            ("stationary", &["stationery"]),
            ("advice", &["advise"]),
            ("loss", &["lost", "lose"]),
        ];

        let homophones: HashMap<String, Vec<String>> = ENTRIES
            .iter()
            .map(|&(word, alternatives)| {
                let owned = alternatives.iter().map(|alt| (*alt).to_string()).collect();
                (word.to_string(), owned)
            })
            .collect();

        Self { homophones }
    }

    /// Looks up the alternatives for `word`, ignoring case.
    ///
    /// Returns `None` when the lowercased word has no entry.
    pub fn get_homophones(&self, word: &str) -> Option<&Vec<String>> {
        let key = word.to_lowercase();
        self.homophones.get(key.as_str())
    }
}
280
281pub struct TypoGenerator {
283 config: TypoConfig,
284 keyboard: KeyboardLayout,
285 ocr: OCRConfusions,
286 homophones: Homophones,
287 stats: TypoStats,
288}
289
290#[derive(Debug, Clone, Default)]
292pub struct TypoStats {
293 pub total_characters: usize,
294 pub total_typos: usize,
295 pub by_type: HashMap<TypoType, usize>,
296 pub total_words: usize,
297 pub words_with_typos: usize,
298}
299
300impl TypoGenerator {
301 pub fn new(config: TypoConfig) -> Self {
303 Self {
304 config,
305 keyboard: KeyboardLayout::default(),
306 ocr: OCRConfusions::default(),
307 homophones: Homophones::default(),
308 stats: TypoStats::default(),
309 }
310 }
311
312 pub fn introduce_typos<R: Rng>(&mut self, text: &str, rng: &mut R) -> String {
314 if self.config.preserve_word_boundaries {
315 self.introduce_typos_by_word(text, rng)
316 } else {
317 self.introduce_typos_by_char(text, rng)
318 }
319 }
320
321 fn introduce_typos_by_word<R: Rng>(&mut self, text: &str, rng: &mut R) -> String {
323 let mut result = String::new();
324 let chars = text.chars().peekable();
325 let mut current_word = String::new();
326
327 for c in chars {
328 if c.is_alphanumeric() {
329 current_word.push(c);
330 } else {
331 if !current_word.is_empty() {
333 self.stats.total_words += 1;
334 let processed = self.process_word(¤t_word, rng);
335 if processed != current_word {
336 self.stats.words_with_typos += 1;
337 }
338 result.push_str(&processed);
339 current_word.clear();
340 }
341 result.push(c);
342 }
343 }
344
345 if !current_word.is_empty() {
347 self.stats.total_words += 1;
348 let processed = self.process_word(¤t_word, rng);
349 if processed != current_word {
350 self.stats.words_with_typos += 1;
351 }
352 result.push_str(&processed);
353 }
354
355 result
356 }
357
358 fn process_word<R: Rng>(&mut self, word: &str, rng: &mut R) -> String {
360 if let Some(homophones) = self.homophones.get_homophones(word) {
362 if rng.gen::<f64>() < self.config.char_error_rate * 10.0 {
363 self.stats.total_typos += 1;
365 *self.stats.by_type.entry(TypoType::Homophone).or_insert(0) += 1;
366 return homophones[rng.gen_range(0..homophones.len())].clone();
367 }
368 }
369
370 let mut result: Vec<char> = word.chars().collect();
371 let mut typos_in_word = 0;
372 let mut i = 0;
373
374 while i < result.len() {
375 if typos_in_word >= self.config.max_typos_per_word {
376 break;
377 }
378
379 self.stats.total_characters += 1;
380
381 if rng.gen::<f64>() < self.config.char_error_rate {
382 let typo_type = self.select_typo_type(rng);
383 let c = result[i];
384
385 match typo_type {
386 TypoType::Substitution => {
387 let nearby = self.keyboard.get_nearby(c);
388 if !nearby.is_empty() {
389 result[i] = nearby[rng.gen_range(0..nearby.len())];
390 }
391 }
392 TypoType::Transposition => {
393 if i + 1 < result.len() {
394 result.swap(i, i + 1);
395 }
396 }
397 TypoType::Deletion => {
398 if result.len() > 1 {
399 result.remove(i);
400 self.stats.total_typos += 1;
403 *self.stats.by_type.entry(typo_type).or_insert(0) += 1;
404 typos_in_word += 1;
405 continue;
406 }
407 }
408 TypoType::Insertion => {
409 let nearby = self.keyboard.get_nearby(c);
410 if !nearby.is_empty() {
411 result.insert(i, nearby[rng.gen_range(0..nearby.len())]);
412 i += 1;
414 }
415 }
416 TypoType::DoubleChar => {
417 result.insert(i, c);
418 i += 1;
420 }
421 TypoType::CaseError => {
422 if c.is_uppercase() {
423 result[i] = c.to_ascii_lowercase();
424 } else {
425 result[i] = c.to_ascii_uppercase();
426 }
427 }
428 TypoType::OCRError => {
429 let confusions = self.ocr.get_confusions(c);
430 if !confusions.is_empty() {
431 result[i] = confusions[rng.gen_range(0..confusions.len())];
432 }
433 }
434 TypoType::Homophone => {
435 }
437 }
438
439 self.stats.total_typos += 1;
440 *self.stats.by_type.entry(typo_type).or_insert(0) += 1;
441 typos_in_word += 1;
442 }
443 i += 1;
444 }
445
446 result.into_iter().collect()
447 }
448
449 fn introduce_typos_by_char<R: Rng>(&mut self, text: &str, rng: &mut R) -> String {
451 let mut result = String::new();
452
453 for c in text.chars() {
454 self.stats.total_characters += 1;
455
456 if c.is_alphanumeric() && rng.gen::<f64>() < self.config.char_error_rate {
457 let typo_type = self.select_typo_type(rng);
458
459 match typo_type {
460 TypoType::Substitution => {
461 let nearby = self.keyboard.get_nearby(c);
462 if !nearby.is_empty() {
463 result.push(nearby[rng.gen_range(0..nearby.len())]);
464 } else {
465 result.push(c);
466 }
467 }
468 TypoType::Deletion => {
469 }
471 TypoType::Insertion => {
472 result.push(c);
473 let nearby = self.keyboard.get_nearby(c);
474 if !nearby.is_empty() {
475 result.push(nearby[rng.gen_range(0..nearby.len())]);
476 }
477 }
478 TypoType::DoubleChar => {
479 result.push(c);
480 result.push(c);
481 }
482 TypoType::CaseError => {
483 if c.is_uppercase() {
484 result.push(c.to_ascii_lowercase());
485 } else {
486 result.push(c.to_ascii_uppercase());
487 }
488 }
489 _ => {
490 result.push(c);
491 }
492 }
493
494 self.stats.total_typos += 1;
495 *self.stats.by_type.entry(typo_type).or_insert(0) += 1;
496 } else {
497 result.push(c);
498 }
499 }
500
501 result
502 }
503
504 fn select_typo_type<R: Rng>(&self, rng: &mut R) -> TypoType {
506 let total_weight: f64 = self.config.type_weights.values().sum();
507 let mut random_weight = rng.gen::<f64>() * total_weight;
508
509 for (typo_type, weight) in &self.config.type_weights {
510 random_weight -= weight;
511 if random_weight <= 0.0 {
512 return *typo_type;
513 }
514 }
515
516 TypoType::Substitution }
518
519 pub fn is_protected(&self, field: &str) -> bool {
521 self.config.protected_fields.contains(&field.to_string())
522 }
523
524 pub fn stats(&self) -> &TypoStats {
526 &self.stats
527 }
528
529 pub fn reset_stats(&mut self) {
531 self.stats = TypoStats::default();
532 }
533}
534
535#[derive(Debug, Clone, Copy, PartialEq)]
537pub enum EncodingIssue {
538 Mojibake,
540 MissingChars,
542 BOM,
544 ControlChars,
546 HTMLEntities,
548}
549
550pub fn introduce_encoding_issue<R: Rng>(text: &str, issue: EncodingIssue, rng: &mut R) -> String {
552 match issue {
553 EncodingIssue::Mojibake => {
554 text.replace('é', "é")
556 .replace('ñ', "ñ")
557 .replace('ü', "ü")
558 .replace('ö', "ö")
559 .replace('ä', "ä")
560 .replace('€', "€")
561 }
562 EncodingIssue::MissingChars => text
563 .chars()
564 .map(|c| {
565 if !c.is_ascii() && rng.gen::<f64>() < 0.5 {
566 '?'
567 } else {
568 c
569 }
570 })
571 .collect(),
572 EncodingIssue::BOM => {
573 format!("\u{FEFF}{}", text)
574 }
575 EncodingIssue::ControlChars => {
576 let mut result = String::new();
577 for c in text.chars() {
578 result.push(c);
579 if rng.gen::<f64>() < 0.01 {
580 result.push('\u{0000}');
582 }
583 }
584 result
585 }
586 EncodingIssue::HTMLEntities => text
587 .replace('&', "&")
588 .replace('<', "<")
589 .replace('>', ">")
590 .replace('"', """)
591 .replace(' ', " "),
592 }
593}
594
595#[cfg(test)]
596mod tests {
597 use super::*;
598 use rand::SeedableRng;
599 use rand_chacha::ChaCha8Rng;
600
601 #[test]
602 fn test_keyboard_nearby_keys() {
603 let keyboard = KeyboardLayout::qwerty();
604 let nearby = keyboard.get_nearby('e');
605 assert!(nearby.contains(&'w'));
606 assert!(nearby.contains(&'r'));
607 assert!(nearby.contains(&'s'));
608 assert!(nearby.contains(&'d'));
609 }
610
611 #[test]
612 fn test_typo_generation() {
613 let config = TypoConfig {
614 char_error_rate: 0.5, ..Default::default()
616 };
617
618 let mut generator = TypoGenerator::new(config);
619 let mut rng = ChaCha8Rng::seed_from_u64(42);
620
621 let text = "Hello World";
622 let _with_typos = generator.introduce_typos(text, &mut rng);
623
624 assert!(generator.stats().total_typos > 0);
626 }
627
628 #[test]
629 fn test_encoding_issues() {
630 let mut rng = ChaCha8Rng::seed_from_u64(42);
631
632 let text = "Héllo & Wörld";
633 let mojibake = introduce_encoding_issue(text, EncodingIssue::Mojibake, &mut rng);
634 assert!(mojibake.contains("é"));
635
636 let html = introduce_encoding_issue("A & B", EncodingIssue::HTMLEntities, &mut rng);
637 assert!(html.contains("&"));
638 }
639
640 #[test]
641 fn test_homophones() {
642 let homophones = Homophones::new();
643 let alternatives = homophones.get_homophones("their");
644 assert!(alternatives.is_some());
645 assert!(alternatives.unwrap().contains(&"there".to_string()));
646 }
647}