1use rand::seq::IndexedRandom;
8use rand::Rng;
9use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11
/// Tunable knobs controlling how text descriptions are varied.
///
/// Because of `#[serde(default)]`, any field missing from the serialized form
/// is filled in from the [`Default`] implementation.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct VariationConfig {
    /// Probability that `DescriptionVariator::apply` runs its abbreviation pass.
    pub abbreviation_rate: f64,
    /// Probability that `DescriptionVariator::apply` injects a typo.
    pub typo_rate: f64,
    /// Probability that `DescriptionVariator::apply` changes the letter case.
    pub case_variation_rate: f64,
    /// Not read by any code visible in this file; presumably toggles word
    /// reordering elsewhere (TODO confirm against callers).
    pub word_order_variation: bool,
    /// Not read by any code visible in this file; presumably toggles number
    /// formatting variations elsewhere (TODO confirm against callers).
    pub number_format_variation: bool,
}
27
28impl Default for VariationConfig {
29 fn default() -> Self {
30 Self {
31 abbreviation_rate: 0.25,
32 typo_rate: 0.01,
33 case_variation_rate: 0.05,
34 word_order_variation: false,
35 number_format_variation: true,
36 }
37 }
38}
39
/// Produces plausible typing mistakes in short pieces of text.
#[derive(Debug, Clone)]
pub struct TypoGenerator {
    /// Lowercase key -> characters substituted for it by the keyboard-slip typo.
    keyboard_neighbors: HashMap<char, Vec<char>>,
    /// `(correct, misspelled)` pairs tried first by the common-word typo.
    common_transpositions: Vec<(&'static str, &'static str)>,
    /// `(correct, misspelled)` pairs tried second by the common-word typo.
    common_omissions: Vec<(&'static str, &'static str)>,
}
47
48impl Default for TypoGenerator {
49 fn default() -> Self {
50 Self::new()
51 }
52}
53
54impl TypoGenerator {
55 pub fn new() -> Self {
57 let mut keyboard_neighbors = HashMap::new();
58
59 keyboard_neighbors.insert('q', vec!['w', 'a', '1', '2']);
61 keyboard_neighbors.insert('w', vec!['q', 'e', 'a', 's', '2', '3']);
62 keyboard_neighbors.insert('e', vec!['w', 'r', 's', 'd', '3', '4']);
63 keyboard_neighbors.insert('r', vec!['e', 't', 'd', 'f', '4', '5']);
64 keyboard_neighbors.insert('t', vec!['r', 'y', 'f', 'g', '5', '6']);
65 keyboard_neighbors.insert('y', vec!['t', 'u', 'g', 'h', '6', '7']);
66 keyboard_neighbors.insert('u', vec!['y', 'i', 'h', 'j', '7', '8']);
67 keyboard_neighbors.insert('i', vec!['u', 'o', 'j', 'k', '8', '9']);
68 keyboard_neighbors.insert('o', vec!['i', 'p', 'k', 'l', '9', '0']);
69 keyboard_neighbors.insert('p', vec!['o', 'l', '0']);
70 keyboard_neighbors.insert('a', vec!['q', 'w', 's', 'z']);
71 keyboard_neighbors.insert('s', vec!['a', 'w', 'e', 'd', 'z', 'x']);
72 keyboard_neighbors.insert('d', vec!['s', 'e', 'r', 'f', 'x', 'c']);
73 keyboard_neighbors.insert('f', vec!['d', 'r', 't', 'g', 'c', 'v']);
74 keyboard_neighbors.insert('g', vec!['f', 't', 'y', 'h', 'v', 'b']);
75 keyboard_neighbors.insert('h', vec!['g', 'y', 'u', 'j', 'b', 'n']);
76 keyboard_neighbors.insert('j', vec!['h', 'u', 'i', 'k', 'n', 'm']);
77 keyboard_neighbors.insert('k', vec!['j', 'i', 'o', 'l', 'm']);
78 keyboard_neighbors.insert('l', vec!['k', 'o', 'p']);
79 keyboard_neighbors.insert('z', vec!['a', 's', 'x']);
80 keyboard_neighbors.insert('x', vec!['z', 's', 'd', 'c']);
81 keyboard_neighbors.insert('c', vec!['x', 'd', 'f', 'v']);
82 keyboard_neighbors.insert('v', vec!['c', 'f', 'g', 'b']);
83 keyboard_neighbors.insert('b', vec!['v', 'g', 'h', 'n']);
84 keyboard_neighbors.insert('n', vec!['b', 'h', 'j', 'm']);
85 keyboard_neighbors.insert('m', vec!['n', 'j', 'k']);
86
87 Self {
88 keyboard_neighbors,
89 common_transpositions: vec![
90 ("the", "teh"),
91 ("and", "adn"),
92 ("for", "fro"),
93 ("that", "taht"),
94 ("with", "wiht"),
95 ("from", "form"),
96 ("have", "ahve"),
97 ("this", "tihs"),
98 ("will", "wil"),
99 ("your", "yoru"),
100 ("payment", "paymnet"),
101 ("invoice", "invocie"),
102 ("account", "acocunt"),
103 ("amount", "amuont"),
104 ("receipt", "reciept"),
105 ],
106 common_omissions: vec![
107 ("the", "te"),
108 ("and", "ad"),
109 ("payment", "paymet"),
110 ("invoice", "invoce"),
111 ("account", "accont"),
112 ("received", "recived"),
113 ("processing", "procesing"),
114 ("transaction", "transacion"),
115 ("reference", "referece"),
116 ("description", "descripton"),
117 ],
118 }
119 }
120
121 pub fn introduce_typo(&self, text: &str, rng: &mut impl Rng) -> String {
123 if text.is_empty() {
124 return text.to_string();
125 }
126
127 let typo_type = rng.random_range(0..5);
128 match typo_type {
129 0 => self.keyboard_typo(text, rng),
130 1 => self.transposition_typo(text, rng),
131 2 => self.omission_typo(text, rng),
132 3 => self.double_letter_typo(text, rng),
133 _ => self.common_word_typo(text, rng),
134 }
135 }
136
137 fn keyboard_typo(&self, text: &str, rng: &mut impl Rng) -> String {
138 let chars: Vec<char> = text.chars().collect();
139 if chars.is_empty() {
140 return text.to_string();
141 }
142
143 let alpha_indices: Vec<usize> = chars
145 .iter()
146 .enumerate()
147 .filter(|(_, c)| c.is_ascii_alphabetic())
148 .map(|(i, _)| i)
149 .collect();
150
151 if alpha_indices.is_empty() {
152 return text.to_string();
153 }
154
155 let idx = *alpha_indices.choose(rng).expect("non-empty collection");
156 let original_char = chars[idx].to_ascii_lowercase();
157
158 if let Some(neighbors) = self.keyboard_neighbors.get(&original_char) {
159 if let Some(&neighbor) = neighbors.choose(rng) {
160 let mut result: Vec<char> = chars.clone();
161 result[idx] = if chars[idx].is_uppercase() {
162 neighbor.to_ascii_uppercase()
163 } else {
164 neighbor
165 };
166 return result.into_iter().collect();
167 }
168 }
169
170 text.to_string()
171 }
172
173 fn transposition_typo(&self, text: &str, rng: &mut impl Rng) -> String {
174 let chars: Vec<char> = text.chars().collect();
175 if chars.len() < 2 {
176 return text.to_string();
177 }
178
179 let valid_positions: Vec<usize> = (0..chars.len() - 1)
181 .filter(|&i| chars[i].is_ascii_alphabetic() && chars[i + 1].is_ascii_alphabetic())
182 .collect();
183
184 if valid_positions.is_empty() {
185 return text.to_string();
186 }
187
188 let idx = *valid_positions.choose(rng).expect("non-empty collection");
189 let mut result = chars.clone();
190 result.swap(idx, idx + 1);
191 result.into_iter().collect()
192 }
193
194 fn omission_typo(&self, text: &str, rng: &mut impl Rng) -> String {
195 let chars: Vec<char> = text.chars().collect();
196 if chars.len() < 3 {
197 return text.to_string();
198 }
199
200 let valid_positions: Vec<usize> = (1..chars.len() - 1)
202 .filter(|&i| {
203 chars[i].is_ascii_alphabetic()
204 && chars[i - 1].is_ascii_alphabetic()
205 && chars[i + 1].is_ascii_alphabetic()
206 })
207 .collect();
208
209 if valid_positions.is_empty() {
210 return text.to_string();
211 }
212
213 let idx = *valid_positions.choose(rng).expect("non-empty collection");
214 let mut result = chars.clone();
215 result.remove(idx);
216 result.into_iter().collect()
217 }
218
219 fn double_letter_typo(&self, text: &str, rng: &mut impl Rng) -> String {
220 let chars: Vec<char> = text.chars().collect();
221 if chars.is_empty() {
222 return text.to_string();
223 }
224
225 let valid_positions: Vec<usize> = chars
227 .iter()
228 .enumerate()
229 .filter(|(_, c)| c.is_ascii_alphabetic())
230 .map(|(i, _)| i)
231 .collect();
232
233 if valid_positions.is_empty() {
234 return text.to_string();
235 }
236
237 let idx = *valid_positions.choose(rng).expect("non-empty collection");
238 let mut result = chars.clone();
239 result.insert(idx, chars[idx]);
240 result.into_iter().collect()
241 }
242
243 fn common_word_typo(&self, text: &str, rng: &mut impl Rng) -> String {
244 let text_lower = text.to_lowercase();
246
247 for (correct, typo) in &self.common_transpositions {
249 if text_lower.contains(*correct) && rng.random_bool(0.5) {
250 return text.replacen(correct, typo, 1);
251 }
252 }
253
254 for (correct, typo) in &self.common_omissions {
256 if text_lower.contains(*correct) {
257 return text.replacen(correct, typo, 1);
258 }
259 }
260
261 self.keyboard_typo(text, rng)
263 }
264}
265
/// Applies abbreviations, case changes and typos to description strings
/// according to a [`VariationConfig`].
#[derive(Debug, Clone)]
pub struct DescriptionVariator {
    /// Probabilities controlling which variations `apply` performs.
    config: VariationConfig,
    /// Full term -> candidate short forms; keys are matched case-sensitively.
    abbreviations: HashMap<&'static str, Vec<&'static str>>,
    /// Source of the typos injected by `apply`.
    typo_gen: TypoGenerator,
}
273
274impl Default for DescriptionVariator {
275 fn default() -> Self {
276 Self::new()
277 }
278}
279
280impl DescriptionVariator {
281 pub fn new() -> Self {
283 Self::with_config(VariationConfig::default())
284 }
285
286 pub fn with_config(config: VariationConfig) -> Self {
288 let mut abbreviations = HashMap::new();
289
290 abbreviations.insert("Invoice", vec!["Inv", "INV", "Inv."]);
292 abbreviations.insert("invoice", vec!["inv", "inv."]);
293 abbreviations.insert("Purchase Order", vec!["PO", "P.O.", "PurchOrd"]);
294 abbreviations.insert("purchase order", vec!["PO", "p.o.", "po"]);
295 abbreviations.insert("Accounts Payable", vec!["AP", "A/P", "Accts Pay"]);
296 abbreviations.insert("accounts payable", vec!["AP", "a/p", "accts pay"]);
297 abbreviations.insert("Accounts Receivable", vec!["AR", "A/R", "Accts Rec"]);
298 abbreviations.insert("accounts receivable", vec!["AR", "a/r", "accts rec"]);
299 abbreviations.insert("Payment", vec!["Pmt", "PMT", "Pymt"]);
300 abbreviations.insert("payment", vec!["pmt", "pymt"]);
301 abbreviations.insert("Receipt", vec!["Rcpt", "RCPT", "Rec"]);
302 abbreviations.insert("receipt", vec!["rcpt", "rec"]);
303 abbreviations.insert("Transaction", vec!["Trans", "TXN", "Trx"]);
304 abbreviations.insert("transaction", vec!["trans", "txn", "trx"]);
305 abbreviations.insert("Reference", vec!["Ref", "REF", "Ref."]);
306 abbreviations.insert("reference", vec!["ref", "ref."]);
307 abbreviations.insert("Number", vec!["No", "No.", "Num", "#"]);
308 abbreviations.insert("number", vec!["no", "no.", "num", "#"]);
309 abbreviations.insert("Department", vec!["Dept", "Dept.", "Dpt"]);
310 abbreviations.insert("department", vec!["dept", "dept.", "dpt"]);
311 abbreviations.insert("Company", vec!["Co", "Co.", "Corp"]);
312 abbreviations.insert("company", vec!["co", "co.", "corp"]);
313 abbreviations.insert("Corporation", vec!["Corp", "Corp."]);
314 abbreviations.insert("corporation", vec!["corp", "corp."]);
315 abbreviations.insert("Incorporated", vec!["Inc", "Inc."]);
316 abbreviations.insert("incorporated", vec!["inc", "inc."]);
317 abbreviations.insert("Limited", vec!["Ltd", "Ltd."]);
318 abbreviations.insert("limited", vec!["ltd", "ltd."]);
319 abbreviations.insert("Quarter", vec!["Q", "Qtr", "Qtr."]);
320 abbreviations.insert("quarter", vec!["q", "qtr", "qtr."]);
321 abbreviations.insert("Year", vec!["Yr", "YR"]);
322 abbreviations.insert("year", vec!["yr"]);
323 abbreviations.insert("Month", vec!["Mo", "Mo.", "Mth"]);
324 abbreviations.insert("month", vec!["mo", "mo.", "mth"]);
325 abbreviations.insert("January", vec!["Jan", "Jan."]);
326 abbreviations.insert("February", vec!["Feb", "Feb."]);
327 abbreviations.insert("March", vec!["Mar", "Mar."]);
328 abbreviations.insert("April", vec!["Apr", "Apr."]);
329 abbreviations.insert("May", vec!["May"]);
330 abbreviations.insert("June", vec!["Jun", "Jun."]);
331 abbreviations.insert("July", vec!["Jul", "Jul."]);
332 abbreviations.insert("August", vec!["Aug", "Aug."]);
333 abbreviations.insert("September", vec!["Sep", "Sept", "Sep."]);
334 abbreviations.insert("October", vec!["Oct", "Oct."]);
335 abbreviations.insert("November", vec!["Nov", "Nov."]);
336 abbreviations.insert("December", vec!["Dec", "Dec."]);
337 abbreviations.insert("Revenue", vec!["Rev", "REV"]);
338 abbreviations.insert("revenue", vec!["rev"]);
339 abbreviations.insert("Expense", vec!["Exp", "EXP"]);
340 abbreviations.insert("expense", vec!["exp"]);
341 abbreviations.insert("Accrual", vec!["Accr", "Accrl"]);
342 abbreviations.insert("accrual", vec!["accr", "accrl"]);
343 abbreviations.insert("Adjustment", vec!["Adj", "Adjmt"]);
344 abbreviations.insert("adjustment", vec!["adj", "adjmt"]);
345 abbreviations.insert("Depreciation", vec!["Depr", "Dep"]);
346 abbreviations.insert("depreciation", vec!["depr", "dep"]);
347 abbreviations.insert("Amortization", vec!["Amort", "Amor"]);
348 abbreviations.insert("amortization", vec!["amort", "amor"]);
349 abbreviations.insert("Recognition", vec!["Recog", "Rec"]);
350 abbreviations.insert("recognition", vec!["recog", "rec"]);
351 abbreviations.insert("Processing", vec!["Proc", "Process"]);
352 abbreviations.insert("processing", vec!["proc", "process"]);
353 abbreviations.insert("Services", vec!["Svcs", "Svc"]);
354 abbreviations.insert("services", vec!["svcs", "svc"]);
355 abbreviations.insert("Management", vec!["Mgmt", "Mgt"]);
356 abbreviations.insert("management", vec!["mgmt", "mgt"]);
357 abbreviations.insert("General", vec!["Gen", "Gen."]);
358 abbreviations.insert("general", vec!["gen", "gen."]);
359 abbreviations.insert("Administrative", vec!["Admin", "Adm"]);
360 abbreviations.insert("administrative", vec!["admin", "adm"]);
361 abbreviations.insert("Professional", vec!["Prof", "Profl"]);
362 abbreviations.insert("professional", vec!["prof", "profl"]);
363
364 Self {
365 config,
366 abbreviations,
367 typo_gen: TypoGenerator::new(),
368 }
369 }
370
371 pub fn apply(&self, description: &str, rng: &mut impl Rng) -> String {
373 let mut result = description.to_string();
374
375 if rng.random_bool(self.config.abbreviation_rate) {
377 result = self.apply_abbreviations(&result, rng);
378 }
379
380 if rng.random_bool(self.config.case_variation_rate) {
382 result = self.apply_case_variation(&result, rng);
383 }
384
385 if rng.random_bool(self.config.typo_rate) {
387 result = self.typo_gen.introduce_typo(&result, rng);
388 }
389
390 result
391 }
392
393 pub fn abbreviate(&self, description: &str, rng: &mut impl Rng) -> String {
395 self.apply_abbreviations(description, rng)
396 }
397
398 fn apply_abbreviations(&self, text: &str, rng: &mut impl Rng) -> String {
399 let mut result = text.to_string();
400
401 let max_replacements = rng.random_range(1..=2);
403 let mut replacements = 0;
404
405 for (full, abbrevs) in &self.abbreviations {
406 if result.contains(*full) && replacements < max_replacements {
407 if let Some(abbrev) = abbrevs.choose(rng) {
408 result = result.replacen(*full, abbrev, 1);
409 replacements += 1;
410 }
411 }
412 }
413
414 result
415 }
416
417 fn apply_case_variation(&self, text: &str, rng: &mut impl Rng) -> String {
418 let variation = rng.random_range(0..3);
419 match variation {
420 0 => text.to_uppercase(),
421 1 => text.to_lowercase(),
422 _ => {
423 text.split_whitespace()
425 .map(|word| {
426 let mut chars: Vec<char> = word.chars().collect();
427 if let Some(first) = chars.first_mut() {
428 *first = first.to_ascii_uppercase();
429 }
430 for c in chars.iter_mut().skip(1) {
431 *c = c.to_ascii_lowercase();
432 }
433 chars.into_iter().collect::<String>()
434 })
435 .collect::<Vec<String>>()
436 .join(" ")
437 }
438 }
439 }
440
441 pub fn config(&self) -> &VariationConfig {
443 &self.config
444 }
445}
446
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    /// Builds a variator whose three rates are set explicitly.
    fn variator_with_rates(abbreviation: f64, typo: f64, case: f64) -> DescriptionVariator {
        DescriptionVariator::with_config(VariationConfig {
            abbreviation_rate: abbreviation,
            typo_rate: typo,
            case_variation_rate: case,
            ..Default::default()
        })
    }

    #[test]
    fn test_typo_generator_keyboard() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        // `gen` is a reserved keyword in the 2024 edition, hence the longer name.
        let generator = TypoGenerator::new();

        let original = "payment";
        let typo = generator.keyboard_typo(original, &mut rng);

        assert_eq!(typo.chars().count(), original.chars().count());
        // No key lists itself as a neighbour, so exactly one position changes.
        let changed = original
            .chars()
            .zip(typo.chars())
            .filter(|(before, after)| before != after)
            .count();
        assert_eq!(changed, 1);
    }

    #[test]
    fn test_typo_generator_transposition() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let generator = TypoGenerator::new();

        // No two adjacent letters of "payment" are equal, so any swap is visible.
        let original = "payment";
        let typo = generator.transposition_typo(original, &mut rng);

        assert_eq!(typo.len(), original.len());
        assert_ne!(typo, original);

        let mut expected: Vec<char> = original.chars().collect();
        let mut actual: Vec<char> = typo.chars().collect();
        expected.sort_unstable();
        actual.sort_unstable();
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_description_variator_abbreviation() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let variator = variator_with_rates(1.0, 0.0, 0.0);

        let original = "Invoice for Purchase Order";
        let varied = variator.apply(original, &mut rng);

        // Checking for "Inv" would be vacuous because "Invoice" contains it;
        // require that at least one full term has actually disappeared.
        assert_ne!(varied, original);
        assert!(!varied.contains("Invoice") || !varied.contains("Purchase Order"));
    }

    #[test]
    fn test_description_variator_no_change() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let variator = variator_with_rates(0.0, 0.0, 0.0);

        let original = "Regular description";
        let varied = variator.apply(original, &mut rng);
        assert_eq!(original, varied);
    }

    #[test]
    fn test_month_abbreviations() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let variator = variator_with_rates(1.0, 0.0, 0.0);

        let original = "Revenue for December 2024";
        let varied = variator.abbreviate(original, &mut rng);

        assert_ne!(varied, original);
        assert!(!varied.contains("Revenue") || !varied.contains("December"));
    }

    #[test]
    fn test_case_variation() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let variator = variator_with_rates(0.0, 0.0, 1.0);

        // Mixed-case input differs from all three possible outputs, so the
        // assertion holds whichever variation the RNG selects.
        let original = "invoice PAYMENT";
        let varied = variator.apply(original, &mut rng);

        let expected = ["INVOICE PAYMENT", "invoice payment", "Invoice Payment"];
        assert!(
            expected.contains(&varied.as_str()),
            "unexpected case variation: {varied}"
        );
    }

    #[test]
    fn test_deterministic_variation() {
        let config = VariationConfig {
            abbreviation_rate: 0.5,
            typo_rate: 0.1,
            ..Default::default()
        };
        let variator = DescriptionVariator::with_config(config);

        let original = "Invoice for Services";

        let mut rng1 = ChaCha8Rng::seed_from_u64(12345);
        let mut rng2 = ChaCha8Rng::seed_from_u64(12345);

        let varied1 = variator.apply(original, &mut rng1);
        let varied2 = variator.apply(original, &mut rng2);

        assert_eq!(varied1, varied2);
    }
}