Skip to main content

datasynth_core/templates/realism/
descriptions.rs

1//! Description variation engine with abbreviations and typos.
2//!
3//! Provides natural language variations to make generated descriptions
4//! more realistic by applying abbreviations, case variations, and
5//! occasional typos.
6
7use rand::seq::IndexedRandom;
8use rand::Rng;
9use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11
12/// Configuration for description variations.
13#[derive(Debug, Clone, Serialize, Deserialize)]
14#[serde(default)]
15pub struct VariationConfig {
16    /// Rate of abbreviation application (0.0 - 1.0)
17    pub abbreviation_rate: f64,
18    /// Rate of typo injection (0.0 - 1.0)
19    pub typo_rate: f64,
20    /// Rate of case variations (0.0 - 1.0)
21    pub case_variation_rate: f64,
22    /// Enable word order variations
23    pub word_order_variation: bool,
24    /// Enable number format variations (e.g., "1000" vs "1,000")
25    pub number_format_variation: bool,
26}
27
28impl Default for VariationConfig {
29    fn default() -> Self {
30        Self {
31            abbreviation_rate: 0.25,
32            typo_rate: 0.01,
33            case_variation_rate: 0.05,
34            word_order_variation: false,
35            number_format_variation: true,
36        }
37    }
38}
39
/// Produces typos based on keyboard adjacency and lists of common misspellings.
#[derive(Clone, Debug)]
pub struct TypoGenerator {
    /// Lowercase key -> keys that sit next to it on a QWERTY layout.
    keyboard_neighbors: HashMap<char, Vec<char>>,
    /// `(correct, misspelled)` pairs where letters were swapped.
    common_transpositions: Vec<(&'static str, &'static str)>,
    /// `(correct, misspelled)` pairs where a letter was dropped.
    common_omissions: Vec<(&'static str, &'static str)>,
}
47
48impl Default for TypoGenerator {
49    fn default() -> Self {
50        Self::new()
51    }
52}
53
54impl TypoGenerator {
55    /// Create a new typo generator.
56    pub fn new() -> Self {
57        let mut keyboard_neighbors = HashMap::new();
58
59        // QWERTY keyboard layout neighbors
60        keyboard_neighbors.insert('q', vec!['w', 'a', '1', '2']);
61        keyboard_neighbors.insert('w', vec!['q', 'e', 'a', 's', '2', '3']);
62        keyboard_neighbors.insert('e', vec!['w', 'r', 's', 'd', '3', '4']);
63        keyboard_neighbors.insert('r', vec!['e', 't', 'd', 'f', '4', '5']);
64        keyboard_neighbors.insert('t', vec!['r', 'y', 'f', 'g', '5', '6']);
65        keyboard_neighbors.insert('y', vec!['t', 'u', 'g', 'h', '6', '7']);
66        keyboard_neighbors.insert('u', vec!['y', 'i', 'h', 'j', '7', '8']);
67        keyboard_neighbors.insert('i', vec!['u', 'o', 'j', 'k', '8', '9']);
68        keyboard_neighbors.insert('o', vec!['i', 'p', 'k', 'l', '9', '0']);
69        keyboard_neighbors.insert('p', vec!['o', 'l', '0']);
70        keyboard_neighbors.insert('a', vec!['q', 'w', 's', 'z']);
71        keyboard_neighbors.insert('s', vec!['a', 'w', 'e', 'd', 'z', 'x']);
72        keyboard_neighbors.insert('d', vec!['s', 'e', 'r', 'f', 'x', 'c']);
73        keyboard_neighbors.insert('f', vec!['d', 'r', 't', 'g', 'c', 'v']);
74        keyboard_neighbors.insert('g', vec!['f', 't', 'y', 'h', 'v', 'b']);
75        keyboard_neighbors.insert('h', vec!['g', 'y', 'u', 'j', 'b', 'n']);
76        keyboard_neighbors.insert('j', vec!['h', 'u', 'i', 'k', 'n', 'm']);
77        keyboard_neighbors.insert('k', vec!['j', 'i', 'o', 'l', 'm']);
78        keyboard_neighbors.insert('l', vec!['k', 'o', 'p']);
79        keyboard_neighbors.insert('z', vec!['a', 's', 'x']);
80        keyboard_neighbors.insert('x', vec!['z', 's', 'd', 'c']);
81        keyboard_neighbors.insert('c', vec!['x', 'd', 'f', 'v']);
82        keyboard_neighbors.insert('v', vec!['c', 'f', 'g', 'b']);
83        keyboard_neighbors.insert('b', vec!['v', 'g', 'h', 'n']);
84        keyboard_neighbors.insert('n', vec!['b', 'h', 'j', 'm']);
85        keyboard_neighbors.insert('m', vec!['n', 'j', 'k']);
86
87        Self {
88            keyboard_neighbors,
89            common_transpositions: vec![
90                ("the", "teh"),
91                ("and", "adn"),
92                ("for", "fro"),
93                ("that", "taht"),
94                ("with", "wiht"),
95                ("from", "form"),
96                ("have", "ahve"),
97                ("this", "tihs"),
98                ("will", "wil"),
99                ("your", "yoru"),
100                ("payment", "paymnet"),
101                ("invoice", "invocie"),
102                ("account", "acocunt"),
103                ("amount", "amuont"),
104                ("receipt", "reciept"),
105            ],
106            common_omissions: vec![
107                ("the", "te"),
108                ("and", "ad"),
109                ("payment", "paymet"),
110                ("invoice", "invoce"),
111                ("account", "accont"),
112                ("received", "recived"),
113                ("processing", "procesing"),
114                ("transaction", "transacion"),
115                ("reference", "referece"),
116                ("description", "descripton"),
117            ],
118        }
119    }
120
121    /// Introduce a typo into the text.
122    pub fn introduce_typo(&self, text: &str, rng: &mut impl Rng) -> String {
123        if text.is_empty() {
124            return text.to_string();
125        }
126
127        let typo_type = rng.random_range(0..5);
128        match typo_type {
129            0 => self.keyboard_typo(text, rng),
130            1 => self.transposition_typo(text, rng),
131            2 => self.omission_typo(text, rng),
132            3 => self.double_letter_typo(text, rng),
133            _ => self.common_word_typo(text, rng),
134        }
135    }
136
137    fn keyboard_typo(&self, text: &str, rng: &mut impl Rng) -> String {
138        let chars: Vec<char> = text.chars().collect();
139        if chars.is_empty() {
140            return text.to_string();
141        }
142
143        // Find alphabetic characters to potentially typo
144        let alpha_indices: Vec<usize> = chars
145            .iter()
146            .enumerate()
147            .filter(|(_, c)| c.is_ascii_alphabetic())
148            .map(|(i, _)| i)
149            .collect();
150
151        if alpha_indices.is_empty() {
152            return text.to_string();
153        }
154
155        let idx = *alpha_indices.choose(rng).expect("non-empty collection");
156        let original_char = chars[idx].to_ascii_lowercase();
157
158        if let Some(neighbors) = self.keyboard_neighbors.get(&original_char) {
159            if let Some(&neighbor) = neighbors.choose(rng) {
160                let mut result: Vec<char> = chars.clone();
161                result[idx] = if chars[idx].is_uppercase() {
162                    neighbor.to_ascii_uppercase()
163                } else {
164                    neighbor
165                };
166                return result.into_iter().collect();
167            }
168        }
169
170        text.to_string()
171    }
172
173    fn transposition_typo(&self, text: &str, rng: &mut impl Rng) -> String {
174        let chars: Vec<char> = text.chars().collect();
175        if chars.len() < 2 {
176            return text.to_string();
177        }
178
179        // Find valid positions for transposition
180        let valid_positions: Vec<usize> = (0..chars.len() - 1)
181            .filter(|&i| chars[i].is_ascii_alphabetic() && chars[i + 1].is_ascii_alphabetic())
182            .collect();
183
184        if valid_positions.is_empty() {
185            return text.to_string();
186        }
187
188        let idx = *valid_positions.choose(rng).expect("non-empty collection");
189        let mut result = chars.clone();
190        result.swap(idx, idx + 1);
191        result.into_iter().collect()
192    }
193
194    fn omission_typo(&self, text: &str, rng: &mut impl Rng) -> String {
195        let chars: Vec<char> = text.chars().collect();
196        if chars.len() < 3 {
197            return text.to_string();
198        }
199
200        // Find alphabetic characters to omit (not at word boundaries)
201        let valid_positions: Vec<usize> = (1..chars.len() - 1)
202            .filter(|&i| {
203                chars[i].is_ascii_alphabetic()
204                    && chars[i - 1].is_ascii_alphabetic()
205                    && chars[i + 1].is_ascii_alphabetic()
206            })
207            .collect();
208
209        if valid_positions.is_empty() {
210            return text.to_string();
211        }
212
213        let idx = *valid_positions.choose(rng).expect("non-empty collection");
214        let mut result = chars.clone();
215        result.remove(idx);
216        result.into_iter().collect()
217    }
218
219    fn double_letter_typo(&self, text: &str, rng: &mut impl Rng) -> String {
220        let chars: Vec<char> = text.chars().collect();
221        if chars.is_empty() {
222            return text.to_string();
223        }
224
225        // Find alphabetic characters to double
226        let valid_positions: Vec<usize> = chars
227            .iter()
228            .enumerate()
229            .filter(|(_, c)| c.is_ascii_alphabetic())
230            .map(|(i, _)| i)
231            .collect();
232
233        if valid_positions.is_empty() {
234            return text.to_string();
235        }
236
237        let idx = *valid_positions.choose(rng).expect("non-empty collection");
238        let mut result = chars.clone();
239        result.insert(idx, chars[idx]);
240        result.into_iter().collect()
241    }
242
243    fn common_word_typo(&self, text: &str, rng: &mut impl Rng) -> String {
244        // Try to apply a common transposition or omission
245        let text_lower = text.to_lowercase();
246
247        // Try transposition first
248        for (correct, typo) in &self.common_transpositions {
249            if text_lower.contains(*correct) && rng.random_bool(0.5) {
250                return text.replacen(correct, typo, 1);
251            }
252        }
253
254        // Try omission
255        for (correct, typo) in &self.common_omissions {
256            if text_lower.contains(*correct) {
257                return text.replacen(correct, typo, 1);
258            }
259        }
260
261        // Fallback to keyboard typo
262        self.keyboard_typo(text, rng)
263    }
264}
265
266/// Description variator with abbreviation and typo support.
267#[derive(Debug, Clone)]
268pub struct DescriptionVariator {
269    config: VariationConfig,
270    abbreviations: HashMap<&'static str, Vec<&'static str>>,
271    typo_gen: TypoGenerator,
272}
273
274impl Default for DescriptionVariator {
275    fn default() -> Self {
276        Self::new()
277    }
278}
279
280impl DescriptionVariator {
281    /// Create a new description variator with default settings.
282    pub fn new() -> Self {
283        Self::with_config(VariationConfig::default())
284    }
285
286    /// Create a new description variator with custom configuration.
287    pub fn with_config(config: VariationConfig) -> Self {
288        let mut abbreviations = HashMap::new();
289
290        // Common accounting/business abbreviations
291        abbreviations.insert("Invoice", vec!["Inv", "INV", "Inv."]);
292        abbreviations.insert("invoice", vec!["inv", "inv."]);
293        abbreviations.insert("Purchase Order", vec!["PO", "P.O.", "PurchOrd"]);
294        abbreviations.insert("purchase order", vec!["PO", "p.o.", "po"]);
295        abbreviations.insert("Accounts Payable", vec!["AP", "A/P", "Accts Pay"]);
296        abbreviations.insert("accounts payable", vec!["AP", "a/p", "accts pay"]);
297        abbreviations.insert("Accounts Receivable", vec!["AR", "A/R", "Accts Rec"]);
298        abbreviations.insert("accounts receivable", vec!["AR", "a/r", "accts rec"]);
299        abbreviations.insert("Payment", vec!["Pmt", "PMT", "Pymt"]);
300        abbreviations.insert("payment", vec!["pmt", "pymt"]);
301        abbreviations.insert("Receipt", vec!["Rcpt", "RCPT", "Rec"]);
302        abbreviations.insert("receipt", vec!["rcpt", "rec"]);
303        abbreviations.insert("Transaction", vec!["Trans", "TXN", "Trx"]);
304        abbreviations.insert("transaction", vec!["trans", "txn", "trx"]);
305        abbreviations.insert("Reference", vec!["Ref", "REF", "Ref."]);
306        abbreviations.insert("reference", vec!["ref", "ref."]);
307        abbreviations.insert("Number", vec!["No", "No.", "Num", "#"]);
308        abbreviations.insert("number", vec!["no", "no.", "num", "#"]);
309        abbreviations.insert("Department", vec!["Dept", "Dept.", "Dpt"]);
310        abbreviations.insert("department", vec!["dept", "dept.", "dpt"]);
311        abbreviations.insert("Company", vec!["Co", "Co.", "Corp"]);
312        abbreviations.insert("company", vec!["co", "co.", "corp"]);
313        abbreviations.insert("Corporation", vec!["Corp", "Corp."]);
314        abbreviations.insert("corporation", vec!["corp", "corp."]);
315        abbreviations.insert("Incorporated", vec!["Inc", "Inc."]);
316        abbreviations.insert("incorporated", vec!["inc", "inc."]);
317        abbreviations.insert("Limited", vec!["Ltd", "Ltd."]);
318        abbreviations.insert("limited", vec!["ltd", "ltd."]);
319        abbreviations.insert("Quarter", vec!["Q", "Qtr", "Qtr."]);
320        abbreviations.insert("quarter", vec!["q", "qtr", "qtr."]);
321        abbreviations.insert("Year", vec!["Yr", "YR"]);
322        abbreviations.insert("year", vec!["yr"]);
323        abbreviations.insert("Month", vec!["Mo", "Mo.", "Mth"]);
324        abbreviations.insert("month", vec!["mo", "mo.", "mth"]);
325        abbreviations.insert("January", vec!["Jan", "Jan."]);
326        abbreviations.insert("February", vec!["Feb", "Feb."]);
327        abbreviations.insert("March", vec!["Mar", "Mar."]);
328        abbreviations.insert("April", vec!["Apr", "Apr."]);
329        abbreviations.insert("May", vec!["May"]);
330        abbreviations.insert("June", vec!["Jun", "Jun."]);
331        abbreviations.insert("July", vec!["Jul", "Jul."]);
332        abbreviations.insert("August", vec!["Aug", "Aug."]);
333        abbreviations.insert("September", vec!["Sep", "Sept", "Sep."]);
334        abbreviations.insert("October", vec!["Oct", "Oct."]);
335        abbreviations.insert("November", vec!["Nov", "Nov."]);
336        abbreviations.insert("December", vec!["Dec", "Dec."]);
337        abbreviations.insert("Revenue", vec!["Rev", "REV"]);
338        abbreviations.insert("revenue", vec!["rev"]);
339        abbreviations.insert("Expense", vec!["Exp", "EXP"]);
340        abbreviations.insert("expense", vec!["exp"]);
341        abbreviations.insert("Accrual", vec!["Accr", "Accrl"]);
342        abbreviations.insert("accrual", vec!["accr", "accrl"]);
343        abbreviations.insert("Adjustment", vec!["Adj", "Adjmt"]);
344        abbreviations.insert("adjustment", vec!["adj", "adjmt"]);
345        abbreviations.insert("Depreciation", vec!["Depr", "Dep"]);
346        abbreviations.insert("depreciation", vec!["depr", "dep"]);
347        abbreviations.insert("Amortization", vec!["Amort", "Amor"]);
348        abbreviations.insert("amortization", vec!["amort", "amor"]);
349        abbreviations.insert("Recognition", vec!["Recog", "Rec"]);
350        abbreviations.insert("recognition", vec!["recog", "rec"]);
351        abbreviations.insert("Processing", vec!["Proc", "Process"]);
352        abbreviations.insert("processing", vec!["proc", "process"]);
353        abbreviations.insert("Services", vec!["Svcs", "Svc"]);
354        abbreviations.insert("services", vec!["svcs", "svc"]);
355        abbreviations.insert("Management", vec!["Mgmt", "Mgt"]);
356        abbreviations.insert("management", vec!["mgmt", "mgt"]);
357        abbreviations.insert("General", vec!["Gen", "Gen."]);
358        abbreviations.insert("general", vec!["gen", "gen."]);
359        abbreviations.insert("Administrative", vec!["Admin", "Adm"]);
360        abbreviations.insert("administrative", vec!["admin", "adm"]);
361        abbreviations.insert("Professional", vec!["Prof", "Profl"]);
362        abbreviations.insert("professional", vec!["prof", "profl"]);
363
364        Self {
365            config,
366            abbreviations,
367            typo_gen: TypoGenerator::new(),
368        }
369    }
370
371    /// Apply variations to a description.
372    pub fn apply(&self, description: &str, rng: &mut impl Rng) -> String {
373        let mut result = description.to_string();
374
375        // Apply abbreviations
376        if rng.random_bool(self.config.abbreviation_rate) {
377            result = self.apply_abbreviations(&result, rng);
378        }
379
380        // Apply case variations
381        if rng.random_bool(self.config.case_variation_rate) {
382            result = self.apply_case_variation(&result, rng);
383        }
384
385        // Apply typos (rare)
386        if rng.random_bool(self.config.typo_rate) {
387            result = self.typo_gen.introduce_typo(&result, rng);
388        }
389
390        result
391    }
392
393    /// Apply only abbreviations without other variations.
394    pub fn abbreviate(&self, description: &str, rng: &mut impl Rng) -> String {
395        self.apply_abbreviations(description, rng)
396    }
397
398    fn apply_abbreviations(&self, text: &str, rng: &mut impl Rng) -> String {
399        let mut result = text.to_string();
400
401        // Find and replace one or two terms
402        let max_replacements = rng.random_range(1..=2);
403        let mut replacements = 0;
404
405        for (full, abbrevs) in &self.abbreviations {
406            if result.contains(*full) && replacements < max_replacements {
407                if let Some(abbrev) = abbrevs.choose(rng) {
408                    result = result.replacen(*full, abbrev, 1);
409                    replacements += 1;
410                }
411            }
412        }
413
414        result
415    }
416
417    fn apply_case_variation(&self, text: &str, rng: &mut impl Rng) -> String {
418        let variation = rng.random_range(0..3);
419        match variation {
420            0 => text.to_uppercase(),
421            1 => text.to_lowercase(),
422            _ => {
423                // Title case variation - first letter of each word uppercase
424                text.split_whitespace()
425                    .map(|word| {
426                        let mut chars: Vec<char> = word.chars().collect();
427                        if let Some(first) = chars.first_mut() {
428                            *first = first.to_ascii_uppercase();
429                        }
430                        for c in chars.iter_mut().skip(1) {
431                            *c = c.to_ascii_lowercase();
432                        }
433                        chars.into_iter().collect::<String>()
434                    })
435                    .collect::<Vec<String>>()
436                    .join(" ")
437            }
438        }
439    }
440
441    /// Get the configuration.
442    pub fn config(&self) -> &VariationConfig {
443        &self.config
444    }
445}
446
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    //! Unit tests for the typo generator and the description variator.
    //!
    //! Assertions state properties that hold for every possible random draw,
    //! so they do not depend on the exact output stream of a particular RNG
    //! version.

    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    /// Config where only the stages with a non-zero rate can fire.
    fn rates(abbreviation_rate: f64, case_variation_rate: f64, typo_rate: f64) -> VariationConfig {
        VariationConfig {
            abbreviation_rate,
            typo_rate,
            case_variation_rate,
            ..Default::default()
        }
    }

    #[test]
    fn test_typo_generator_keyboard() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        // `gen` is a reserved keyword in the 2024 edition, hence the longer name.
        let typo_gen = TypoGenerator::new();

        let original = "payment";
        let typo = typo_gen.keyboard_typo(original, &mut rng);

        // A keyboard typo substitutes exactly one character.
        assert_eq!(typo.len(), original.len());
        // No key is listed as its own neighbor, so the text must differ.
        assert_ne!(typo, original);
    }

    #[test]
    fn test_typo_generator_transposition() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let typo_gen = TypoGenerator::new();

        let original = "payment";
        let typo = typo_gen.transposition_typo(original, &mut rng);

        assert_eq!(typo.len(), original.len());
        // "payment" has no doubled letters, so any adjacent swap is visible.
        assert_ne!(typo, original);

        // A transposition only reorders characters.
        let mut expected: Vec<char> = original.chars().collect();
        let mut actual: Vec<char> = typo.chars().collect();
        expected.sort_unstable();
        actual.sort_unstable();
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_description_variator_abbreviation() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        // Always abbreviate, never touch case or inject typos.
        let variator = DescriptionVariator::with_config(rates(1.0, 0.0, 0.0));

        let original = "Invoice for Purchase Order";
        let varied = variator.apply(original, &mut rng);

        // Both terms are in the dictionary and every short form is shorter
        // than its full term, so the text must change and shrink.
        assert_ne!(varied, original);
        assert!(varied.len() < original.len());
    }

    #[test]
    fn test_description_variator_no_change() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let variator = DescriptionVariator::with_config(rates(0.0, 0.0, 0.0));

        let original = "Regular description";
        let varied = variator.apply(original, &mut rng);
        assert_eq!(original, varied);
    }

    #[test]
    fn test_month_abbreviations() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let variator = DescriptionVariator::with_config(rates(1.0, 0.0, 0.0));

        let original = "Revenue for December 2024";
        let varied = variator.abbreviate(original, &mut rng);

        // "Revenue" and "December" both have real short forms, so at least
        // one of them must have been abbreviated.
        assert_ne!(varied, original);
        assert!(varied.len() < original.len());
        // Text outside the dictionary terms is left alone.
        assert!(varied.ends_with(" 2024"));
    }

    #[test]
    fn test_case_variation() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let variator = DescriptionVariator::with_config(rates(0.0, 1.0, 0.0));

        let original = "Invoice Payment";
        let varied = variator.apply(original, &mut rng);

        // Whichever style is drawn (the title-case style reproduces this
        // particular input exactly), only letter case may differ.
        assert!(varied.eq_ignore_ascii_case(original));
    }

    #[test]
    fn test_deterministic_variation() {
        let config = VariationConfig {
            abbreviation_rate: 0.5,
            typo_rate: 0.1,
            ..Default::default()
        };
        let variator = DescriptionVariator::with_config(config);

        let original = "Invoice for Services";

        let mut rng1 = ChaCha8Rng::seed_from_u64(12345);
        let mut rng2 = ChaCha8Rng::seed_from_u64(12345);

        let varied1 = variator.apply(original, &mut rng1);
        let varied2 = variator.apply(original, &mut rng2);

        // Same variator + same seed must give the same output.
        assert_eq!(varied1, varied2);
    }
}