Skip to main content

datasynth_core/templates/realism/
mod.rs

//! Enhanced realism module for synthetic data generation.
//!
//! This module provides sophisticated name, description, and metadata generation
//! with cultural awareness, industry-specific patterns, and natural variations.
//!
//! # Features
//!
//! - **Company Names**: Industry-specific naming patterns with legal suffixes
//! - **Vendor Names**: Spend-category-based vendor naming
//! - **Description Variations**: Abbreviations, typos, and natural language variation
//! - **User IDs**: Realistic corporate user ID patterns
//! - **Reference Formats**: ERP-style reference number generation
//! - **Addresses**: Multi-regional address formatting
14
15pub mod addresses;
16pub mod company_names;
17pub mod descriptions;
18pub mod reference_formats;
19pub mod user_ids;
20pub mod vendor_names;
21
22pub use addresses::{Address, AddressGenerator, AddressRegion, AddressStyle};
23pub use company_names::{CompanyNameGenerator, CompanyNameStyle, Industry, LegalSuffix};
24pub use descriptions::{DescriptionVariator, TypoGenerator, VariationConfig};
25pub use reference_formats::{EnhancedReferenceFormat, EnhancedReferenceGenerator, ReferenceStyle};
26pub use user_ids::{UserIdGenerator, UserIdPattern};
27pub use vendor_names::{SpendCategory, VendorNameGenerator, VendorProfile};
28
29use rand::Rng;
30use serde::{Deserialize, Serialize};
31
32/// Configuration for realism features.
33#[derive(Debug, Clone, Serialize, Deserialize)]
34#[serde(default)]
35pub struct RealismConfig {
36    /// Enable culturally-aware name generation
37    pub cultural_awareness: bool,
38    /// Enable industry-specific vendor naming
39    pub industry_vendor_names: bool,
40    /// Enable description variations (abbreviations, typos)
41    pub description_variations: bool,
42    /// Rate of abbreviation usage (0.0 - 1.0)
43    pub abbreviation_rate: f64,
44    /// Rate of typo injection (0.0 - 1.0)
45    pub typo_rate: f64,
46    /// Enable realistic reference number formats
47    pub realistic_references: bool,
48    /// Primary region for address/name generation
49    pub primary_region: AddressRegion,
50    /// Enable international diversity
51    pub international_diversity: bool,
52    /// Diversity index for international content (0.0 - 1.0)
53    pub diversity_index: f64,
54}
55
56impl Default for RealismConfig {
57    fn default() -> Self {
58        Self {
59            cultural_awareness: true,
60            industry_vendor_names: true,
61            description_variations: true,
62            abbreviation_rate: 0.25,
63            typo_rate: 0.01,
64            realistic_references: true,
65            primary_region: AddressRegion::NorthAmerica,
66            international_diversity: true,
67            diversity_index: 0.3,
68        }
69    }
70}
71
72/// Master realism generator that coordinates all sub-generators.
73#[derive(Debug, Clone)]
74pub struct RealismGenerator {
75    config: RealismConfig,
76    company_gen: CompanyNameGenerator,
77    vendor_gen: VendorNameGenerator,
78    description_var: DescriptionVariator,
79    user_id_gen: UserIdGenerator,
80    reference_gen: EnhancedReferenceGenerator,
81    address_gen: AddressGenerator,
82}
83
84impl RealismGenerator {
85    /// Create a new realism generator with default configuration.
86    pub fn new() -> Self {
87        Self::with_config(RealismConfig::default())
88    }
89
90    /// Create a new realism generator with custom configuration.
91    pub fn with_config(config: RealismConfig) -> Self {
92        let variation_config = VariationConfig {
93            abbreviation_rate: config.abbreviation_rate,
94            typo_rate: config.typo_rate,
95            case_variation_rate: 0.05,
96            ..Default::default()
97        };
98
99        Self {
100            company_gen: CompanyNameGenerator::new(),
101            vendor_gen: VendorNameGenerator::new(),
102            description_var: DescriptionVariator::with_config(variation_config),
103            user_id_gen: UserIdGenerator::new(),
104            reference_gen: EnhancedReferenceGenerator::new(),
105            address_gen: AddressGenerator::for_region(config.primary_region),
106            config,
107        }
108    }
109
110    /// Get the configuration.
111    pub fn config(&self) -> &RealismConfig {
112        &self.config
113    }
114
115    /// Generate a realistic company name.
116    pub fn generate_company_name(&self, industry: Industry, rng: &mut impl Rng) -> String {
117        self.company_gen.generate(industry, rng)
118    }
119
120    /// Generate a realistic vendor name for a spend category.
121    pub fn generate_vendor_name(&self, category: SpendCategory, rng: &mut impl Rng) -> String {
122        self.vendor_gen.generate(category, rng)
123    }
124
125    /// Apply variations to a description.
126    pub fn vary_description(&self, description: &str, rng: &mut impl Rng) -> String {
127        if self.config.description_variations {
128            self.description_var.apply(description, rng)
129        } else {
130            description.to_string()
131        }
132    }
133
134    /// Generate a realistic user ID.
135    pub fn generate_user_id(
136        &self,
137        first_name: &str,
138        last_name: &str,
139        index: usize,
140        rng: &mut impl Rng,
141    ) -> String {
142        self.user_id_gen.generate(first_name, last_name, index, rng)
143    }
144
145    /// Generate a reference number.
146    pub fn generate_reference(
147        &self,
148        format: EnhancedReferenceFormat,
149        year: i32,
150        rng: &mut impl Rng,
151    ) -> String {
152        self.reference_gen.generate(format, year, rng)
153    }
154
155    /// Generate an address.
156    pub fn generate_address(&self, rng: &mut impl Rng) -> Address {
157        self.address_gen.generate(rng)
158    }
159
160    /// Get the company name generator.
161    pub fn company_names(&self) -> &CompanyNameGenerator {
162        &self.company_gen
163    }
164
165    /// Get the vendor name generator.
166    pub fn vendor_names(&self) -> &VendorNameGenerator {
167        &self.vendor_gen
168    }
169
170    /// Get the description variator.
171    pub fn descriptions(&self) -> &DescriptionVariator {
172        &self.description_var
173    }
174
175    /// Get the user ID generator.
176    pub fn user_ids(&self) -> &UserIdGenerator {
177        &self.user_id_gen
178    }
179
180    /// Get the reference generator.
181    pub fn references(&self) -> &EnhancedReferenceGenerator {
182        &self.reference_gen
183    }
184
185    /// Get the address generator.
186    pub fn addresses(&self) -> &AddressGenerator {
187        &self.address_gen
188    }
189}
190
191impl Default for RealismGenerator {
192    fn default() -> Self {
193        Self::new()
194    }
195}
196
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    //! Smoke tests for the realism facade.
    //!
    //! Local bindings are named `generator` rather than `gen`, because `gen`
    //! is a reserved keyword starting with the Rust 2024 edition.

    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;

    /// The default generator exposes the default feature toggles.
    #[test]
    fn test_realism_generator_creation() {
        let generator = RealismGenerator::new();
        assert!(generator.config().cultural_awareness);
        assert!(generator.config().description_variations);
    }

    /// A custom configuration is stored unchanged.
    #[test]
    fn test_realism_generator_with_config() {
        let config = RealismConfig {
            abbreviation_rate: 0.5,
            typo_rate: 0.0,
            ..Default::default()
        };
        let generator = RealismGenerator::with_config(config);
        assert_eq!(generator.config().abbreviation_rate, 0.5);
        assert_eq!(generator.config().typo_rate, 0.0);
    }

    /// Company name generation yields a non-empty string.
    #[test]
    fn test_generate_company_name() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let generator = RealismGenerator::new();
        let name = generator.generate_company_name(Industry::Manufacturing, &mut rng);
        assert!(!name.is_empty());
    }

    /// Vendor name generation yields a non-empty string.
    #[test]
    fn test_generate_vendor_name() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let generator = RealismGenerator::new();
        let name = generator.generate_vendor_name(SpendCategory::OfficeSupplies, &mut rng);
        assert!(!name.is_empty());
    }

    /// With abbreviations always on and typos off, the description is either
    /// abbreviated or left untouched.
    #[test]
    fn test_vary_description() {
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let config = RealismConfig {
            abbreviation_rate: 1.0, // Always abbreviate
            typo_rate: 0.0,
            ..Default::default()
        };
        let generator = RealismGenerator::with_config(config);
        let varied = generator.vary_description("Invoice for Purchase Order", &mut rng);
        // Deliberately lenient: accept a known abbreviation or the unchanged input.
        assert!(
            varied.contains("Inv")
                || varied.contains("PO")
                || varied == "Invoice for Purchase Order"
        );
    }
}