mockforge_data/
generator.rs

1//! Data generator implementation
2
3use crate::Result;
4use crate::{
5    faker::EnhancedFaker,
6    rag::{RagConfig, RagEngine},
7    schema::SchemaDefinition,
8    DataConfig, GenerationResult,
9};
10use std::time::Instant;
11
12/// Data generator for creating synthetic datasets
13#[derive(Debug)]
14pub struct DataGenerator {
15    /// Schema definition
16    schema: SchemaDefinition,
17    /// Configuration
18    config: DataConfig,
19    /// Faker instance
20    faker: EnhancedFaker,
21    /// Seeded RNG if seed was provided
22    seeded_rng: Option<rand::rngs::StdRng>,
23    /// RAG engine for enhanced generation
24    rag_engine: Option<RagEngine>,
25}
26
27impl DataGenerator {
28    /// Create a new data generator
29    pub fn new(schema: SchemaDefinition, config: DataConfig) -> Result<Self> {
30        let faker = EnhancedFaker::new();
31        let seeded_rng = if let Some(seed) = config.seed {
32            use rand::SeedableRng;
33            Some(rand::rngs::StdRng::seed_from_u64(seed))
34        } else {
35            None
36        };
37
38        // Initialize RAG engine if enabled
39        let rag_engine = if config.rag_enabled {
40            let rag_config = RagConfig::default();
41            let mut engine = RagEngine::new(rag_config);
42            // Add schema to knowledge base
43            engine.add_schema(&schema)?;
44            Some(engine)
45        } else {
46            None
47        };
48
49        Ok(Self {
50            schema,
51            config,
52            faker,
53            seeded_rng,
54            rag_engine,
55        })
56    }
57
58    /// Generate data according to the configuration
59    pub async fn generate(&mut self) -> Result<GenerationResult> {
60        let start_time = Instant::now();
61
62        // Use RAG-enhanced generation if enabled
63        if let Some(rag_engine) = &mut self.rag_engine {
64            let data = rag_engine.generate_with_rag(&self.schema, &self.config).await?;
65            let generation_time = start_time.elapsed().as_millis();
66            Ok(GenerationResult::new(data, generation_time))
67        } else {
68            // Use standard faker-based generation
69            let mut data = Vec::with_capacity(self.config.rows);
70
71            for _ in 0..self.config.rows {
72                let row = self.schema.generate_row(&mut self.faker)?;
73                data.push(row);
74            }
75
76            let generation_time = start_time.elapsed().as_millis();
77            Ok(GenerationResult::new(data, generation_time))
78        }
79    }
80
81    /// Generate data with relationships resolved
82    pub async fn generate_with_relationships(
83        &mut self,
84        related_schemas: &[SchemaDefinition],
85    ) -> Result<GenerationResult> {
86        let start_time = Instant::now();
87
88        // Create a map of related schemas for lookup
89        let schema_map: std::collections::HashMap<String, &SchemaDefinition> =
90            related_schemas.iter().map(|s| (s.name.clone(), s)).collect();
91
92        let mut data = Vec::with_capacity(self.config.rows);
93
94        for _ in 0..self.config.rows {
95            let mut row = self.schema.generate_row(&mut self.faker)?;
96
97            // Resolve relationships
98            for relationship in self.schema.relationships.values() {
99                if let Some(target_schema) = schema_map.get(&relationship.target_schema) {
100                    // Generate a related row
101                    let related_row = target_schema.generate_row(&mut self.faker)?;
102
103                    // Extract the foreign key value
104                    if let Some(related_obj) = related_row.as_object() {
105                        if let Some(fk_value) = related_obj.get("id") {
106                            // Insert the foreign key into the current row
107                            if let Some(row_obj) = row.as_object_mut() {
108                                row_obj.insert(relationship.foreign_key.clone(), fk_value.clone());
109                            }
110                        }
111                    }
112                }
113            }
114
115            data.push(row);
116        }
117
118        let generation_time = start_time.elapsed().as_millis();
119
120        Ok(GenerationResult::new(data, generation_time))
121    }
122
123    /// Generate a single row
124    pub fn generate_single(&mut self) -> Result<serde_json::Value> {
125        self.schema.generate_row(&mut self.faker)
126    }
127
128    /// Get the schema being used
129    pub fn schema(&self) -> &SchemaDefinition {
130        &self.schema
131    }
132
133    /// Get the current configuration
134    pub fn config(&self) -> &DataConfig {
135        &self.config
136    }
137
138    /// Update configuration
139    pub fn update_config(&mut self, config: DataConfig) -> Result<()> {
140        self.config = config.clone();
141
142        // Re-seed if needed
143        if let Some(seed) = self.config.seed {
144            use rand::SeedableRng;
145            self.seeded_rng = Some(rand::rngs::StdRng::seed_from_u64(seed));
146        } else {
147            self.seeded_rng = None;
148        }
149
150        // Update RAG engine if RAG is enabled/disabled
151        if config.rag_enabled {
152            if self.rag_engine.is_none() {
153                let rag_config = RagConfig::default();
154                let mut engine = RagEngine::new(rag_config);
155                engine.add_schema(&self.schema)?;
156                self.rag_engine = Some(engine);
157            }
158        } else {
159            self.rag_engine = None;
160        }
161
162        Ok(())
163    }
164
165    /// Configure RAG settings
166    pub fn configure_rag(&mut self, rag_config: RagConfig) -> Result<()> {
167        if let Some(engine) = &mut self.rag_engine {
168            engine.update_config(rag_config);
169        } else {
170            let mut engine = RagEngine::new(rag_config);
171            engine.add_schema(&self.schema)?;
172            self.rag_engine = Some(engine);
173        }
174        Ok(())
175    }
176
177    /// Get RAG engine reference
178    pub fn rag_engine(&self) -> Option<&RagEngine> {
179        self.rag_engine.as_ref()
180    }
181
182    /// Get mutable RAG engine reference
183    pub fn rag_engine_mut(&mut self) -> Option<&mut RagEngine> {
184        self.rag_engine.as_mut()
185    }
186}
187
188/// Batch data generator for generating multiple datasets
189#[derive(Debug)]
190pub struct BatchGenerator {
191    /// Generators for different schemas
192    generators: Vec<DataGenerator>,
193    /// Global configuration
194    #[allow(dead_code)]
195    config: DataConfig,
196}
197
198impl BatchGenerator {
199    /// Create a new batch generator
200    pub fn new(schemas: Vec<SchemaDefinition>, config: DataConfig) -> Result<Self> {
201        let mut generators = Vec::new();
202
203        for schema in schemas {
204            let generator = DataGenerator::new(schema, config.clone())?;
205            generators.push(generator);
206        }
207
208        Ok(Self { generators, config })
209    }
210
211    /// Generate data for all schemas
212    pub async fn generate_batch(&mut self) -> Result<Vec<GenerationResult>> {
213        let mut results = Vec::new();
214
215        for generator in &mut self.generators {
216            let result = generator.generate().await?;
217            results.push(result);
218        }
219
220        Ok(results)
221    }
222
223    /// Generate data with cross-schema relationships
224    pub async fn generate_with_relationships(&mut self) -> Result<Vec<GenerationResult>> {
225        let mut results = Vec::new();
226        let schemas: Vec<SchemaDefinition> =
227            self.generators.iter().map(|g| g.schema().clone()).collect();
228
229        for generator in &mut self.generators {
230            let result = generator.generate_with_relationships(&schemas).await?;
231            results.push(result);
232        }
233
234        Ok(results)
235    }
236
237    /// Get all schemas
238    pub fn schemas(&self) -> Vec<&SchemaDefinition> {
239        self.generators.iter().map(|g| g.schema()).collect()
240    }
241}
242
243/// Utility functions for data generation
244pub mod utils {
245    use super::*;
246    use crate::Result;
247
248    /// Generate sample data from a simple schema definition
249    pub async fn generate_sample_data(
250        schema_name: &str,
251        fields: Vec<(&str, &str)>,
252        rows: usize,
253    ) -> Result<GenerationResult> {
254        let mut schema = SchemaDefinition::new(schema_name.to_string());
255
256        for (field_name, field_type) in fields {
257            let field =
258                crate::schema::FieldDefinition::new(field_name.to_string(), field_type.to_string());
259            schema = schema.with_field(field);
260        }
261
262        let config = DataConfig {
263            rows,
264            ..Default::default()
265        };
266
267        let mut generator = DataGenerator::new(schema, config)?;
268        generator.generate().await
269    }
270
271    /// Generate user data
272    pub async fn generate_users(count: usize) -> Result<GenerationResult> {
273        let schema = crate::schema::templates::user_schema();
274        let config = DataConfig {
275            rows: count,
276            ..Default::default()
277        };
278
279        let mut generator = DataGenerator::new(schema, config)?;
280        generator.generate().await
281    }
282
283    /// Generate product data
284    pub async fn generate_products(count: usize) -> Result<GenerationResult> {
285        let schema = crate::schema::templates::product_schema();
286        let config = DataConfig {
287            rows: count,
288            ..Default::default()
289        };
290
291        let mut generator = DataGenerator::new(schema, config)?;
292        generator.generate().await
293    }
294
295    /// Generate orders with user relationships
296    pub async fn generate_orders_with_users(
297        order_count: usize,
298        user_count: usize,
299    ) -> Result<Vec<GenerationResult>> {
300        let user_schema = crate::schema::templates::user_schema();
301        let order_schema = crate::schema::templates::order_schema();
302
303        let config = DataConfig {
304            rows: order_count,
305            ..Default::default()
306        };
307
308        let mut batch_generator = BatchGenerator::new(vec![user_schema, order_schema], config)?;
309
310        // Update the order generator to generate the right number of rows
311        if let Some(order_generator) = batch_generator.generators.get_mut(1) {
312            let order_config = DataConfig {
313                rows: order_count,
314                ..Default::default()
315            };
316            order_generator.update_config(order_config)?;
317        }
318
319        // Update the user generator to generate users
320        if let Some(user_generator) = batch_generator.generators.get_mut(0) {
321            let user_config = DataConfig {
322                rows: user_count,
323                ..Default::default()
324            };
325            user_generator.update_config(user_config)?;
326        }
327
328        batch_generator.generate_with_relationships().await
329    }
330}
331
#[cfg(test)]
mod tests {
    use super::*;
    use crate::schema::templates;

    /// Construction with the default configuration succeeds.
    #[test]
    fn test_data_generator_new() {
        let schema = templates::user_schema();
        let config = DataConfig::default();

        let result = DataGenerator::new(schema, config);
        assert!(result.is_ok());
    }

    /// A configured seed is kept in the config and produces a seeded RNG.
    #[test]
    fn test_data_generator_with_seed() {
        let schema = templates::user_schema();
        let config = DataConfig {
            rows: 10,
            seed: Some(42),
            ..Default::default()
        };

        let generator = DataGenerator::new(schema, config).unwrap();
        assert_eq!(generator.config().rows, 10);
        assert_eq!(generator.config().seed, Some(42));
        assert!(generator.seeded_rng.is_some());
    }

    /// A single schema yields a batch with a single generator.
    #[test]
    fn test_batch_generator_new() {
        let schemas = vec![templates::user_schema()];
        let config = DataConfig::default();

        let batch = BatchGenerator::new(schemas, config).unwrap();
        assert_eq!(batch.generators.len(), 1);
        assert_eq!(batch.schemas().len(), 1);
    }

    /// Each supplied schema gets its own generator.
    #[test]
    fn test_batch_generator_multiple_schemas() {
        let schemas = vec![templates::user_schema(), templates::product_schema()];
        let config = DataConfig::default();

        // Unwrapping surfaces the error value if construction ever fails.
        let batch = BatchGenerator::new(schemas, config).unwrap();
        assert_eq!(batch.generators.len(), 2);
        assert_eq!(batch.schemas().len(), 2);
    }

    /// `update_config` succeeds and the new configuration is the one in effect.
    #[test]
    fn test_data_generator_update_config() {
        let schema = templates::user_schema();
        let config = DataConfig::default();

        let mut generator = DataGenerator::new(schema, config).unwrap();

        let new_config = DataConfig {
            rows: 50,
            ..Default::default()
        };

        let result = generator.update_config(new_config);
        assert!(result.is_ok());
        assert_eq!(generator.config().rows, 50);
    }
}
396}