Skip to main content

mockforge_data/
generator.rs

1//! Data generator implementation
2
3use crate::Result;
4use crate::{
5    faker::EnhancedFaker,
6    rag::{RagConfig, RagEngine},
7    schema::SchemaDefinition,
8    DataConfig, GenerationResult,
9};
10use std::time::Instant;
11
12/// Data generator for creating synthetic datasets
13#[derive(Debug)]
14pub struct DataGenerator {
15    /// Schema definition
16    schema: SchemaDefinition,
17    /// Configuration
18    config: DataConfig,
19    /// Faker instance
20    faker: EnhancedFaker,
21    /// Seeded RNG if seed was provided
22    seeded_rng: Option<rand::rngs::StdRng>,
23    /// RAG engine for enhanced generation
24    rag_engine: Option<RagEngine>,
25}
26
27impl DataGenerator {
28    /// Create a new data generator
29    pub fn new(schema: SchemaDefinition, config: DataConfig) -> Result<Self> {
30        let faker = EnhancedFaker::new();
31        let seeded_rng = if let Some(seed) = config.seed {
32            use rand::SeedableRng;
33            Some(rand::rngs::StdRng::seed_from_u64(seed))
34        } else {
35            None
36        };
37
38        // Initialize RAG engine if enabled
39        let rag_engine = if config.rag_enabled {
40            let rag_config = RagConfig::default();
41            let mut engine = RagEngine::new(rag_config);
42            // Add schema to knowledge base
43            engine.add_schema(&schema)?;
44            Some(engine)
45        } else {
46            None
47        };
48
49        Ok(Self {
50            schema,
51            config,
52            faker,
53            seeded_rng,
54            rag_engine,
55        })
56    }
57
58    /// Generate data according to the configuration
59    pub async fn generate(&mut self) -> Result<GenerationResult> {
60        let start_time = Instant::now();
61
62        // Use RAG-enhanced generation if enabled
63        if let Some(rag_engine) = &mut self.rag_engine {
64            let data = rag_engine.generate_with_rag(&self.schema, &self.config).await?;
65            let generation_time = start_time.elapsed().as_millis();
66            Ok(GenerationResult::new(data, generation_time))
67        } else {
68            // Use standard faker-based generation
69            let mut data = Vec::with_capacity(self.config.rows);
70
71            for _ in 0..self.config.rows {
72                let row = self.schema.generate_row(&mut self.faker)?;
73                data.push(row);
74            }
75
76            let generation_time = start_time.elapsed().as_millis();
77            Ok(GenerationResult::new(data, generation_time))
78        }
79    }
80
81    /// Generate data with relationships resolved
82    pub async fn generate_with_relationships(
83        &mut self,
84        related_schemas: &[SchemaDefinition],
85    ) -> Result<GenerationResult> {
86        let start_time = Instant::now();
87
88        // Create a map of related schemas for lookup
89        let schema_map: std::collections::HashMap<String, &SchemaDefinition> =
90            related_schemas.iter().map(|s| (s.name.clone(), s)).collect();
91
92        let mut data = Vec::with_capacity(self.config.rows);
93
94        for _ in 0..self.config.rows {
95            let mut row = self.schema.generate_row(&mut self.faker)?;
96
97            // Resolve relationships
98            for relationship in self.schema.relationships.values() {
99                if let Some(target_schema) = schema_map.get(&relationship.target_schema) {
100                    // Generate a related row
101                    let related_row = target_schema.generate_row(&mut self.faker)?;
102
103                    // Extract the foreign key value
104                    if let Some(related_obj) = related_row.as_object() {
105                        if let Some(fk_value) = related_obj.get("id") {
106                            // Insert the foreign key into the current row
107                            if let Some(row_obj) = row.as_object_mut() {
108                                row_obj.insert(relationship.foreign_key.clone(), fk_value.clone());
109                            }
110                        }
111                    }
112                }
113            }
114
115            data.push(row);
116        }
117
118        let generation_time = start_time.elapsed().as_millis();
119
120        Ok(GenerationResult::new(data, generation_time))
121    }
122
123    /// Generate a single row
124    pub fn generate_single(&mut self) -> Result<serde_json::Value> {
125        self.schema.generate_row(&mut self.faker)
126    }
127
128    /// Get the schema being used
129    pub fn schema(&self) -> &SchemaDefinition {
130        &self.schema
131    }
132
133    /// Get the current configuration
134    pub fn config(&self) -> &DataConfig {
135        &self.config
136    }
137
138    /// Update configuration
139    pub fn update_config(&mut self, config: DataConfig) -> Result<()> {
140        self.config = config.clone();
141
142        // Re-seed if needed
143        if let Some(seed) = self.config.seed {
144            use rand::SeedableRng;
145            self.seeded_rng = Some(rand::rngs::StdRng::seed_from_u64(seed));
146        } else {
147            self.seeded_rng = None;
148        }
149
150        // Update RAG engine if RAG is enabled/disabled
151        if config.rag_enabled {
152            if self.rag_engine.is_none() {
153                let rag_config = RagConfig::default();
154                let mut engine = RagEngine::new(rag_config);
155                engine.add_schema(&self.schema)?;
156                self.rag_engine = Some(engine);
157            }
158        } else {
159            self.rag_engine = None;
160        }
161
162        Ok(())
163    }
164
165    /// Configure RAG settings
166    pub fn configure_rag(&mut self, rag_config: RagConfig) -> Result<()> {
167        if let Some(engine) = &mut self.rag_engine {
168            engine.update_config(rag_config);
169        } else {
170            let mut engine = RagEngine::new(rag_config);
171            engine.add_schema(&self.schema)?;
172            self.rag_engine = Some(engine);
173        }
174        Ok(())
175    }
176
177    /// Get RAG engine reference
178    pub fn rag_engine(&self) -> Option<&RagEngine> {
179        self.rag_engine.as_ref()
180    }
181
182    /// Get mutable RAG engine reference
183    pub fn rag_engine_mut(&mut self) -> Option<&mut RagEngine> {
184        self.rag_engine.as_mut()
185    }
186}
187
188/// Batch data generator for generating multiple datasets
189#[derive(Debug)]
190pub struct BatchGenerator {
191    /// Generators for different schemas
192    generators: Vec<DataGenerator>,
193}
194
195impl BatchGenerator {
196    /// Create a new batch generator
197    pub fn new(schemas: Vec<SchemaDefinition>, config: DataConfig) -> Result<Self> {
198        let mut generators = Vec::new();
199
200        for schema in schemas {
201            let generator = DataGenerator::new(schema, config.clone())?;
202            generators.push(generator);
203        }
204
205        Ok(Self { generators })
206    }
207
208    /// Generate data for all schemas
209    pub async fn generate_batch(&mut self) -> Result<Vec<GenerationResult>> {
210        let mut results = Vec::new();
211
212        for generator in &mut self.generators {
213            let result = generator.generate().await?;
214            results.push(result);
215        }
216
217        Ok(results)
218    }
219
220    /// Generate data with cross-schema relationships
221    pub async fn generate_with_relationships(&mut self) -> Result<Vec<GenerationResult>> {
222        let mut results = Vec::new();
223        let schemas: Vec<SchemaDefinition> =
224            self.generators.iter().map(|g| g.schema().clone()).collect();
225
226        for generator in &mut self.generators {
227            let result = generator.generate_with_relationships(&schemas).await?;
228            results.push(result);
229        }
230
231        Ok(results)
232    }
233
234    /// Get all schemas
235    pub fn schemas(&self) -> Vec<&SchemaDefinition> {
236        self.generators.iter().map(|g| g.schema()).collect()
237    }
238}
239
240/// Utility functions for data generation
241pub mod utils {
242    use super::*;
243    use crate::Result;
244
245    /// Generate sample data from a simple schema definition
246    pub async fn generate_sample_data(
247        schema_name: &str,
248        fields: Vec<(&str, &str)>,
249        rows: usize,
250    ) -> Result<GenerationResult> {
251        let mut schema = SchemaDefinition::new(schema_name.to_string());
252
253        for (field_name, field_type) in fields {
254            let field =
255                crate::schema::FieldDefinition::new(field_name.to_string(), field_type.to_string());
256            schema = schema.with_field(field);
257        }
258
259        let config = DataConfig {
260            rows,
261            ..Default::default()
262        };
263
264        let mut generator = DataGenerator::new(schema, config)?;
265        generator.generate().await
266    }
267
268    /// Generate user data
269    pub async fn generate_users(count: usize) -> Result<GenerationResult> {
270        let schema = crate::schema::templates::user_schema();
271        let config = DataConfig {
272            rows: count,
273            ..Default::default()
274        };
275
276        let mut generator = DataGenerator::new(schema, config)?;
277        generator.generate().await
278    }
279
280    /// Generate product data
281    pub async fn generate_products(count: usize) -> Result<GenerationResult> {
282        let schema = crate::schema::templates::product_schema();
283        let config = DataConfig {
284            rows: count,
285            ..Default::default()
286        };
287
288        let mut generator = DataGenerator::new(schema, config)?;
289        generator.generate().await
290    }
291
292    /// Generate orders with user relationships
293    pub async fn generate_orders_with_users(
294        order_count: usize,
295        user_count: usize,
296    ) -> Result<Vec<GenerationResult>> {
297        let user_schema = crate::schema::templates::user_schema();
298        let order_schema = crate::schema::templates::order_schema();
299
300        let config = DataConfig {
301            rows: order_count,
302            ..Default::default()
303        };
304
305        let mut batch_generator = BatchGenerator::new(vec![user_schema, order_schema], config)?;
306
307        // Update the order generator to generate the right number of rows
308        if let Some(order_generator) = batch_generator.generators.get_mut(1) {
309            let order_config = DataConfig {
310                rows: order_count,
311                ..Default::default()
312            };
313            order_generator.update_config(order_config)?;
314        }
315
316        // Update the user generator to generate users
317        if let Some(user_generator) = batch_generator.generators.get_mut(0) {
318            let user_config = DataConfig {
319                rows: user_count,
320                ..Default::default()
321            };
322            user_generator.update_config(user_config)?;
323        }
324
325        batch_generator.generate_with_relationships().await
326    }
327}
328
#[cfg(test)]
mod tests {
    use super::*;
    use crate::schema::templates;

    /// A generator can be built from the user template with default settings.
    #[test]
    fn test_data_generator_new() {
        let outcome = DataGenerator::new(templates::user_schema(), DataConfig::default());
        assert!(outcome.is_ok());
    }

    /// Supplying a seed does not prevent construction.
    #[test]
    fn test_data_generator_with_seed() {
        let seeded = DataConfig {
            rows: 10,
            seed: Some(42),
            ..Default::default()
        };

        let outcome = DataGenerator::new(templates::user_schema(), seeded);
        assert!(outcome.is_ok());
    }

    /// A batch generator can be built from a single schema.
    #[test]
    fn test_batch_generator_new() {
        let outcome = BatchGenerator::new(vec![templates::user_schema()], DataConfig::default());
        assert!(outcome.is_ok());
    }

    /// A batch generator holds one generator per supplied schema.
    #[test]
    fn test_batch_generator_multiple_schemas() {
        let outcome = BatchGenerator::new(
            vec![templates::user_schema(), templates::product_schema()],
            DataConfig::default(),
        );
        assert!(outcome.is_ok());

        if let Ok(batch) = outcome {
            assert_eq!(batch.generators.len(), 2);
        }
    }

    /// Replacing the configuration on an existing generator succeeds.
    #[test]
    fn test_data_generator_update_config() {
        let mut generator =
            DataGenerator::new(templates::user_schema(), DataConfig::default()).unwrap();

        let replacement = DataConfig {
            rows: 50,
            ..Default::default()
        };

        assert!(generator.update_config(replacement).is_ok());
    }
}