mockforge_data/
lib.rs

1//! # MockForge Data
2//!
3//! Synthetic data generation engine with faker primitives and RAG (Retrieval-Augmented Generation).
4
5// Re-export error types from mockforge-core
6pub use mockforge_core::{Error, Result};
7
8/// Consistency engine for entity ID → persona mappings
9pub mod consistency;
10pub mod dataset;
11pub mod domains;
12pub mod drift;
13pub mod faker;
14pub mod generator;
15pub mod intelligent_mock;
16pub mod mock_generator;
17pub mod mock_server;
18/// Persona profile system for consistent data generation
19pub mod persona;
20/// Backstory generation for personas
21pub mod persona_backstory;
22/// Domain-specific persona templates
23pub mod persona_templates;
24/// Provider utilities for faker and data generation
25pub mod provider;
26/// RAG (Retrieval-Augmented Generation) utilities for intelligent mock data generation
27pub mod rag;
28pub mod replay_augmentation;
29pub mod schema;
30pub mod token_resolver;
31
32#[cfg(test)]
33mod mock_data_tests;
34
35#[cfg(test)]
36mod integration_tests;
37
38pub use consistency::{ConsistencyStore, EntityIdExtractor, EntityType};
39pub use dataset::{Dataset, DatasetValidationResult};
40pub use domains::{Domain, DomainGenerator, ParseDomainError};
41pub use drift::{DataDriftConfig, DataDriftEngine, DriftStrategy};
42pub use fake::Faker;
43pub use generator::DataGenerator;
44pub use intelligent_mock::{IntelligentMockConfig, IntelligentMockGenerator, ResponseMode};
45pub use mock_generator::{MockDataGenerator, MockDataResult, MockGeneratorConfig, MockResponse};
46pub use mock_server::{
47    start_mock_server, start_mock_server_with_config, MockServer, MockServerBuilder,
48    MockServerConfig,
49};
50pub use persona::{PersonaGenerator, PersonaProfile, PersonaRegistry};
51pub use persona_backstory::{BackstoryGenerator, BackstoryTemplate};
52pub use persona_templates::{
53    EcommercePersonaTemplate, FinancePersonaTemplate, HealthcarePersonaTemplate, PersonaTemplate,
54    PersonaTemplateRegistry,
55};
56pub use rag::{EmbeddingProvider, LlmProvider, RagConfig, RagEngine, SearchResult};
57pub use replay_augmentation::{
58    EventStrategy, GeneratedEvent, ReplayAugmentationConfig, ReplayAugmentationEngine, ReplayMode,
59};
60pub use schema::{FieldDefinition, SchemaDefinition};
61pub use token_resolver::{resolve_tokens, resolve_tokens_with_rag, TokenResolver, TokenType};
62
63/// Data generation configuration
64#[derive(Debug, Clone, serde::Deserialize, serde::Serialize, Default)]
65pub struct DataConfig {
66    /// Number of rows to generate
67    #[serde(default = "default_rows")]
68    pub rows: usize,
69    /// Random seed for reproducible generation
70    pub seed: Option<u64>,
71    /// Enable RAG mode
72    pub rag_enabled: bool,
73    /// Maximum RAG context length
74    #[serde(default = "default_rag_context_length")]
75    pub rag_context_length: usize,
76    /// Output format
77    pub format: OutputFormat,
78}
79
/// Serde fallback for the `rows` field of `DataConfig` when the key is missing.
fn default_rows() -> usize {
    const FALLBACK_ROWS: usize = 100;
    FALLBACK_ROWS
}
/// Serde fallback for the `rag_context_length` field of `DataConfig` when the key is missing.
fn default_rag_context_length() -> usize {
    const FALLBACK_CONTEXT_LENGTH: usize = 1000;
    FALLBACK_CONTEXT_LENGTH
}
86
87/// Output format for generated data
88#[derive(Debug, Clone, serde::Deserialize, serde::Serialize, Default)]
89#[serde(rename_all = "lowercase")]
90pub enum OutputFormat {
91    /// JSON format
92    #[default]
93    Json,
94    /// JSON Lines format
95    JsonLines,
96    /// YAML format
97    Yaml,
98    /// CSV format
99    Csv,
100}
101
102/// Generation result
103#[derive(Debug)]
104pub struct GenerationResult {
105    /// Generated data
106    pub data: Vec<serde_json::Value>,
107    /// Number of rows generated
108    pub count: usize,
109    /// Generation time in milliseconds
110    pub generation_time_ms: u128,
111    /// Any warnings during generation
112    pub warnings: Vec<String>,
113}
114
115impl GenerationResult {
116    /// Create a new generation result
117    pub fn new(data: Vec<serde_json::Value>, generation_time_ms: u128) -> Self {
118        Self {
119            count: data.len(),
120            data,
121            generation_time_ms,
122            warnings: Vec::new(),
123        }
124    }
125
126    /// Add a warning
127    pub fn with_warning(mut self, warning: String) -> Self {
128        self.warnings.push(warning);
129        self
130    }
131
132    /// Get data as JSON string
133    pub fn to_json_string(&self) -> mockforge_core::Result<String> {
134        Ok(serde_json::to_string_pretty(&self.data)?)
135    }
136
137    /// Get data as JSON Lines string
138    pub fn to_jsonl_string(&self) -> mockforge_core::Result<String> {
139        let lines: Vec<String> = self
140            .data
141            .iter()
142            .map(serde_json::to_string)
143            .collect::<std::result::Result<_, _>>()?;
144        Ok(lines.join("\n"))
145    }
146}
147
148/// Quick data generation function
149pub async fn generate_data(
150    schema: SchemaDefinition,
151    config: DataConfig,
152) -> mockforge_core::Result<GenerationResult> {
153    let mut generator = DataGenerator::new(schema, config)?;
154    generator.generate().await
155}
156
157/// Generate sample data from a JSON schema
158pub async fn generate_from_json_schema(
159    json_schema: &serde_json::Value,
160    rows: usize,
161) -> mockforge_core::Result<GenerationResult> {
162    let schema = SchemaDefinition::from_json_schema(json_schema)?;
163    let config = DataConfig {
164        rows,
165        ..Default::default()
166    };
167    generate_data(schema, config).await
168}
169
170/// Generate sample data from an OpenAPI schema
171pub async fn generate_from_openapi(
172    openapi_spec: &serde_json::Value,
173    rows: usize,
174) -> mockforge_core::Result<GenerationResult> {
175    let schema = SchemaDefinition::from_openapi_spec(openapi_spec)?;
176    let config = DataConfig {
177        rows,
178        ..Default::default()
179    };
180    generate_data(schema, config).await
181}
182
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// The derived `Default` yields type-level zero values, not the serde fallbacks.
    #[test]
    fn test_data_config_default() {
        let defaults = DataConfig::default();

        assert_eq!(defaults.rows, 0); // usize::default()
        assert!(defaults.seed.is_none());
        assert!(!defaults.rag_enabled);
        assert_eq!(defaults.rag_context_length, 0); // usize::default()
        assert!(matches!(defaults.format, OutputFormat::Json));
    }

    /// Explicitly supplied values are stored unchanged.
    #[test]
    fn test_data_config_custom() {
        let custom = DataConfig {
            rows: 50,
            seed: Some(42),
            rag_enabled: true,
            rag_context_length: 2000,
            format: OutputFormat::Csv,
        };

        assert_eq!(custom.rows, 50);
        assert_eq!(custom.seed, Some(42));
        assert!(custom.rag_enabled);
        assert_eq!(custom.rag_context_length, 2000);
        assert!(matches!(custom.format, OutputFormat::Csv));
    }

    /// Every variant can be constructed and matched.
    #[test]
    fn test_output_format_variants() {
        assert!(matches!(OutputFormat::Json, OutputFormat::Json));
        assert!(matches!(OutputFormat::JsonLines, OutputFormat::JsonLines));
        assert!(matches!(OutputFormat::Yaml, OutputFormat::Yaml));
        assert!(matches!(OutputFormat::Csv, OutputFormat::Csv));
    }

    /// `new` derives `count` from the data and starts without warnings.
    #[test]
    fn test_generation_result_new() {
        let rows = vec![json!({"id": 1, "name": "test"})];
        let outcome = GenerationResult::new(rows, 100);

        assert_eq!(outcome.count, 1);
        assert_eq!(outcome.data.len(), 1);
        assert_eq!(outcome.generation_time_ms, 100);
        assert!(outcome.warnings.is_empty());
    }

    /// A single warning is recorded verbatim.
    #[test]
    fn test_generation_result_with_warning() {
        let outcome = GenerationResult::new(vec![json!({"id": 1})], 50)
            .with_warning("Test warning".to_string());

        assert_eq!(outcome.warnings.len(), 1);
        assert_eq!(outcome.warnings[0], "Test warning");
    }

    /// The pretty JSON rendering contains the row's keys.
    #[test]
    fn test_generation_result_to_json_string() {
        let outcome = GenerationResult::new(vec![json!({"id": 1, "name": "test"})], 10);

        let rendered = outcome
            .to_json_string()
            .expect("serializing plain JSON values should succeed");
        assert!(rendered.contains("\"id\""));
        assert!(rendered.contains("\"name\""));
    }

    /// The JSON Lines rendering has one compact document per row.
    #[test]
    fn test_generation_result_to_jsonl_string() {
        let outcome = GenerationResult::new(vec![json!({"id": 1}), json!({"id": 2})], 10);

        let rendered = outcome
            .to_jsonl_string()
            .expect("serializing plain JSON values should succeed");
        assert!(rendered.contains("{\"id\":1}"));
        assert!(rendered.contains("{\"id\":2}"));
        assert!(rendered.contains('\n'));
    }

    /// Chained warnings are kept in insertion order.
    #[test]
    fn test_generation_result_multiple_warnings() {
        let outcome = GenerationResult::new(vec![json!({"id": 1})], 10)
            .with_warning("Warning 1".to_string())
            .with_warning("Warning 2".to_string());

        assert_eq!(outcome.warnings.len(), 2);
        assert_eq!(outcome.warnings[0], "Warning 1");
        assert_eq!(outcome.warnings[1], "Warning 2");
    }

    /// Serde fallback for `rows`.
    #[test]
    fn test_default_rows() {
        assert_eq!(default_rows(), 100);
    }

    /// Serde fallback for `rag_context_length`.
    #[test]
    fn test_default_rag_context_length() {
        assert_eq!(default_rag_context_length(), 1000);
    }
}