//! Dataset management: in-memory datasets with metadata, serialization to
//! JSON/JSON Lines/CSV/YAML, named dataset collections, and schema-based
//! validation utilities.

pub mod core;

pub use self::core::*;

use crate::{DataConfig, GenerationResult, OutputFormat, SchemaDefinition};
use mockforge_core::{Error, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::Path;
use tokio::fs;
/// The outcome of validating a dataset against a schema.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetValidationResult {
    /// Whether the dataset passed validation with no errors.
    pub valid: bool,
    /// Validation errors found in the dataset.
    pub errors: Vec<String>,
    /// Non-fatal warnings about the dataset.
    pub warnings: Vec<String>,
    /// Number of rows that were checked.
    pub total_rows_validated: usize,
}

/// Metadata describing a generated dataset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetMetadata {
    /// Dataset name.
    pub name: String,
    /// Optional human-readable description.
    pub description: Option<String>,
    /// Name of the schema the dataset was generated from.
    pub schema_name: String,
    /// Number of rows in the dataset.
    pub row_count: usize,
    /// Configuration used to generate the dataset.
    pub config: DataConfig,
    /// When the dataset was created.
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// How long generation took, in milliseconds.
    pub generation_time_ms: u128,
    /// Output format the dataset is serialized to.
    pub format: OutputFormat,
    /// Size of the serialized file in bytes, if known.
    pub file_size_bytes: Option<u64>,
    /// Arbitrary key/value tags.
    pub tags: HashMap<String, String>,
}

impl Default for DatasetMetadata {
    fn default() -> Self {
        Self {
            name: String::new(),
            description: None,
            schema_name: String::new(),
            row_count: 0,
            config: DataConfig::default(),
            created_at: chrono::Utc::now(),
            generation_time_ms: 0,
            format: OutputFormat::Json,
            file_size_bytes: None,
            tags: HashMap::new(),
        }
    }
}

impl DatasetMetadata {
    /// Create metadata for a dataset produced by a generation run.
    pub fn new(
        name: String,
        schema_name: String,
        result: &GenerationResult,
        config: DataConfig,
    ) -> Self {
        Self {
            name,
            description: None,
            schema_name,
            row_count: result.count,
            config,
            created_at: chrono::Utc::now(),
            generation_time_ms: result.generation_time_ms,
            format: OutputFormat::Json,
            file_size_bytes: None,
            tags: HashMap::new(),
        }
    }

    /// Set a human-readable description.
    pub fn with_description(mut self, description: String) -> Self {
        self.description = Some(description);
        self
    }

    /// Attach a key/value tag.
    pub fn with_tag(mut self, key: String, value: String) -> Self {
        self.tags.insert(key, value);
        self
    }

    /// Record the size of the serialized dataset file in bytes.
    pub fn with_file_size(mut self, size: u64) -> Self {
        self.file_size_bytes = Some(size);
        self
    }
}
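
// A minimal sketch of the metadata builder helpers; the values below are
// illustrative assumptions, not fixtures from the crate.
#[cfg(test)]
mod metadata_builder_example {
    use super::*;

    #[test]
    fn builds_metadata_with_builder_helpers() {
        let metadata = DatasetMetadata::default()
            .with_description("Sample user records".to_string())
            .with_tag("env".to_string(), "dev".to_string())
            .with_file_size(2048);

        assert_eq!(metadata.description.as_deref(), Some("Sample user records"));
        assert_eq!(metadata.tags.get("env").map(String::as_str), Some("dev"));
        assert_eq!(metadata.file_size_bytes, Some(2048));
    }
}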

/// A generated dataset: rows of JSON values plus descriptive metadata.
#[derive(Debug)]
pub struct Dataset {
    /// Metadata describing the dataset.
    pub metadata: DatasetMetadata,
    /// The dataset rows as JSON values.
    pub data: Vec<serde_json::Value>,
}

impl Dataset {
    /// Create a dataset from metadata and rows.
    pub fn new(metadata: DatasetMetadata, data: Vec<serde_json::Value>) -> Self {
        Self { metadata, data }
    }

    /// Build a dataset directly from a generation result.
    pub fn from_generation_result(
        name: String,
        schema_name: String,
        result: GenerationResult,
        config: DataConfig,
    ) -> Self {
        let metadata = DatasetMetadata::new(name, schema_name, &result, config);
        Self::new(metadata, result.data)
    }

    /// Serialize the rows as a pretty-printed JSON array.
    pub fn to_json_string(&self) -> Result<String> {
        serde_json::to_string_pretty(&self.data)
            .map_err(|e| Error::generic(format!("Failed to serialize dataset: {}", e)))
    }

    /// Serialize the rows as JSON Lines (one JSON object per line).
    pub fn to_jsonl_string(&self) -> Result<String> {
        let lines: Result<Vec<String>> = self
            .data
            .iter()
            .map(|value| {
                serde_json::to_string(value)
                    .map_err(|e| Error::generic(format!("JSON serialization error: {}", e)))
            })
            .collect();

        lines.map(|lines| lines.join("\n"))
    }

    /// Serialize the rows as CSV. Headers come from the first row's object
    /// keys; rows that are not JSON objects are skipped.
    pub fn to_csv_string(&self) -> Result<String> {
        if self.data.is_empty() {
            return Ok(String::new());
        }

        let mut csv_output = String::new();

        if let Some(first_row) = self.data.first() {
            if let Some(obj) = first_row.as_object() {
                let headers: Vec<String> = obj.keys().cloned().collect();
                csv_output.push_str(&headers.join(","));
                csv_output.push('\n');

                for row in &self.data {
                    if let Some(obj) = row.as_object() {
                        let values: Vec<String> = headers
                            .iter()
                            .map(|header| {
                                let raw = match obj.get(header) {
                                    Some(serde_json::Value::String(s)) => s.clone(),
                                    Some(v) => v.to_string(),
                                    None => String::new(),
                                };
                                // Quote values containing CSV-significant characters.
                                if raw.contains(',') || raw.contains('"') || raw.contains('\n') {
                                    format!("\"{}\"", raw.replace('"', "\"\""))
                                } else {
                                    raw
                                }
                            })
                            .collect();
                        csv_output.push_str(&values.join(","));
                        csv_output.push('\n');
                    }
                }
            }
        }

        Ok(csv_output)
    }

    /// Serialize the rows as YAML.
    pub fn to_yaml_string(&self) -> Result<String> {
        serde_yaml::to_string(&self.data)
            .map_err(|e| Error::generic(format!("Failed to serialize dataset: {}", e)))
    }

    /// Write the dataset to `path`, serialized in the format recorded in
    /// the metadata.
    pub async fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
        let content = match self.metadata.format {
            OutputFormat::Json => self.to_json_string()?,
            OutputFormat::JsonLines => self.to_jsonl_string()?,
            OutputFormat::Csv => self.to_csv_string()?,
            OutputFormat::Yaml => self.to_yaml_string()?,
        };

        fs::write(path, content)
            .await
            .map_err(|e| Error::generic(format!("Failed to write dataset file: {}", e)))
    }

    /// Load a dataset from a file. Only JSON arrays are currently
    /// supported; metadata is reconstructed with placeholder values.
    pub async fn load_from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let content = fs::read_to_string(path)
            .await
            .map_err(|e| Error::generic(format!("Failed to read dataset file: {}", e)))?;

        if let Ok(data) = serde_json::from_str::<Vec<serde_json::Value>>(&content) {
            let metadata = DatasetMetadata {
                name: "loaded_dataset".to_string(),
                description: None,
                schema_name: "unknown".to_string(),
                row_count: data.len(),
                config: DataConfig::default(),
                created_at: chrono::Utc::now(),
                generation_time_ms: 0,
                format: OutputFormat::Json,
                file_size_bytes: Some(content.len() as u64),
                tags: HashMap::new(),
            };

            return Ok(Self::new(metadata, data));
        }

        Err(Error::generic("Unsupported file format or invalid content"))
    }

    /// Number of rows in the dataset.
    pub fn row_count(&self) -> usize {
        self.data.len()
    }

    /// Return up to `count` rows from the front of the dataset. Note that
    /// this is a prefix, not a random sample.
    pub fn sample(&self, count: usize) -> &[serde_json::Value] {
        let sample_count = count.min(self.data.len());
        &self.data[..sample_count]
    }

    /// Create a new dataset containing only the rows matching `predicate`.
    pub fn filter<F>(&self, predicate: F) -> Dataset
    where
        F: Fn(&serde_json::Value) -> bool,
    {
        let filtered_data: Vec<serde_json::Value> =
            self.data.iter().filter(|row| predicate(row)).cloned().collect();

        let mut metadata = self.metadata.clone();
        metadata.row_count = filtered_data.len();

        Self::new(metadata, filtered_data)
    }

    /// Create a new dataset by applying `mapper` to every row.
    pub fn map<F>(&self, mapper: F) -> Dataset
    where
        F: Fn(&serde_json::Value) -> serde_json::Value,
    {
        let mapped_data: Vec<serde_json::Value> = self.data.iter().map(mapper).collect();

        let metadata = self.metadata.clone();
        Self::new(metadata, mapped_data)
    }

    /// Validate every row against `schema`, returning the list of errors.
    pub fn validate_against_schema(&self, schema: &SchemaDefinition) -> Result<Vec<String>> {
        utils::validate_dataset_against_schema(self, schema)
    }

    /// Validate every row against `schema`, returning a structured result.
    pub fn validate_with_details(&self, schema: &SchemaDefinition) -> DatasetValidationResult {
        utils::validate_dataset_with_details(self, schema)
    }
}
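
// A minimal in-memory sketch of the serialization helpers; the rows below
// are illustrative assumptions, and the CSV expectation relies on the
// comma-quoting behavior sketched in `to_csv_string` above.
#[cfg(test)]
mod dataset_serialization_example {
    use super::*;

    #[test]
    fn serializes_rows_to_jsonl_and_csv() {
        let rows = vec![
            serde_json::json!({"id": 1, "name": "Ada"}),
            serde_json::json!({"id": 2, "name": "Lee, Jr."}),
        ];
        let dataset = Dataset::new(DatasetMetadata::default(), rows);

        // JSON Lines: one object per line, no trailing newline.
        let jsonl = dataset.to_jsonl_string().unwrap();
        assert_eq!(jsonl.lines().count(), 2);
        assert!(jsonl
            .lines()
            .all(|line| serde_json::from_str::<serde_json::Value>(line).is_ok()));

        // CSV: headers from the first row; the comma-bearing value is quoted.
        let csv = dataset.to_csv_string().unwrap();
        assert_eq!(csv, "id,name\n1,Ada\n2,\"Lee, Jr.\"\n");
    }
}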

/// A named collection of datasets.
#[derive(Debug)]
pub struct DatasetCollection {
    /// Datasets indexed by name.
    datasets: HashMap<String, Dataset>,
    /// Collection-level metadata (currently unused).
    #[allow(dead_code)]
    metadata: HashMap<String, String>,
}

impl DatasetCollection {
    /// Create an empty collection.
    pub fn new() -> Self {
        Self {
            datasets: HashMap::new(),
            metadata: HashMap::new(),
        }
    }

    /// Add a dataset, keyed by its metadata name. An existing dataset with
    /// the same name is replaced.
    pub fn add_dataset(&mut self, dataset: Dataset) -> Result<()> {
        let name = dataset.metadata.name.clone();
        self.datasets.insert(name, dataset);
        Ok(())
    }

    /// Look up a dataset by name.
    pub fn get_dataset(&self, name: &str) -> Option<&Dataset> {
        self.datasets.get(name)
    }

    /// Remove and return a dataset by name.
    pub fn remove_dataset(&mut self, name: &str) -> Option<Dataset> {
        self.datasets.remove(name)
    }

    /// List the names of all datasets in the collection.
    pub fn list_datasets(&self) -> Vec<String> {
        self.datasets.keys().cloned().collect()
    }

    /// Number of datasets in the collection.
    pub fn size(&self) -> usize {
        self.datasets.len()
    }

    /// Save every dataset to `dir_path` as `<name>.json`, creating the
    /// directory if needed. Note that each dataset is serialized in the
    /// format recorded in its metadata, even though the file is named
    /// `.json`.
    pub async fn save_to_directory<P: AsRef<Path>>(&self, dir_path: P) -> Result<()> {
        fs::create_dir_all(&dir_path)
            .await
            .map_err(|e| Error::generic(format!("Failed to create directory: {}", e)))?;

        for (name, dataset) in &self.datasets {
            let file_path = dir_path.as_ref().join(format!("{}.json", name));
            dataset.save_to_file(file_path).await?;
        }

        Ok(())
    }

    /// Load every `.json` file in `dir_path` into a new collection.
    pub async fn load_from_directory<P: AsRef<Path>>(dir_path: P) -> Result<Self> {
        let mut collection = Self::new();
        let mut entries = fs::read_dir(dir_path)
            .await
            .map_err(|e| Error::generic(format!("Failed to read directory: {}", e)))?;

        while let Some(entry) = entries.next_entry().await.map_err(|e| {
            Error::generic(format!("Failed to read directory entry: {}", e))
        })? {
            let path = entry.path();
            if path.extension().and_then(|s| s.to_str()) == Some("json") {
                let dataset = Dataset::load_from_file(&path).await?;
                collection.add_dataset(dataset)?;
            }
        }

        Ok(collection)
    }

    /// Summary statistics for the collection: totals plus per-dataset info.
    pub fn statistics(&self) -> HashMap<String, serde_json::Value> {
        let mut stats = HashMap::new();

        stats.insert("total_datasets".to_string(), self.size().into());
        stats.insert(
            "total_rows".to_string(),
            self.datasets.values().map(|d| d.row_count()).sum::<usize>().into(),
        );

        let dataset_info: Vec<serde_json::Value> = self
            .datasets
            .values()
            .map(|d| {
                serde_json::json!({
                    "name": d.metadata.name,
                    "schema": d.metadata.schema_name,
                    "rows": d.row_count(),
                    "format": format!("{:?}", d.metadata.format),
                })
            })
            .collect();

        stats.insert("datasets".to_string(), dataset_info.into());

        stats
    }
}
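
// A minimal sketch of collection bookkeeping; the dataset below is an
// illustrative fixture, not generated data.
#[cfg(test)]
mod collection_example {
    use super::*;

    #[test]
    fn tracks_datasets_and_statistics() {
        let mut collection = DatasetCollection::new();
        let metadata = DatasetMetadata {
            name: "users".to_string(),
            ..Default::default()
        };
        let dataset = Dataset::new(metadata, vec![serde_json::json!({"id": 1})]);

        collection.add_dataset(dataset).unwrap();

        assert_eq!(collection.size(), 1);
        assert!(collection.get_dataset("users").is_some());
        assert_eq!(collection.list_datasets(), vec!["users".to_string()]);

        let stats = collection.statistics();
        assert_eq!(stats["total_datasets"], serde_json::json!(1));
        assert_eq!(stats["total_rows"], serde_json::json!(1));
    }
}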

impl Default for DatasetCollection {
    fn default() -> Self {
        Self::new()
    }
}

/// Helper utilities for building, exporting, and validating datasets.
pub mod utils {
    use super::*;

    /// Build a small sample collection with generated users and products.
    pub async fn create_sample_collection() -> Result<DatasetCollection> {
        let mut collection = DatasetCollection::new();

        let users_result = crate::generator::utils::generate_users(50).await?;
        let users_dataset = Dataset::from_generation_result(
            "users".to_string(),
            "User".to_string(),
            users_result,
            DataConfig {
                rows: 50,
                ..Default::default()
            },
        );
        collection.add_dataset(users_dataset)?;

        let products_result = crate::generator::utils::generate_products(25).await?;
        let products_dataset = Dataset::from_generation_result(
            "products".to_string(),
            "Product".to_string(),
            products_result,
            DataConfig {
                rows: 25,
                ..Default::default()
            },
        );
        collection.add_dataset(products_dataset)?;

        Ok(collection)
    }

    /// Export a dataset to `output_path` in the given format, ignoring the
    /// format recorded in the dataset's metadata.
    pub async fn export_dataset(
        dataset: &Dataset,
        format: OutputFormat,
        output_path: &Path,
    ) -> Result<()> {
        let content = match format {
            OutputFormat::Json => dataset.to_json_string()?,
            OutputFormat::JsonLines => dataset.to_jsonl_string()?,
            OutputFormat::Csv => dataset.to_csv_string()?,
            OutputFormat::Yaml => dataset.to_yaml_string()?,
        };

        fs::write(output_path, content)
            .await
            .map_err(|e| Error::generic(format!("Failed to export dataset: {}", e)))
    }

    /// Validate every row of `dataset` against `schema`, returning one
    /// message per violation (invalid values, missing required fields, and
    /// fields not defined in the schema).
    pub fn validate_dataset_against_schema(
        dataset: &Dataset,
        schema: &SchemaDefinition,
    ) -> Result<Vec<String>> {
        let mut errors = Vec::new();

        for (row_index, row) in dataset.data.iter().enumerate() {
            match row {
                serde_json::Value::Object(row_obj) => {
                    // Check each schema field against the row.
                    for field in &schema.fields {
                        let field_name = &field.name;

                        if let Some(field_value) = row_obj.get(field_name) {
                            if let Err(validation_error) = field.validate_value(field_value) {
                                errors.push(format!(
                                    "Row {}: Field '{}': {}",
                                    row_index + 1,
                                    field_name,
                                    validation_error
                                ));
                            }
                        } else if field.required {
                            errors.push(format!(
                                "Row {}: Required field '{}' is missing",
                                row_index + 1,
                                field_name
                            ));
                        }
                    }

                    // Flag fields present in the row but absent from the schema.
                    for (key, _) in row_obj {
                        let field_exists_in_schema = schema.fields.iter().any(|f| f.name == *key);
                        if !field_exists_in_schema {
                            errors.push(format!(
                                "Row {}: Unexpected field '{}' not defined in schema",
                                row_index + 1,
                                key
                            ));
                        }
                    }
                }
                _ => {
                    errors.push(format!("Row {}: Expected object, got {}", row_index + 1, row));
                }
            }
        }

        if let Err(count_error) = validate_dataset_size(dataset, schema) {
            errors.push(count_error.to_string());
        }

        Ok(errors)
    }

    /// Enforce optional `min_rows`/`max_rows` bounds from the schema
    /// metadata.
    fn validate_dataset_size(dataset: &Dataset, schema: &SchemaDefinition) -> Result<()> {
        if let Some(min_rows) = schema.metadata.get("min_rows") {
            if let Some(min_count) = min_rows.as_u64() {
                if dataset.data.len() < min_count as usize {
                    return Err(Error::validation(format!(
                        "Dataset has {} rows, but schema requires at least {} rows",
                        dataset.data.len(),
                        min_count
                    )));
                }
            }
        }

        if let Some(max_rows) = schema.metadata.get("max_rows") {
            if let Some(max_count) = max_rows.as_u64() {
                if dataset.data.len() > max_count as usize {
                    return Err(Error::validation(format!(
                        "Dataset has {} rows, but schema allows at most {} rows",
                        dataset.data.len(),
                        max_count
                    )));
                }
            }
        }

        Ok(())
    }

    /// Validate `dataset` against `schema` and package the outcome as a
    /// `DatasetValidationResult`.
    pub fn validate_dataset_with_details(
        dataset: &Dataset,
        schema: &SchemaDefinition,
    ) -> DatasetValidationResult {
        match validate_dataset_against_schema(dataset, schema) {
            Ok(validation_errors) => DatasetValidationResult {
                valid: validation_errors.is_empty(),
                errors: validation_errors,
                warnings: Vec::new(),
                total_rows_validated: dataset.data.len(),
            },
            Err(e) => DatasetValidationResult {
                valid: false,
                errors: vec![format!("Validation failed: {}", e)],
                warnings: Vec::new(),
                total_rows_validated: dataset.data.len(),
            },
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_dataset_validation_result_creation() {
        let result = DatasetValidationResult {
            valid: true,
            errors: vec![],
            warnings: vec![],
            total_rows_validated: 100,
        };

        assert!(result.valid);
        assert_eq!(result.total_rows_validated, 100);
    }

    #[test]
    fn test_dataset_validation_result_with_errors() {
        let result = DatasetValidationResult {
            valid: false,
            errors: vec!["Error 1".to_string(), "Error 2".to_string()],
            warnings: vec![],
            total_rows_validated: 50,
        };

        assert!(!result.valid);
        assert_eq!(result.errors.len(), 2);
    }

    #[test]
    fn test_dataset_validation_result_with_warnings() {
        let result = DatasetValidationResult {
            valid: true,
            errors: vec![],
            warnings: vec!["Warning 1".to_string()],
            total_rows_validated: 75,
        };

        assert!(result.valid);
        assert_eq!(result.warnings.len(), 1);
    }

    #[test]
    fn test_dataset_metadata_creation() {
        let config = DataConfig::default();
        let metadata = DatasetMetadata {
            name: "TestDataset".to_string(),
            description: Some("Test description".to_string()),
            schema_name: "TestSchema".to_string(),
            row_count: 100,
            config,
            created_at: chrono::Utc::now(),
            generation_time_ms: 1000,
            format: OutputFormat::Json,
            file_size_bytes: Some(1024),
            tags: HashMap::new(),
        };

        assert_eq!(metadata.name, "TestDataset");
        assert_eq!(metadata.row_count, 100);
        assert!(metadata.description.is_some());
        assert_eq!(metadata.generation_time_ms, 1000);
    }
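
    // A minimal sketch of the in-memory transformation helpers; the rows
    // are illustrative fixtures, not generated data.
    #[test]
    fn test_dataset_filter_map_and_sample() {
        let rows = vec![
            serde_json::json!({"id": 1, "active": true}),
            serde_json::json!({"id": 2, "active": false}),
        ];
        let dataset = Dataset::new(DatasetMetadata::default(), rows);

        // `filter` keeps matching rows and updates the metadata row count.
        let active = dataset.filter(|row| row["active"] == serde_json::json!(true));
        assert_eq!(active.row_count(), 1);
        assert_eq!(active.metadata.row_count, 1);

        // `map` transforms each row, leaving the source dataset untouched.
        let ids = dataset.map(|row| row["id"].clone());
        assert_eq!(ids.data, vec![serde_json::json!(1), serde_json::json!(2)]);

        // `sample` returns a prefix, clamped to the number of available rows.
        assert_eq!(dataset.sample(10).len(), 2);
    }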
654 }
655}