mockforge_data/dataset.rs

//! Dataset management and persistence
//!
//! This module is being refactored into sub-modules for better organization:
//! - core: Core dataset structures and basic operations
//!
//! The remaining areas (collection management, metadata tracking, validation,
//! and persistence) still live in this file pending extraction into their own
//! sub-modules.
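//!
//! # Examples
//!
//! A minimal end-to-end sketch, marked `ignore` since it assumes the crate is
//! named `mockforge_data` and that a tokio runtime is available:
//!
//! ```rust,ignore
//! use mockforge_data::dataset::utils;
//!
//! # async fn demo() -> mockforge_core::Result<()> {
//! // Build the built-in sample collection (50 users, 25 products) and
//! // persist each dataset as a JSON file under ./datasets.
//! let collection = utils::create_sample_collection().await?;
//! collection.save_to_directory("./datasets").await?;
//! # Ok(())
//! # }
//! ```
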
// Re-export sub-modules for backward compatibility
pub mod core;

// Re-export commonly used types
pub use core::*;

// Legacy imports for compatibility
use crate::{DataConfig, GenerationResult, OutputFormat, SchemaDefinition};
use mockforge_core::{Error, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::Path;
use tokio::fs;

/// Dataset validation result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetValidationResult {
    /// Whether the dataset is valid
    pub valid: bool,
    /// Validation errors
    pub errors: Vec<String>,
    /// Validation warnings
    pub warnings: Vec<String>,
    /// Total number of rows validated
    pub total_rows_validated: usize,
}

/// Dataset metadata
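///
/// # Examples
///
/// A minimal sketch of the builder-style setters, marked `ignore` since it
/// assumes the crate is named `mockforge_data`:
///
/// ```rust,ignore
/// use mockforge_data::dataset::DatasetMetadata;
///
/// let metadata = DatasetMetadata::default()
///     .with_description("Synthetic users for load testing".to_string())
///     .with_tag("env".to_string(), "staging".to_string())
///     .with_file_size(2_048);
///
/// assert!(metadata.description.is_some());
/// assert_eq!(metadata.tags.len(), 1);
/// ```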
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetMetadata {
    /// Dataset name
    pub name: String,
    /// Dataset description
    pub description: Option<String>,
    /// Schema name used to generate this dataset
    pub schema_name: String,
    /// Number of rows
    pub row_count: usize,
    /// Generation configuration
    pub config: DataConfig,
    /// Creation timestamp
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// Generation time in milliseconds
    pub generation_time_ms: u128,
    /// File format
    pub format: OutputFormat,
    /// File size in bytes
    pub file_size_bytes: Option<u64>,
    /// Additional metadata tags
    pub tags: HashMap<String, String>,
}

impl Default for DatasetMetadata {
    fn default() -> Self {
        Self {
            name: String::new(),
            description: None,
            schema_name: String::new(),
            row_count: 0,
            config: DataConfig::default(),
            created_at: chrono::Utc::now(),
            generation_time_ms: 0,
            format: OutputFormat::Json,
            file_size_bytes: None,
            tags: HashMap::new(),
        }
    }
}

impl DatasetMetadata {
    /// Create new metadata from a generation result
    pub fn new(
        name: String,
        schema_name: String,
        result: &GenerationResult,
        config: DataConfig,
    ) -> Self {
        Self {
            name,
            description: None,
            schema_name,
            row_count: result.count,
            config,
            created_at: chrono::Utc::now(),
            generation_time_ms: result.generation_time_ms,
            format: OutputFormat::Json,
            file_size_bytes: None,
            tags: HashMap::new(),
        }
    }

    /// Set the description
    pub fn with_description(mut self, description: String) -> Self {
        self.description = Some(description);
        self
    }

    /// Add a tag
    pub fn with_tag(mut self, key: String, value: String) -> Self {
        self.tags.insert(key, value);
        self
    }

    /// Set the file size
    pub fn with_file_size(mut self, size: u64) -> Self {
        self.file_size_bytes = Some(size);
        self
    }
}

/// Dataset representation
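///
/// # Examples
///
/// A minimal sketch of hand-constructing a dataset and serializing it, marked
/// `ignore` since it assumes the crate is named `mockforge_data`:
///
/// ```rust,ignore
/// use mockforge_data::dataset::{Dataset, DatasetMetadata};
/// use serde_json::json;
///
/// let rows = vec![json!({"id": 1}), json!({"id": 2})];
/// let dataset = Dataset::new(DatasetMetadata::default(), rows);
///
/// assert_eq!(dataset.row_count(), 2);
/// // JSON Lines output: one JSON document per line.
/// assert_eq!(dataset.to_jsonl_string().unwrap().lines().count(), 2);
/// ```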
#[derive(Debug)]
pub struct Dataset {
    /// Dataset metadata
    pub metadata: DatasetMetadata,
    /// Dataset data
    pub data: Vec<serde_json::Value>,
}

impl Dataset {
    /// Create a new dataset from metadata and data
    pub fn new(metadata: DatasetMetadata, data: Vec<serde_json::Value>) -> Self {
        Self { metadata, data }
    }

    /// Create a dataset from a generation result
    pub fn from_generation_result(
        name: String,
        schema_name: String,
        result: GenerationResult,
        config: DataConfig,
    ) -> Self {
        let metadata = DatasetMetadata::new(name, schema_name, &result, config);
        Self::new(metadata, result.data)
    }

    /// Get dataset as JSON string
    pub fn to_json_string(&self) -> Result<String> {
        serde_json::to_string_pretty(&self.data).map_err(|e| {
            mockforge_core::Error::generic(format!("Failed to serialize dataset: {}", e))
        })
    }

    /// Get dataset as JSON Lines string
    pub fn to_jsonl_string(&self) -> Result<String> {
        let lines: Result<Vec<String>> = self
            .data
            .iter()
            .map(|value| {
                serde_json::to_string(value).map_err(|e| {
                    mockforge_core::Error::generic(format!("JSON serialization error: {}", e))
                })
            })
            .collect();

        lines.map(|lines| lines.join("\n"))
    }

    /// Get dataset as CSV string (basic implementation)
    ///
    /// Note: values are written verbatim without CSV quoting or escaping, so
    /// fields containing commas, quotes, or newlines will produce malformed
    /// output. Headers are taken from the keys of the first row.
    pub fn to_csv_string(&self) -> Result<String> {
        if self.data.is_empty() {
            return Ok(String::new());
        }

        let mut csv_output = String::new();

        // Extract headers from the first object
        if let Some(first_row) = self.data.first() {
            if let Some(obj) = first_row.as_object() {
                let headers: Vec<String> = obj.keys().cloned().collect();
                csv_output.push_str(&headers.join(","));
                csv_output.push('\n');

                // Add data rows; non-object rows are skipped
                for row in &self.data {
                    if let Some(obj) = row.as_object() {
                        let values: Vec<String> = headers
                            .iter()
                            .map(|header| {
                                obj.get(header)
                                    .map(|v| v.to_string().trim_matches('"').to_string())
                                    .unwrap_or_default()
                            })
                            .collect();
                        csv_output.push_str(&values.join(","));
                        csv_output.push('\n');
                    }
                }
            }
        }

        Ok(csv_output)
    }

    /// Get dataset as YAML string
    pub fn to_yaml_string(&self) -> Result<String> {
        serde_yaml::to_string(&self.data).map_err(|e| {
            mockforge_core::Error::generic(format!("Failed to serialize dataset: {}", e))
        })
    }

    /// Save the dataset to a file, serialized in the format recorded in its metadata
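    ///
    /// A round-trip sketch, marked `ignore` since it needs a tokio runtime and
    /// a writable path:
    ///
    /// ```rust,ignore
    /// # async fn demo(dataset: mockforge_data::dataset::Dataset) -> mockforge_core::Result<()> {
    /// dataset.save_to_file("users.json").await?;
    /// let reloaded = mockforge_data::dataset::Dataset::load_from_file("users.json").await?;
    /// assert_eq!(reloaded.row_count(), dataset.row_count());
    /// # Ok(())
    /// # }
    /// ```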
    pub async fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
        let content = match self.metadata.format {
            OutputFormat::Json => self.to_json_string()?,
            OutputFormat::JsonLines => self.to_jsonl_string()?,
            OutputFormat::Csv => self.to_csv_string()?,
            OutputFormat::Yaml => self.to_yaml_string()?,
        };

        fs::write(path, content).await.map_err(|e| {
            mockforge_core::Error::generic(format!("Failed to write dataset file: {}", e))
        })
    }

    /// Load a dataset from a file
    ///
    /// Only JSON arrays are currently supported; the returned metadata is a
    /// placeholder (name "loaded_dataset", schema "unknown") since the file
    /// carries no metadata of its own.
    pub async fn load_from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let content = fs::read_to_string(path).await.map_err(|e| {
            mockforge_core::Error::generic(format!("Failed to read dataset file: {}", e))
        })?;

        // Parse as a JSON array; other formats are not yet supported on load
        if let Ok(data) = serde_json::from_str::<Vec<serde_json::Value>>(&content) {
            let metadata = DatasetMetadata {
                name: "loaded_dataset".to_string(),
                description: None,
                schema_name: "unknown".to_string(),
                row_count: data.len(),
                config: DataConfig::default(),
                created_at: chrono::Utc::now(),
                generation_time_ms: 0,
                format: OutputFormat::Json,
                file_size_bytes: Some(content.len() as u64),
                tags: HashMap::new(),
            };

            return Ok(Self::new(metadata, data));
        }

        Err(mockforge_core::Error::generic("Unsupported file format or invalid content"))
    }

    /// Get row count
    pub fn row_count(&self) -> usize {
        self.data.len()
    }

    /// Get up to `count` sample rows from the start of the dataset
    pub fn sample(&self, count: usize) -> &[serde_json::Value] {
        let sample_count = count.min(self.data.len());
        &self.data[..sample_count]
    }

    /// Filter the dataset with a predicate, returning a new dataset that
    /// reuses this dataset's metadata with an updated row count
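    ///
    /// A sketch of chaining `filter` and `map`, marked `ignore` since it
    /// assumes rows are JSON objects with a numeric `age` field:
    ///
    /// ```rust,ignore
    /// let adults = dataset
    ///     .filter(|row| row["age"].as_u64().unwrap_or(0) >= 18)
    ///     .map(|row| {
    ///         // Clone the row and annotate it with a derived flag.
    ///         let mut row = row.clone();
    ///         row["adult"] = serde_json::json!(true);
    ///         row
    ///     });
    /// ```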
    pub fn filter<F>(&self, predicate: F) -> Dataset
    where
        F: Fn(&serde_json::Value) -> bool,
    {
        let filtered_data: Vec<serde_json::Value> =
            self.data.iter().filter(|row| predicate(row)).cloned().collect();

        let mut metadata = self.metadata.clone();
        metadata.row_count = filtered_data.len();

        Self::new(metadata, filtered_data)
    }

    /// Transform the dataset with a mapping function
    pub fn map<F>(&self, mapper: F) -> Dataset
    where
        F: Fn(&serde_json::Value) -> serde_json::Value,
    {
        let mapped_data: Vec<serde_json::Value> = self.data.iter().map(mapper).collect();

        let metadata = self.metadata.clone();
        Self::new(metadata, mapped_data)
    }

    /// Validate this dataset against a schema
    pub fn validate_against_schema(&self, schema: &SchemaDefinition) -> Result<Vec<String>> {
        utils::validate_dataset_against_schema(self, schema)
    }

    /// Validate this dataset with detailed results
    pub fn validate_with_details(&self, schema: &SchemaDefinition) -> DatasetValidationResult {
        utils::validate_dataset_with_details(self, schema)
    }
}

/// Dataset collection for managing multiple datasets
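///
/// # Examples
///
/// A minimal sketch, marked `ignore` since it assumes the crate is named
/// `mockforge_data`:
///
/// ```rust,ignore
/// use mockforge_data::dataset::{Dataset, DatasetCollection, DatasetMetadata};
///
/// let mut collection = DatasetCollection::new();
/// let metadata = DatasetMetadata {
///     name: "users".to_string(),
///     ..Default::default()
/// };
/// collection.add_dataset(Dataset::new(metadata, Vec::new())).unwrap();
///
/// assert_eq!(collection.size(), 1);
/// assert!(collection.get_dataset("users").is_some());
/// ```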
#[derive(Debug)]
pub struct DatasetCollection {
    /// Datasets indexed by name
    datasets: HashMap<String, Dataset>,
    /// Collection metadata
    #[allow(dead_code)]
    metadata: HashMap<String, String>,
}

impl DatasetCollection {
    /// Create a new dataset collection
    pub fn new() -> Self {
        Self {
            datasets: HashMap::new(),
            metadata: HashMap::new(),
        }
    }

    /// Add a dataset to the collection, replacing any existing dataset with
    /// the same name
    pub fn add_dataset(&mut self, dataset: Dataset) -> Result<()> {
        let name = dataset.metadata.name.clone();
        self.datasets.insert(name, dataset);
        Ok(())
    }

    /// Get a dataset by name
    pub fn get_dataset(&self, name: &str) -> Option<&Dataset> {
        self.datasets.get(name)
    }

    /// Remove a dataset
    pub fn remove_dataset(&mut self, name: &str) -> Option<Dataset> {
        self.datasets.remove(name)
    }

    /// List all dataset names
    pub fn list_datasets(&self) -> Vec<String> {
        self.datasets.keys().cloned().collect()
    }

    /// Get collection size
    pub fn size(&self) -> usize {
        self.datasets.len()
    }

    /// Save the entire collection to a directory, one `<name>.json` file per
    /// dataset
    pub async fn save_to_directory<P: AsRef<Path>>(&self, dir_path: P) -> Result<()> {
        fs::create_dir_all(&dir_path).await.map_err(|e| {
            mockforge_core::Error::generic(format!("Failed to create directory: {}", e))
        })?;

        for (name, dataset) in &self.datasets {
            let file_path = dir_path.as_ref().join(format!("{}.json", name));
            dataset.save_to_file(file_path).await?;
        }

        Ok(())
    }

    /// Load a collection from a directory of `.json` files
    pub async fn load_from_directory<P: AsRef<Path>>(dir_path: P) -> Result<Self> {
        let mut collection = Self::new();
        let mut entries = fs::read_dir(dir_path).await.map_err(|e| {
            mockforge_core::Error::generic(format!("Failed to read directory: {}", e))
        })?;

        while let Some(entry) = entries.next_entry().await.map_err(|e| {
            mockforge_core::Error::generic(format!("Failed to read directory entry: {}", e))
        })? {
            let path = entry.path();
            if path.extension().and_then(|s| s.to_str()) == Some("json") {
                let dataset = Dataset::load_from_file(&path).await?;
                collection.add_dataset(dataset)?;
            }
        }

        Ok(collection)
    }

    /// Get collection statistics
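    ///
    /// The returned map contains `total_datasets` and `total_rows` counts plus
    /// a `datasets` array with per-dataset name, schema, row count, and format.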
    pub fn statistics(&self) -> HashMap<String, serde_json::Value> {
        let mut stats = HashMap::new();

        stats.insert("total_datasets".to_string(), self.size().into());
        stats.insert(
            "total_rows".to_string(),
            self.datasets.values().map(|d| d.row_count()).sum::<usize>().into(),
        );

        let dataset_info: Vec<serde_json::Value> = self
            .datasets
            .values()
            .map(|d| {
                serde_json::json!({
                    "name": d.metadata.name,
                    "schema": d.metadata.schema_name,
                    "rows": d.row_count(),
                    "format": format!("{:?}", d.metadata.format),
                })
            })
            .collect();

        stats.insert("datasets".to_string(), dataset_info.into());

        stats
    }
}

impl Default for DatasetCollection {
    fn default() -> Self {
        Self::new()
    }
}

/// Dataset utilities
pub mod utils {
    use super::*;

    /// Create a sample dataset collection with common schemas
    pub async fn create_sample_collection() -> Result<DatasetCollection> {
        let mut collection = DatasetCollection::new();

        // Create user dataset
        let users_result = crate::generator::utils::generate_users(50).await?;
        let users_dataset = Dataset::from_generation_result(
            "users".to_string(),
            "User".to_string(),
            users_result,
            DataConfig {
                rows: 50,
                ..Default::default()
            },
        );
        collection.add_dataset(users_dataset)?;

        // Create product dataset
        let products_result = crate::generator::utils::generate_products(25).await?;
        let products_dataset = Dataset::from_generation_result(
            "products".to_string(),
            "Product".to_string(),
            products_result,
            DataConfig {
                rows: 25,
                ..Default::default()
            },
        );
        collection.add_dataset(products_dataset)?;

        Ok(collection)
    }

    /// Export a dataset to a file in the given format
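    ///
    /// A usage sketch, marked `ignore` since it needs a tokio runtime, a
    /// dataset in scope, and assumes the crate is named `mockforge_data`:
    ///
    /// ```rust,ignore
    /// use std::path::Path;
    /// use mockforge_data::OutputFormat;
    ///
    /// utils::export_dataset(&dataset, OutputFormat::Csv, Path::new("users.csv")).await?;
    /// ```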
    pub async fn export_dataset(
        dataset: &Dataset,
        format: OutputFormat,
        output_path: &Path,
    ) -> Result<()> {
        let content = match format {
            OutputFormat::Json => dataset.to_json_string()?,
            OutputFormat::JsonLines => dataset.to_jsonl_string()?,
            OutputFormat::Csv => dataset.to_csv_string()?,
            OutputFormat::Yaml => dataset.to_yaml_string()?,
        };

        fs::write(output_path, content)
            .await
            .map_err(|e| mockforge_core::Error::generic(format!("Failed to export dataset: {}", e)))
    }

    /// Validate every row of a dataset against a schema, returning a list of
    /// human-readable error messages (empty when the dataset is valid)
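    ///
    /// A usage sketch, marked `ignore` since it assumes a `SchemaDefinition`
    /// and dataset are in scope:
    ///
    /// ```rust,ignore
    /// let errors = utils::validate_dataset_against_schema(&dataset, &schema)?;
    /// if errors.is_empty() {
    ///     println!("dataset conforms to schema");
    /// } else {
    ///     for error in &errors {
    ///         eprintln!("{error}");
    ///     }
    /// }
    /// ```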
    pub fn validate_dataset_against_schema(
        dataset: &Dataset,
        schema: &SchemaDefinition,
    ) -> Result<Vec<String>> {
        let mut errors = Vec::new();

        // Validate each row in the dataset
        for (row_index, row) in dataset.data.iter().enumerate() {
            match row {
                serde_json::Value::Object(row_obj) => {
                    // Validate each field in the schema
                    for field in &schema.fields {
                        let field_name = &field.name;

                        if let Some(field_value) = row_obj.get(field_name) {
                            // Validate the field value
                            if let Err(validation_error) = field.validate_value(field_value) {
                                errors.push(format!(
                                    "Row {}: Field '{}': {}",
                                    row_index + 1,
                                    field_name,
                                    validation_error
                                ));
                            }
                        } else if field.required {
                            errors.push(format!(
                                "Row {}: Required field '{}' is missing",
                                row_index + 1,
                                field_name
                            ));
                        }
                    }

                    // Check for unexpected fields
                    for (key, _) in row_obj {
                        let field_exists_in_schema = schema.fields.iter().any(|f| f.name == *key);
                        if !field_exists_in_schema {
                            errors.push(format!(
                                "Row {}: Unexpected field '{}' not defined in schema",
                                row_index + 1,
                                key
                            ));
                        }
                    }
                }
                _ => {
                    errors.push(format!("Row {}: Expected object, got {}", row_index + 1, row));
                }
            }
        }

        // Validate dataset-level constraints
        if let Err(count_error) = validate_dataset_size(dataset, schema) {
            errors.push(count_error.to_string());
        }

        Ok(errors)
    }

    /// Validate dataset size constraints
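    ///
    /// Row-count bounds are read from the optional `min_rows` and `max_rows`
    /// keys of the schema's metadata; a dataset outside those bounds yields a
    /// validation error.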
    fn validate_dataset_size(
        dataset: &Dataset,
        schema: &SchemaDefinition,
    ) -> mockforge_core::Result<()> {
        // Check if there are any size constraints in schema metadata
        if let Some(min_rows) = schema.metadata.get("min_rows") {
            if let Some(min_count) = min_rows.as_u64() {
                if dataset.data.len() < min_count as usize {
                    return Err(Error::validation(format!(
                        "Dataset has {} rows, but schema requires at least {} rows",
                        dataset.data.len(),
                        min_count
                    )));
                }
            }
        }

        if let Some(max_rows) = schema.metadata.get("max_rows") {
            if let Some(max_count) = max_rows.as_u64() {
                if dataset.data.len() > max_count as usize {
                    return Err(Error::validation(format!(
                        "Dataset has {} rows, but schema allows at most {} rows",
                        dataset.data.len(),
                        max_count
                    )));
                }
            }
        }

        Ok(())
    }

    /// Validate dataset and return detailed result
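    ///
    /// A usage sketch, marked `ignore` since it assumes a `SchemaDefinition`
    /// and dataset are in scope:
    ///
    /// ```rust,ignore
    /// let report = utils::validate_dataset_with_details(&dataset, &schema);
    /// println!(
    ///     "valid: {}, errors: {}, rows checked: {}",
    ///     report.valid,
    ///     report.errors.len(),
    ///     report.total_rows_validated
    /// );
    /// ```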
    pub fn validate_dataset_with_details(
        dataset: &Dataset,
        schema: &SchemaDefinition,
    ) -> DatasetValidationResult {
        let errors = validate_dataset_against_schema(dataset, schema);

        match errors {
            Ok(validation_errors) => {
                let warnings = Vec::new(); // Could add warnings for deprecated fields, etc.
                DatasetValidationResult {
                    valid: validation_errors.is_empty(),
                    errors: validation_errors,
                    warnings,
                    total_rows_validated: dataset.data.len(),
                }
            }
            Err(e) => DatasetValidationResult {
                valid: false,
                errors: vec![format!("Validation failed: {}", e)],
                warnings: Vec::new(),
                total_rows_validated: dataset.data.len(),
            },
        }
    }
}


#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_dataset_validation_result_creation() {
        let result = DatasetValidationResult {
            valid: true,
            errors: vec![],
            warnings: vec![],
            total_rows_validated: 100,
        };

        assert!(result.valid);
        assert_eq!(result.total_rows_validated, 100);
    }

    #[test]
    fn test_dataset_validation_result_with_errors() {
        let result = DatasetValidationResult {
            valid: false,
            errors: vec!["Error 1".to_string(), "Error 2".to_string()],
            warnings: vec![],
            total_rows_validated: 50,
        };

        assert!(!result.valid);
        assert_eq!(result.errors.len(), 2);
    }

    #[test]
    fn test_dataset_validation_result_with_warnings() {
        let result = DatasetValidationResult {
            valid: true,
            errors: vec![],
            warnings: vec!["Warning 1".to_string()],
            total_rows_validated: 75,
        };

        assert!(result.valid);
        assert_eq!(result.warnings.len(), 1);
    }

    #[test]
    fn test_dataset_metadata_creation() {
        let config = DataConfig::default();
        let metadata = DatasetMetadata {
            name: "TestDataset".to_string(),
            description: Some("Test description".to_string()),
            schema_name: "TestSchema".to_string(),
            row_count: 100,
            config,
            created_at: chrono::Utc::now(),
            generation_time_ms: 1000,
            format: OutputFormat::Json,
            file_size_bytes: Some(1024),
            tags: HashMap::new(),
        };

        assert_eq!(metadata.name, "TestDataset");
        assert_eq!(metadata.row_count, 100);
        assert!(metadata.description.is_some());
        assert_eq!(metadata.generation_time_ms, 1000);
    }
}