mockforge_data/
dataset.rs

//! Dataset management and persistence
//!
//! This module is being refactored into sub-modules for better organization.
//! Currently only `core` has been split out as a sub-module; the remaining
//! concerns still live in this file:
//! - core (sub-module): core dataset structures and basic operations
//! - collection: dataset collection management and organization (below)
//! - metadata: dataset metadata tracking and management (below)
//! - validation: dataset validation and integrity checking (below)
//! - persistence: dataset storage, loading, and file operations (below)
9
10// Re-export sub-modules for backward compatibility
11pub mod core;
12
13// Re-export commonly used types
14pub use core::*;
15
16// Legacy imports for compatibility
17use crate::{DataConfig, GenerationResult, OutputFormat, SchemaDefinition};
18use crate::{Error, Result};
19use serde::{Deserialize, Serialize};
20use std::collections::HashMap;
21use std::path::Path;
22use tokio::fs;
23
/// Dataset validation result
///
/// Detailed outcome of validating a [`Dataset`] against a schema, as
/// produced by [`utils::validate_dataset_with_details`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetValidationResult {
    /// Whether the dataset is valid (true iff `errors` is empty)
    pub valid: bool,
    /// Validation errors, one human-readable message per failure
    pub errors: Vec<String>,
    /// Validation warnings (currently never populated by the built-in
    /// validators; reserved for e.g. deprecated-field notices)
    pub warnings: Vec<String>,
    /// Total number of rows validated
    pub total_rows_validated: usize,
}
36
/// Dataset metadata
///
/// Descriptive and provenance information attached to a [`Dataset`]:
/// where it came from (schema name, generation config), how big it is,
/// and how/when it was produced.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetMetadata {
    /// Dataset name (also used as the lookup key in a [`DatasetCollection`])
    pub name: String,
    /// Optional human-readable dataset description
    pub description: Option<String>,
    /// Schema name used to generate this dataset
    pub schema_name: String,
    /// Number of rows at the time the metadata was recorded
    pub row_count: usize,
    /// Generation configuration used to produce the data
    pub config: DataConfig,
    /// Creation timestamp (UTC)
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// Generation time in milliseconds
    pub generation_time_ms: u128,
    /// File format used when persisting the dataset
    pub format: OutputFormat,
    /// File size in bytes when known (e.g. populated on load); `None` otherwise
    pub file_size_bytes: Option<u64>,
    /// Additional metadata as free-form key/value tags
    pub tags: HashMap<String, String>,
}
61
62impl Default for DatasetMetadata {
63    fn default() -> Self {
64        Self {
65            name: String::new(),
66            description: None,
67            schema_name: String::new(),
68            row_count: 0,
69            config: DataConfig::default(),
70            created_at: chrono::Utc::now(),
71            generation_time_ms: 0,
72            format: OutputFormat::Json,
73            file_size_bytes: None,
74            tags: HashMap::new(),
75        }
76    }
77}
78
79impl DatasetMetadata {
80    /// Create new metadata
81    pub fn new(
82        name: String,
83        schema_name: String,
84        result: &GenerationResult,
85        config: DataConfig,
86    ) -> Self {
87        Self {
88            name,
89            description: None,
90            schema_name,
91            row_count: result.count,
92            config,
93            created_at: chrono::Utc::now(),
94            generation_time_ms: result.generation_time_ms,
95            format: OutputFormat::Json,
96            file_size_bytes: None,
97            tags: HashMap::new(),
98        }
99    }
100
101    /// Set description
102    pub fn with_description(mut self, description: String) -> Self {
103        self.description = Some(description);
104        self
105    }
106
107    /// Add a tag
108    pub fn with_tag(mut self, key: String, value: String) -> Self {
109        self.tags.insert(key, value);
110        self
111    }
112
113    /// Set file size
114    pub fn with_file_size(mut self, size: u64) -> Self {
115        self.file_size_bytes = Some(size);
116        self
117    }
118}
119
/// Dataset representation
///
/// A generated dataset: its rows (as JSON values, typically objects)
/// plus the [`DatasetMetadata`] describing how it was produced.
#[derive(Debug)]
pub struct Dataset {
    /// Dataset metadata
    pub metadata: DatasetMetadata,
    /// Dataset data: one JSON value per row
    pub data: Vec<serde_json::Value>,
}
128
129impl Dataset {
130    /// Create a new dataset from generation result
131    pub fn new(metadata: DatasetMetadata, data: Vec<serde_json::Value>) -> Self {
132        Self { metadata, data }
133    }
134
135    /// Create dataset from generation result
136    pub fn from_generation_result(
137        name: String,
138        schema_name: String,
139        result: GenerationResult,
140        config: DataConfig,
141    ) -> Self {
142        let metadata = DatasetMetadata::new(name, schema_name, &result, config);
143        Self::new(metadata, result.data)
144    }
145
146    /// Get dataset as JSON string
147    pub fn to_json_string(&self) -> Result<String> {
148        serde_json::to_string_pretty(&self.data)
149            .map_err(|e| crate::Error::generic(format!("Failed to serialize dataset: {}", e)))
150    }
151
152    /// Get dataset as JSON Lines string
153    pub fn to_jsonl_string(&self) -> Result<String> {
154        let lines: Result<Vec<String>> = self
155            .data
156            .iter()
157            .map(|value| {
158                serde_json::to_string(value)
159                    .map_err(|e| crate::Error::generic(format!("JSON serialization error: {}", e)))
160            })
161            .collect();
162
163        lines.map(|lines| lines.join("\n"))
164    }
165
166    /// Get dataset as CSV string (basic implementation)
167    pub fn to_csv_string(&self) -> Result<String> {
168        if self.data.is_empty() {
169            return Ok(String::new());
170        }
171
172        let mut csv_output = String::new();
173
174        // Extract headers from first object
175        if let Some(first_row) = self.data.first() {
176            if let Some(obj) = first_row.as_object() {
177                let headers: Vec<String> = obj.keys().cloned().collect();
178                csv_output.push_str(&headers.join(","));
179                csv_output.push('\n');
180
181                // Add data rows
182                for row in &self.data {
183                    if let Some(obj) = row.as_object() {
184                        let values: Vec<String> = headers
185                            .iter()
186                            .map(|header| {
187                                obj.get(header)
188                                    .map(|v| v.to_string().trim_matches('"').to_string())
189                                    .unwrap_or_default()
190                            })
191                            .collect();
192                        csv_output.push_str(&values.join(","));
193                        csv_output.push('\n');
194                    }
195                }
196            }
197        }
198
199        Ok(csv_output)
200    }
201
202    /// Get dataset as YAML string
203    pub fn to_yaml_string(&self) -> Result<String> {
204        serde_yaml::to_string(&self.data)
205            .map_err(|e| crate::Error::generic(format!("Failed to serialize dataset: {}", e)))
206    }
207
208    /// Save dataset to file
209    pub async fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
210        let content = match self.metadata.format {
211            OutputFormat::Json => self.to_json_string()?,
212            OutputFormat::JsonLines => self.to_jsonl_string()?,
213            OutputFormat::Csv => self.to_csv_string()?,
214            OutputFormat::Yaml => self.to_yaml_string()?,
215        };
216
217        fs::write(path, content)
218            .await
219            .map_err(|e| crate::Error::generic(format!("Failed to write dataset file: {}", e)))
220    }
221
222    /// Load dataset from file
223    pub async fn load_from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
224        let content = fs::read_to_string(path)
225            .await
226            .map_err(|e| crate::Error::generic(format!("Failed to read dataset file: {}", e)))?;
227
228        // Try to parse as JSON array first
229        if let Ok(data) = serde_json::from_str::<Vec<serde_json::Value>>(&content) {
230            let metadata = DatasetMetadata {
231                name: "loaded_dataset".to_string(),
232                description: None,
233                schema_name: "unknown".to_string(),
234                row_count: data.len(),
235                config: DataConfig::default(),
236                created_at: chrono::Utc::now(),
237                generation_time_ms: 0,
238                format: OutputFormat::Json,
239                file_size_bytes: Some(content.len() as u64),
240                tags: HashMap::new(),
241            };
242
243            return Ok(Self::new(metadata, data));
244        }
245
246        Err(crate::Error::generic("Unsupported file format or invalid content"))
247    }
248
249    /// Get row count
250    pub fn row_count(&self) -> usize {
251        self.data.len()
252    }
253
254    /// Get sample rows
255    pub fn sample(&self, count: usize) -> &[serde_json::Value] {
256        let sample_count = count.min(self.data.len());
257        &self.data[..sample_count]
258    }
259
260    /// Filter dataset by predicate
261    pub fn filter<F>(&self, predicate: F) -> Dataset
262    where
263        F: Fn(&serde_json::Value) -> bool,
264    {
265        let filtered_data: Vec<serde_json::Value> =
266            self.data.iter().filter(|row| predicate(row)).cloned().collect();
267
268        let mut metadata = self.metadata.clone();
269        metadata.row_count = filtered_data.len();
270
271        Self::new(metadata, filtered_data)
272    }
273
274    /// Transform dataset with a mapping function
275    pub fn map<F>(&self, mapper: F) -> Dataset
276    where
277        F: Fn(&serde_json::Value) -> serde_json::Value,
278    {
279        let mapped_data: Vec<serde_json::Value> = self.data.iter().map(mapper).collect();
280
281        let metadata = self.metadata.clone();
282        Self::new(metadata, mapped_data)
283    }
284
285    /// Validate this dataset against a schema
286    pub fn validate_against_schema(&self, schema: &SchemaDefinition) -> Result<Vec<String>> {
287        utils::validate_dataset_against_schema(self, schema)
288    }
289
290    /// Validate this dataset with detailed results
291    pub fn validate_with_details(&self, schema: &SchemaDefinition) -> DatasetValidationResult {
292        utils::validate_dataset_with_details(self, schema)
293    }
294}
295
/// Dataset collection for managing multiple datasets
///
/// Holds [`Dataset`]s keyed by their metadata name and supports bulk
/// persistence to/from a directory.
#[derive(Debug)]
pub struct DatasetCollection {
    /// Datasets indexed by name (the `name` in each dataset's metadata)
    datasets: HashMap<String, Dataset>,
    /// Collection metadata
    // Never read or written by anything in this file, hence the
    // dead_code allow; presumably reserved for future use.
    #[allow(dead_code)]
    metadata: HashMap<String, String>,
}
305
306impl DatasetCollection {
307    /// Create a new dataset collection
308    pub fn new() -> Self {
309        Self {
310            datasets: HashMap::new(),
311            metadata: HashMap::new(),
312        }
313    }
314
315    /// Add a dataset to the collection
316    pub fn add_dataset(&mut self, dataset: Dataset) -> Result<()> {
317        let name = dataset.metadata.name.clone();
318        self.datasets.insert(name, dataset);
319        Ok(())
320    }
321
322    /// Get a dataset by name
323    pub fn get_dataset(&self, name: &str) -> Option<&Dataset> {
324        self.datasets.get(name)
325    }
326
327    /// Remove a dataset
328    pub fn remove_dataset(&mut self, name: &str) -> Option<Dataset> {
329        self.datasets.remove(name)
330    }
331
332    /// List all dataset names
333    pub fn list_datasets(&self) -> Vec<String> {
334        self.datasets.keys().cloned().collect()
335    }
336
337    /// Get collection size
338    pub fn size(&self) -> usize {
339        self.datasets.len()
340    }
341
342    /// Save entire collection to directory
343    pub async fn save_to_directory<P: AsRef<Path>>(&self, dir_path: P) -> Result<()> {
344        fs::create_dir_all(&dir_path)
345            .await
346            .map_err(|e| crate::Error::generic(format!("Failed to create directory: {}", e)))?;
347
348        for (name, dataset) in &self.datasets {
349            let file_path = dir_path.as_ref().join(format!("{}.json", name));
350            dataset.save_to_file(file_path).await?;
351        }
352
353        Ok(())
354    }
355
356    /// Load collection from directory
357    pub async fn load_from_directory<P: AsRef<Path>>(dir_path: P) -> Result<Self> {
358        let mut collection = Self::new();
359        let mut entries = fs::read_dir(dir_path)
360            .await
361            .map_err(|e| crate::Error::generic(format!("Failed to read directory: {}", e)))?;
362
363        while let Some(entry) = entries
364            .next_entry()
365            .await
366            .map_err(|e| crate::Error::generic(format!("Failed to read directory entry: {}", e)))?
367        {
368            let path = entry.path();
369            if path.extension().and_then(|s| s.to_str()) == Some("json") {
370                if let Some(_file_name) = path.file_stem().and_then(|s| s.to_str()) {
371                    let dataset = Dataset::load_from_file(&path).await?;
372                    collection.add_dataset(dataset)?;
373                }
374            }
375        }
376
377        Ok(collection)
378    }
379
380    /// Get collection statistics
381    pub fn statistics(&self) -> HashMap<String, serde_json::Value> {
382        let mut stats = HashMap::new();
383
384        stats.insert("total_datasets".to_string(), self.size().into());
385        stats.insert(
386            "total_rows".to_string(),
387            self.datasets.values().map(|d| d.row_count()).sum::<usize>().into(),
388        );
389
390        let dataset_info: Vec<serde_json::Value> = self
391            .datasets
392            .values()
393            .map(|d| {
394                serde_json::json!({
395                    "name": d.metadata.name,
396                    "schema": d.metadata.schema_name,
397                    "rows": d.row_count(),
398                    "format": format!("{:?}", d.metadata.format),
399                })
400            })
401            .collect();
402
403        stats.insert("datasets".to_string(), dataset_info.into());
404
405        stats
406    }
407}
408
409impl Default for DatasetCollection {
410    fn default() -> Self {
411        Self::new()
412    }
413}
414
/// Dataset utilities
pub mod utils {
    use super::*;

    /// Create a sample dataset collection with common schemas
    ///
    /// Builds a collection holding 50 generated "users" rows and 25
    /// generated "products" rows, using the generators in
    /// `crate::generator::utils`.
    pub async fn create_sample_collection() -> Result<DatasetCollection> {
        let mut collection = DatasetCollection::new();

        // Create user dataset
        let users_result = crate::generator::utils::generate_users(50).await?;
        let users_dataset = Dataset::from_generation_result(
            "users".to_string(),
            "User".to_string(),
            users_result,
            DataConfig {
                rows: 50,
                ..Default::default()
            },
        );
        collection.add_dataset(users_dataset)?;

        // Create product dataset
        let products_result = crate::generator::utils::generate_products(25).await?;
        let products_dataset = Dataset::from_generation_result(
            "products".to_string(),
            "Product".to_string(),
            products_result,
            DataConfig {
                rows: 25,
                ..Default::default()
            },
        );
        collection.add_dataset(products_dataset)?;

        Ok(collection)
    }

    /// Export dataset to different formats
    ///
    /// Serializes `dataset` in the requested `format` — ignoring the
    /// format recorded in the dataset's own metadata — and writes the
    /// result to `output_path`.
    pub async fn export_dataset(
        dataset: &Dataset,
        format: OutputFormat,
        output_path: &Path,
    ) -> Result<()> {
        let content = match format {
            OutputFormat::Json => dataset.to_json_string()?,
            OutputFormat::JsonLines => dataset.to_jsonl_string()?,
            OutputFormat::Csv => dataset.to_csv_string()?,
            OutputFormat::Yaml => dataset.to_yaml_string()?,
        };

        fs::write(output_path, content)
            .await
            .map_err(|e| crate::Error::generic(format!("Failed to export dataset: {}", e)))
    }

    /// Validate dataset against schema
    ///
    /// Checks every row for: field-level validation failures (via
    /// `field.validate_value`), missing required fields, and fields not
    /// declared in the schema. Rows that are not JSON objects are
    /// reported as errors. Dataset-level size constraints are checked
    /// afterwards. Returns the accumulated error messages wrapped in
    /// `Ok` — as written, this function never returns `Err`.
    pub fn validate_dataset_against_schema(
        dataset: &Dataset,
        schema: &SchemaDefinition,
    ) -> Result<Vec<String>> {
        let mut errors = Vec::new();

        // Validate each row in the dataset (row numbers in messages are 1-based)
        for (row_index, row) in dataset.data.iter().enumerate() {
            match row {
                serde_json::Value::Object(row_obj) => {
                    // Validate each field in the schema
                    for field in &schema.fields {
                        let field_name = &field.name;

                        if let Some(field_value) = row_obj.get(field_name) {
                            // Validate the field value
                            if let Err(validation_error) = field.validate_value(field_value) {
                                errors.push(format!(
                                    "Row {}: Field '{}': {}",
                                    row_index + 1,
                                    field_name,
                                    validation_error
                                ));
                            }
                        } else if field.required {
                            errors.push(format!(
                                "Row {}: Required field '{}' is missing",
                                row_index + 1,
                                field_name
                            ));
                        }
                    }

                    // Check for unexpected fields (present in the row but
                    // absent from the schema definition)
                    for (key, _) in row_obj {
                        let field_exists_in_schema = schema.fields.iter().any(|f| f.name == *key);
                        if !field_exists_in_schema {
                            errors.push(format!(
                                "Row {}: Unexpected field '{}' not defined in schema",
                                row_index + 1,
                                key
                            ));
                        }
                    }
                }
                _ => {
                    errors.push(format!("Row {}: Expected object, got {}", row_index + 1, row));
                }
            }
        }

        // Validate dataset-level constraints; a size violation is folded
        // into the error list rather than propagated as `Err`.
        if let Err(count_error) = validate_dataset_size(dataset, schema) {
            errors.push(count_error.to_string());
        }

        Ok(errors)
    }

    /// Validate dataset size constraints
    ///
    /// Reads optional `min_rows` / `max_rows` entries from the schema
    /// metadata (silently skipped when absent or not representable as
    /// `u64`) and errors when the dataset's row count falls outside
    /// that range.
    fn validate_dataset_size(dataset: &Dataset, schema: &SchemaDefinition) -> crate::Result<()> {
        // Check if there are any size constraints in schema metadata
        if let Some(min_rows) = schema.metadata.get("min_rows") {
            if let Some(min_count) = min_rows.as_u64() {
                if dataset.data.len() < min_count as usize {
                    return Err(Error::validation(format!(
                        "Dataset has {} rows, but schema requires at least {} rows",
                        dataset.data.len(),
                        min_count
                    )));
                }
            }
        }

        if let Some(max_rows) = schema.metadata.get("max_rows") {
            if let Some(max_count) = max_rows.as_u64() {
                if dataset.data.len() > max_count as usize {
                    return Err(Error::validation(format!(
                        "Dataset has {} rows, but schema allows at most {} rows",
                        dataset.data.len(),
                        max_count
                    )));
                }
            }
        }

        Ok(())
    }

    /// Validate dataset and return detailed result
    ///
    /// Wraps `validate_dataset_against_schema` into a
    /// [`DatasetValidationResult`]. NOTE(review): the `Err` arm below is
    /// currently unreachable because `validate_dataset_against_schema`
    /// always returns `Ok`; it is kept defensively in case that changes.
    pub fn validate_dataset_with_details(
        dataset: &Dataset,
        schema: &SchemaDefinition,
    ) -> DatasetValidationResult {
        let errors = validate_dataset_against_schema(dataset, schema);

        match errors {
            Ok(validation_errors) => {
                let warnings = Vec::new(); // Could add warnings for deprecated fields, etc.
                DatasetValidationResult {
                    valid: validation_errors.is_empty(),
                    errors: validation_errors,
                    warnings,
                    total_rows_validated: dataset.data.len(),
                }
            }
            Err(e) => DatasetValidationResult {
                valid: false,
                errors: vec![format!("Validation failed: {}", e)],
                warnings: Vec::new(),
                total_rows_validated: dataset.data.len(),
            },
        }
    }
}
586
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_dataset_validation_result_creation() {
        // A clean result: no errors, no warnings.
        let clean = DatasetValidationResult {
            valid: true,
            errors: Vec::new(),
            warnings: Vec::new(),
            total_rows_validated: 100,
        };

        assert_eq!(clean.total_rows_validated, 100);
        assert!(clean.valid);
    }

    #[test]
    fn test_dataset_validation_result_with_errors() {
        // A failed result carrying two error messages.
        let failed = DatasetValidationResult {
            valid: false,
            errors: vec![String::from("Error 1"), String::from("Error 2")],
            warnings: Vec::new(),
            total_rows_validated: 50,
        };

        assert_eq!(failed.errors.len(), 2);
        assert!(!failed.valid);
    }

    #[test]
    fn test_dataset_validation_result_with_warnings() {
        // Warnings alone do not make the result invalid.
        let noisy = DatasetValidationResult {
            valid: true,
            errors: Vec::new(),
            warnings: vec![String::from("Warning 1")],
            total_rows_validated: 75,
        };

        assert_eq!(noisy.warnings.len(), 1);
        assert!(noisy.valid);
    }

    #[test]
    fn test_dataset_metadata_creation() {
        // Fully populated metadata built by hand.
        let metadata = DatasetMetadata {
            name: "TestDataset".to_string(),
            description: Some("Test description".to_string()),
            schema_name: "TestSchema".to_string(),
            row_count: 100,
            config: DataConfig::default(),
            created_at: chrono::Utc::now(),
            generation_time_ms: 1000,
            format: OutputFormat::Json,
            file_size_bytes: Some(1024),
            tags: HashMap::new(),
        };

        assert_eq!(metadata.name, "TestDataset");
        assert!(metadata.description.is_some());
        assert_eq!(metadata.row_count, 100);
        assert_eq!(metadata.generation_time_ms, 1000);
    }
}
651}