mockforge_data/
dataset.rs

1//! Dataset management and persistence
2//!
3//! This module has been refactored into sub-modules for better organization:
4//! - core: Core dataset structures and basic operations
5//! - collection: Dataset collection management and organization
6//! - metadata: Dataset metadata tracking and management
7//! - validation: Dataset validation and integrity checking
8//! - persistence: Dataset storage, loading, and file operations
9
10// Re-export sub-modules for backward compatibility
11pub mod core;
12
13// Re-export commonly used types
14pub use core::*;
15
16// Legacy imports for compatibility
17use crate::{DataConfig, GenerationResult, OutputFormat, SchemaDefinition};
18use crate::{Error, Result};
19use serde::{Deserialize, Serialize};
20use std::collections::HashMap;
21use std::path::Path;
22use tokio::fs;
23
24/// Dataset validation result
25#[derive(Debug, Clone, Serialize, Deserialize)]
26pub struct DatasetValidationResult {
27    /// Whether the dataset is valid
28    pub valid: bool,
29    /// Validation errors
30    pub errors: Vec<String>,
31    /// Validation warnings
32    pub warnings: Vec<String>,
33    /// Total number of rows validated
34    pub total_rows_validated: usize,
35}
36
37/// Dataset metadata
38#[derive(Debug, Clone, Serialize, Deserialize)]
39pub struct DatasetMetadata {
40    /// Dataset name
41    pub name: String,
42    /// Dataset description
43    pub description: Option<String>,
44    /// Schema name used to generate this dataset
45    pub schema_name: String,
46    /// Number of rows
47    pub row_count: usize,
48    /// Generation configuration
49    pub config: DataConfig,
50    /// Creation timestamp
51    pub created_at: chrono::DateTime<chrono::Utc>,
52    /// Generation time in milliseconds
53    pub generation_time_ms: u128,
54    /// File format
55    pub format: OutputFormat,
56    /// File size in bytes
57    pub file_size_bytes: Option<u64>,
58    /// Additional metadata
59    pub tags: HashMap<String, String>,
60}
61
62impl Default for DatasetMetadata {
63    fn default() -> Self {
64        Self {
65            name: String::new(),
66            description: None,
67            schema_name: String::new(),
68            row_count: 0,
69            config: DataConfig::default(),
70            created_at: chrono::Utc::now(),
71            generation_time_ms: 0,
72            format: OutputFormat::Json,
73            file_size_bytes: None,
74            tags: HashMap::new(),
75        }
76    }
77}
78
79impl DatasetMetadata {
80    /// Create new metadata
81    pub fn new(
82        name: String,
83        schema_name: String,
84        result: &GenerationResult,
85        config: DataConfig,
86    ) -> Self {
87        Self {
88            name,
89            description: None,
90            schema_name,
91            row_count: result.count,
92            config,
93            created_at: chrono::Utc::now(),
94            generation_time_ms: result.generation_time_ms,
95            format: OutputFormat::Json,
96            file_size_bytes: None,
97            tags: HashMap::new(),
98        }
99    }
100
101    /// Set description
102    pub fn with_description(mut self, description: String) -> Self {
103        self.description = Some(description);
104        self
105    }
106
107    /// Add a tag
108    pub fn with_tag(mut self, key: String, value: String) -> Self {
109        self.tags.insert(key, value);
110        self
111    }
112
113    /// Set file size
114    pub fn with_file_size(mut self, size: u64) -> Self {
115        self.file_size_bytes = Some(size);
116        self
117    }
118}
119
120/// Dataset representation
121#[derive(Debug)]
122pub struct Dataset {
123    /// Dataset metadata
124    pub metadata: DatasetMetadata,
125    /// Dataset data
126    pub data: Vec<serde_json::Value>,
127}
128
129impl Dataset {
130    /// Create a new dataset from generation result
131    pub fn new(metadata: DatasetMetadata, data: Vec<serde_json::Value>) -> Self {
132        Self { metadata, data }
133    }
134
135    /// Create dataset from generation result
136    pub fn from_generation_result(
137        name: String,
138        schema_name: String,
139        result: GenerationResult,
140        config: DataConfig,
141    ) -> Self {
142        let metadata = DatasetMetadata::new(name, schema_name, &result, config);
143        Self::new(metadata, result.data)
144    }
145
146    /// Get dataset as JSON string
147    pub fn to_json_string(&self) -> Result<String> {
148        serde_json::to_string_pretty(&self.data)
149            .map_err(|e| crate::Error::generic(format!("Failed to serialize dataset: {}", e)))
150    }
151
152    /// Get dataset as JSON Lines string
153    pub fn to_jsonl_string(&self) -> Result<String> {
154        let lines: Result<Vec<String>> = self
155            .data
156            .iter()
157            .map(|value| {
158                serde_json::to_string(value)
159                    .map_err(|e| crate::Error::generic(format!("JSON serialization error: {}", e)))
160            })
161            .collect();
162
163        lines.map(|lines| lines.join("\n"))
164    }
165
166    /// Get dataset as CSV string (basic implementation)
167    pub fn to_csv_string(&self) -> Result<String> {
168        if self.data.is_empty() {
169            return Ok(String::new());
170        }
171
172        let mut csv_output = String::new();
173
174        // Extract headers from first object
175        if let Some(first_row) = self.data.first() {
176            if let Some(obj) = first_row.as_object() {
177                let headers: Vec<String> = obj.keys().cloned().collect();
178                csv_output.push_str(&headers.join(","));
179                csv_output.push('\n');
180
181                // Add data rows
182                for row in &self.data {
183                    if let Some(obj) = row.as_object() {
184                        let values: Vec<String> = headers
185                            .iter()
186                            .map(|header| {
187                                obj.get(header)
188                                    .map(|v| v.to_string().trim_matches('"').to_string())
189                                    .unwrap_or_default()
190                            })
191                            .collect();
192                        csv_output.push_str(&values.join(","));
193                        csv_output.push('\n');
194                    }
195                }
196            }
197        }
198
199        Ok(csv_output)
200    }
201
202    /// Get dataset as YAML string
203    pub fn to_yaml_string(&self) -> Result<String> {
204        serde_yaml::to_string(&self.data)
205            .map_err(|e| crate::Error::generic(format!("Failed to serialize dataset: {}", e)))
206    }
207
208    /// Save dataset to file
209    pub async fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
210        let content = match self.metadata.format {
211            OutputFormat::Json => self.to_json_string()?,
212            OutputFormat::JsonLines => self.to_jsonl_string()?,
213            OutputFormat::Csv => self.to_csv_string()?,
214            OutputFormat::Yaml => self.to_yaml_string()?,
215        };
216
217        fs::write(path, content)
218            .await
219            .map_err(|e| crate::Error::generic(format!("Failed to write dataset file: {}", e)))
220    }
221
222    /// Load dataset from file
223    pub async fn load_from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
224        let content = fs::read_to_string(path)
225            .await
226            .map_err(|e| crate::Error::generic(format!("Failed to read dataset file: {}", e)))?;
227
228        // Try to parse as JSON array first
229        if let Ok(data) = serde_json::from_str::<Vec<serde_json::Value>>(&content) {
230            let metadata = DatasetMetadata {
231                name: "loaded_dataset".to_string(),
232                description: None,
233                schema_name: "unknown".to_string(),
234                row_count: data.len(),
235                config: DataConfig::default(),
236                created_at: chrono::Utc::now(),
237                generation_time_ms: 0,
238                format: OutputFormat::Json,
239                file_size_bytes: Some(content.len() as u64),
240                tags: HashMap::new(),
241            };
242
243            return Ok(Self::new(metadata, data));
244        }
245
246        Err(crate::Error::generic("Unsupported file format or invalid content"))
247    }
248
249    /// Get row count
250    pub fn row_count(&self) -> usize {
251        self.data.len()
252    }
253
254    /// Get sample rows
255    pub fn sample(&self, count: usize) -> &[serde_json::Value] {
256        let sample_count = count.min(self.data.len());
257        &self.data[..sample_count]
258    }
259
260    /// Filter dataset by predicate
261    pub fn filter<F>(&self, predicate: F) -> Dataset
262    where
263        F: Fn(&serde_json::Value) -> bool,
264    {
265        let filtered_data: Vec<serde_json::Value> =
266            self.data.iter().filter(|row| predicate(row)).cloned().collect();
267
268        let mut metadata = self.metadata.clone();
269        metadata.row_count = filtered_data.len();
270
271        Self::new(metadata, filtered_data)
272    }
273
274    /// Transform dataset with a mapping function
275    pub fn map<F>(&self, mapper: F) -> Dataset
276    where
277        F: Fn(&serde_json::Value) -> serde_json::Value,
278    {
279        let mapped_data: Vec<serde_json::Value> = self.data.iter().map(mapper).collect();
280
281        let metadata = self.metadata.clone();
282        Self::new(metadata, mapped_data)
283    }
284
285    /// Validate this dataset against a schema
286    pub fn validate_against_schema(&self, schema: &SchemaDefinition) -> Result<Vec<String>> {
287        utils::validate_dataset_against_schema(self, schema)
288    }
289
290    /// Validate this dataset with detailed results
291    pub fn validate_with_details(&self, schema: &SchemaDefinition) -> DatasetValidationResult {
292        utils::validate_dataset_with_details(self, schema)
293    }
294}
295
296/// Dataset collection for managing multiple datasets
297#[derive(Debug)]
298pub struct DatasetCollection {
299    /// Datasets indexed by name
300    datasets: HashMap<String, Dataset>,
301    /// Collection metadata
302    #[allow(dead_code)]
303    metadata: HashMap<String, String>,
304}
305
306impl DatasetCollection {
307    /// Create a new dataset collection
308    pub fn new() -> Self {
309        Self {
310            datasets: HashMap::new(),
311            metadata: HashMap::new(),
312        }
313    }
314
315    /// Add a dataset to the collection
316    pub fn add_dataset(&mut self, dataset: Dataset) -> Result<()> {
317        let name = dataset.metadata.name.clone();
318        self.datasets.insert(name, dataset);
319        Ok(())
320    }
321
322    /// Get a dataset by name
323    pub fn get_dataset(&self, name: &str) -> Option<&Dataset> {
324        self.datasets.get(name)
325    }
326
327    /// Remove a dataset
328    pub fn remove_dataset(&mut self, name: &str) -> Option<Dataset> {
329        self.datasets.remove(name)
330    }
331
332    /// List all dataset names
333    pub fn list_datasets(&self) -> Vec<String> {
334        self.datasets.keys().cloned().collect()
335    }
336
337    /// Get collection size
338    pub fn size(&self) -> usize {
339        self.datasets.len()
340    }
341
342    /// Save entire collection to directory
343    pub async fn save_to_directory<P: AsRef<Path>>(&self, dir_path: P) -> Result<()> {
344        fs::create_dir_all(&dir_path)
345            .await
346            .map_err(|e| crate::Error::generic(format!("Failed to create directory: {}", e)))?;
347
348        for (name, dataset) in &self.datasets {
349            let file_path = dir_path.as_ref().join(format!("{}.json", name));
350            dataset.save_to_file(file_path).await?;
351        }
352
353        Ok(())
354    }
355
356    /// Load collection from directory
357    pub async fn load_from_directory<P: AsRef<Path>>(dir_path: P) -> Result<Self> {
358        let mut collection = Self::new();
359        let mut entries = fs::read_dir(dir_path)
360            .await
361            .map_err(|e| crate::Error::generic(format!("Failed to read directory: {}", e)))?;
362
363        while let Some(entry) = entries
364            .next_entry()
365            .await
366            .map_err(|e| crate::Error::generic(format!("Failed to read directory entry: {}", e)))?
367        {
368            let path = entry.path();
369            if path.extension().and_then(|s| s.to_str()) == Some("json") {
370                if let Some(_file_name) = path.file_stem().and_then(|s| s.to_str()) {
371                    let dataset = Dataset::load_from_file(&path).await?;
372                    collection.add_dataset(dataset)?;
373                }
374            }
375        }
376
377        Ok(collection)
378    }
379
380    /// Get collection statistics
381    pub fn statistics(&self) -> HashMap<String, serde_json::Value> {
382        let mut stats = HashMap::new();
383
384        stats.insert("total_datasets".to_string(), self.size().into());
385        stats.insert(
386            "total_rows".to_string(),
387            self.datasets.values().map(|d| d.row_count()).sum::<usize>().into(),
388        );
389
390        let dataset_info: Vec<serde_json::Value> = self
391            .datasets
392            .values()
393            .map(|d| {
394                serde_json::json!({
395                    "name": d.metadata.name,
396                    "schema": d.metadata.schema_name,
397                    "rows": d.row_count(),
398                    "format": format!("{:?}", d.metadata.format),
399                })
400            })
401            .collect();
402
403        stats.insert("datasets".to_string(), dataset_info.into());
404
405        stats
406    }
407}
408
409impl Default for DatasetCollection {
410    fn default() -> Self {
411        Self::new()
412    }
413}
414
415/// Dataset utilities
416pub mod utils {
417    use super::*;
418
419    /// Create a sample dataset collection with common schemas
420    pub async fn create_sample_collection() -> Result<DatasetCollection> {
421        let mut collection = DatasetCollection::new();
422
423        // Create user dataset
424        let users_result = crate::generator::utils::generate_users(50).await?;
425        let users_dataset = Dataset::from_generation_result(
426            "users".to_string(),
427            "User".to_string(),
428            users_result,
429            DataConfig {
430                rows: 50,
431                ..Default::default()
432            },
433        );
434        collection.add_dataset(users_dataset)?;
435
436        // Create product dataset
437        let products_result = crate::generator::utils::generate_products(25).await?;
438        let products_dataset = Dataset::from_generation_result(
439            "products".to_string(),
440            "Product".to_string(),
441            products_result,
442            DataConfig {
443                rows: 25,
444                ..Default::default()
445            },
446        );
447        collection.add_dataset(products_dataset)?;
448
449        Ok(collection)
450    }
451
452    /// Export dataset to different formats
453    pub async fn export_dataset(
454        dataset: &Dataset,
455        format: OutputFormat,
456        output_path: &Path,
457    ) -> Result<()> {
458        let content = match format {
459            OutputFormat::Json => dataset.to_json_string()?,
460            OutputFormat::JsonLines => dataset.to_jsonl_string()?,
461            OutputFormat::Csv => dataset.to_csv_string()?,
462            OutputFormat::Yaml => dataset.to_yaml_string()?,
463        };
464
465        fs::write(output_path, content)
466            .await
467            .map_err(|e| crate::Error::generic(format!("Failed to export dataset: {}", e)))
468    }
469
470    /// Validate dataset against schema
471    pub fn validate_dataset_against_schema(
472        dataset: &Dataset,
473        schema: &SchemaDefinition,
474    ) -> Result<Vec<String>> {
475        let mut errors = Vec::new();
476
477        // Validate each row in the dataset
478        for (row_index, row) in dataset.data.iter().enumerate() {
479            match row {
480                serde_json::Value::Object(row_obj) => {
481                    // Validate each field in the schema
482                    for field in &schema.fields {
483                        let field_name = &field.name;
484
485                        if let Some(field_value) = row_obj.get(field_name) {
486                            // Validate the field value
487                            if let Err(validation_error) = field.validate_value(field_value) {
488                                errors.push(format!(
489                                    "Row {}: Field '{}': {}",
490                                    row_index + 1,
491                                    field_name,
492                                    validation_error
493                                ));
494                            }
495                        } else if field.required {
496                            errors.push(format!(
497                                "Row {}: Required field '{}' is missing",
498                                row_index + 1,
499                                field_name
500                            ));
501                        }
502                    }
503
504                    // Check for unexpected fields
505                    for (key, _) in row_obj {
506                        let field_exists_in_schema = schema.fields.iter().any(|f| f.name == *key);
507                        if !field_exists_in_schema {
508                            errors.push(format!(
509                                "Row {}: Unexpected field '{}' not defined in schema",
510                                row_index + 1,
511                                key
512                            ));
513                        }
514                    }
515                }
516                _ => {
517                    errors.push(format!("Row {}: Expected object, got {}", row_index + 1, row));
518                }
519            }
520        }
521
522        // Validate dataset-level constraints
523        if let Err(count_error) = validate_dataset_size(dataset, schema) {
524            errors.push(count_error.to_string());
525        }
526
527        Ok(errors)
528    }
529
530    /// Validate dataset size constraints
531    fn validate_dataset_size(dataset: &Dataset, schema: &SchemaDefinition) -> crate::Result<()> {
532        // Check if there are any size constraints in schema metadata
533        if let Some(min_rows) = schema.metadata.get("min_rows") {
534            if let Some(min_count) = min_rows.as_u64() {
535                if dataset.data.len() < min_count as usize {
536                    return Err(Error::validation(format!(
537                        "Dataset has {} rows, but schema requires at least {} rows",
538                        dataset.data.len(),
539                        min_count
540                    )));
541                }
542            }
543        }
544
545        if let Some(max_rows) = schema.metadata.get("max_rows") {
546            if let Some(max_count) = max_rows.as_u64() {
547                if dataset.data.len() > max_count as usize {
548                    return Err(Error::validation(format!(
549                        "Dataset has {} rows, but schema allows at most {} rows",
550                        dataset.data.len(),
551                        max_count
552                    )));
553                }
554            }
555        }
556
557        Ok(())
558    }
559
560    /// Validate dataset and return detailed result
561    pub fn validate_dataset_with_details(
562        dataset: &Dataset,
563        schema: &SchemaDefinition,
564    ) -> DatasetValidationResult {
565        let errors = validate_dataset_against_schema(dataset, schema);
566
567        match errors {
568            Ok(validation_errors) => {
569                let warnings = Vec::new(); // Could add warnings for deprecated fields, etc.
570                DatasetValidationResult {
571                    valid: validation_errors.is_empty(),
572                    errors: validation_errors,
573                    warnings,
574                    total_rows_validated: dataset.data.len(),
575                }
576            }
577            Err(e) => DatasetValidationResult {
578                valid: false,
579                errors: vec![format!("Validation failed: {}", e)],
580                warnings: Vec::new(),
581                total_rows_validated: dataset.data.len(),
582            },
583        }
584    }
585}
586
587#[cfg(test)]
588mod tests {
589    use super::*;
590
591    // =========================================================================
592    // DatasetValidationResult tests
593    // =========================================================================
594
595    #[test]
596    fn test_dataset_validation_result_creation() {
597        let result = DatasetValidationResult {
598            valid: true,
599            errors: vec![],
600            warnings: vec![],
601            total_rows_validated: 100,
602        };
603
604        assert!(result.valid);
605        assert_eq!(result.total_rows_validated, 100);
606    }
607
608    #[test]
609    fn test_dataset_validation_result_with_errors() {
610        let result = DatasetValidationResult {
611            valid: false,
612            errors: vec!["Error 1".to_string(), "Error 2".to_string()],
613            warnings: vec![],
614            total_rows_validated: 50,
615        };
616
617        assert!(!result.valid);
618        assert_eq!(result.errors.len(), 2);
619    }
620
621    #[test]
622    fn test_dataset_validation_result_with_warnings() {
623        let result = DatasetValidationResult {
624            valid: true,
625            errors: vec![],
626            warnings: vec!["Warning 1".to_string()],
627            total_rows_validated: 75,
628        };
629
630        assert!(result.valid);
631        assert_eq!(result.warnings.len(), 1);
632    }
633
634    #[test]
635    fn test_dataset_validation_result_clone() {
636        let result = DatasetValidationResult {
637            valid: true,
638            errors: vec!["err".to_string()],
639            warnings: vec!["warn".to_string()],
640            total_rows_validated: 50,
641        };
642        let cloned = result.clone();
643        assert_eq!(cloned.total_rows_validated, 50);
644        assert_eq!(cloned.errors.len(), 1);
645    }
646
647    #[test]
648    fn test_dataset_validation_result_serialize() {
649        let result = DatasetValidationResult {
650            valid: true,
651            errors: vec![],
652            warnings: vec![],
653            total_rows_validated: 25,
654        };
655        let json = serde_json::to_string(&result).unwrap();
656        assert!(json.contains("true"));
657        assert!(json.contains("25"));
658    }
659
660    #[test]
661    fn test_dataset_validation_result_deserialize() {
662        let json =
663            r#"{"valid": false, "errors": ["e1"], "warnings": [], "total_rows_validated": 10}"#;
664        let result: DatasetValidationResult = serde_json::from_str(json).unwrap();
665        assert!(!result.valid);
666        assert_eq!(result.errors.len(), 1);
667    }
668
669    #[test]
670    fn test_dataset_validation_result_debug() {
671        let result = DatasetValidationResult {
672            valid: true,
673            errors: vec![],
674            warnings: vec![],
675            total_rows_validated: 0,
676        };
677        let debug_str = format!("{:?}", result);
678        assert!(debug_str.contains("valid"));
679    }
680
681    // =========================================================================
682    // DatasetMetadata tests
683    // =========================================================================
684
685    #[test]
686    fn test_dataset_metadata_creation() {
687        let config = DataConfig::default();
688        let metadata = DatasetMetadata {
689            name: "TestDataset".to_string(),
690            description: Some("Test description".to_string()),
691            schema_name: "TestSchema".to_string(),
692            row_count: 100,
693            config,
694            created_at: chrono::Utc::now(),
695            generation_time_ms: 1000,
696            format: OutputFormat::Json,
697            file_size_bytes: Some(1024),
698            tags: HashMap::new(),
699        };
700
701        assert_eq!(metadata.name, "TestDataset");
702        assert_eq!(metadata.row_count, 100);
703        assert!(metadata.description.is_some());
704        assert_eq!(metadata.generation_time_ms, 1000);
705    }
706
707    #[test]
708    fn test_dataset_metadata_default() {
709        let metadata = DatasetMetadata::default();
710        assert!(metadata.name.is_empty());
711        assert!(metadata.description.is_none());
712        assert_eq!(metadata.row_count, 0);
713        assert!(metadata.tags.is_empty());
714    }
715
716    #[test]
717    fn test_dataset_metadata_new() {
718        let result = GenerationResult {
719            data: vec![serde_json::json!({"id": 1}), serde_json::json!({"id": 2})],
720            count: 2,
721            generation_time_ms: 100,
722            warnings: vec![],
723        };
724        let config = DataConfig::default();
725        let metadata = DatasetMetadata::new(
726            "my_dataset".to_string(),
727            "TestSchema".to_string(),
728            &result,
729            config,
730        );
731
732        assert_eq!(metadata.name, "my_dataset");
733        assert_eq!(metadata.schema_name, "TestSchema");
734        assert_eq!(metadata.row_count, 2);
735        assert_eq!(metadata.generation_time_ms, 100);
736    }
737
738    #[test]
739    fn test_dataset_metadata_with_description() {
740        let metadata = DatasetMetadata::default().with_description("A test dataset".to_string());
741        assert_eq!(metadata.description, Some("A test dataset".to_string()));
742    }
743
744    #[test]
745    fn test_dataset_metadata_with_tag() {
746        let metadata = DatasetMetadata::default()
747            .with_tag("env".to_string(), "test".to_string())
748            .with_tag("version".to_string(), "1.0".to_string());
749        assert_eq!(metadata.tags.get("env"), Some(&"test".to_string()));
750        assert_eq!(metadata.tags.get("version"), Some(&"1.0".to_string()));
751    }
752
753    #[test]
754    fn test_dataset_metadata_with_file_size() {
755        let metadata = DatasetMetadata::default().with_file_size(2048);
756        assert_eq!(metadata.file_size_bytes, Some(2048));
757    }
758
759    #[test]
760    fn test_dataset_metadata_clone() {
761        let metadata = DatasetMetadata {
762            name: "cloneable".to_string(),
763            ..Default::default()
764        };
765        let cloned = metadata.clone();
766        assert_eq!(cloned.name, "cloneable");
767    }
768
769    #[test]
770    fn test_dataset_metadata_serialize() {
771        let metadata = DatasetMetadata::default();
772        let json = serde_json::to_string(&metadata).unwrap();
773        assert!(json.contains("name"));
774        assert!(json.contains("row_count"));
775    }
776
777    // =========================================================================
778    // Dataset tests
779    // =========================================================================
780
781    #[test]
782    fn test_dataset_new() {
783        let metadata = DatasetMetadata::default();
784        let data = vec![
785            serde_json::json!({"id": 1, "name": "Alice"}),
786            serde_json::json!({"id": 2, "name": "Bob"}),
787        ];
788        let dataset = Dataset::new(metadata, data);
789        assert_eq!(dataset.row_count(), 2);
790    }
791
792    #[test]
793    fn test_dataset_from_generation_result() {
794        let result = GenerationResult {
795            data: vec![serde_json::json!({"id": 1})],
796            count: 1,
797            generation_time_ms: 50,
798            warnings: vec![],
799        };
800        let config = DataConfig::default();
801        let dataset = Dataset::from_generation_result(
802            "test_dataset".to_string(),
803            "TestSchema".to_string(),
804            result,
805            config,
806        );
807        assert_eq!(dataset.metadata.name, "test_dataset");
808        assert_eq!(dataset.row_count(), 1);
809    }
810
811    #[test]
812    fn test_dataset_to_json_string() {
813        let metadata = DatasetMetadata::default();
814        let data = vec![serde_json::json!({"id": 1}), serde_json::json!({"id": 2})];
815        let dataset = Dataset::new(metadata, data);
816        let json = dataset.to_json_string().unwrap();
817        assert!(json.contains("id"));
818        assert!(json.contains("1"));
819        assert!(json.contains("2"));
820    }
821
822    #[test]
823    fn test_dataset_to_jsonl_string() {
824        let metadata = DatasetMetadata::default();
825        let data = vec![serde_json::json!({"id": 1}), serde_json::json!({"id": 2})];
826        let dataset = Dataset::new(metadata, data);
827        let jsonl = dataset.to_jsonl_string().unwrap();
828        let lines: Vec<&str> = jsonl.split('\n').collect();
829        assert_eq!(lines.len(), 2);
830    }
831
832    #[test]
833    fn test_dataset_to_csv_string() {
834        let metadata = DatasetMetadata::default();
835        let data = vec![
836            serde_json::json!({"id": 1, "name": "Alice"}),
837            serde_json::json!({"id": 2, "name": "Bob"}),
838        ];
839        let dataset = Dataset::new(metadata, data);
840        let csv = dataset.to_csv_string().unwrap();
841        assert!(csv.contains("id") || csv.contains("name")); // Headers
842        assert!(csv.contains("Alice") || csv.contains("Bob")); // Data
843    }
844
845    #[test]
846    fn test_dataset_to_csv_string_empty() {
847        let metadata = DatasetMetadata::default();
848        let dataset = Dataset::new(metadata, vec![]);
849        let csv = dataset.to_csv_string().unwrap();
850        assert!(csv.is_empty());
851    }
852
853    #[test]
854    fn test_dataset_to_yaml_string() {
855        let metadata = DatasetMetadata::default();
856        let data = vec![serde_json::json!({"id": 1})];
857        let dataset = Dataset::new(metadata, data);
858        let yaml = dataset.to_yaml_string().unwrap();
859        assert!(yaml.contains("id"));
860    }
861
862    #[test]
863    fn test_dataset_row_count() {
864        let metadata = DatasetMetadata::default();
865        let data = vec![
866            serde_json::json!({}),
867            serde_json::json!({}),
868            serde_json::json!({}),
869        ];
870        let dataset = Dataset::new(metadata, data);
871        assert_eq!(dataset.row_count(), 3);
872    }
873
874    #[test]
875    fn test_dataset_sample() {
876        let metadata = DatasetMetadata::default();
877        let data: Vec<serde_json::Value> = (0..10).map(|i| serde_json::json!({"id": i})).collect();
878        let dataset = Dataset::new(metadata, data);
879
880        let sample = dataset.sample(3);
881        assert_eq!(sample.len(), 3);
882
883        let big_sample = dataset.sample(100);
884        assert_eq!(big_sample.len(), 10); // Capped at dataset size
885    }
886
887    #[test]
888    fn test_dataset_filter() {
889        let metadata = DatasetMetadata {
890            name: "filterable".to_string(),
891            ..Default::default()
892        };
893        let data = vec![
894            serde_json::json!({"id": 1, "active": true}),
895            serde_json::json!({"id": 2, "active": false}),
896            serde_json::json!({"id": 3, "active": true}),
897        ];
898        let dataset = Dataset::new(metadata, data);
899
900        let filtered =
901            dataset.filter(|row| row.get("active").and_then(|v| v.as_bool()).unwrap_or(false));
902
903        assert_eq!(filtered.row_count(), 2);
904        assert_eq!(filtered.metadata.row_count, 2);
905    }
906
907    #[test]
908    fn test_dataset_map() {
909        let metadata = DatasetMetadata::default();
910        let data = vec![
911            serde_json::json!({"value": 1}),
912            serde_json::json!({"value": 2}),
913        ];
914        let dataset = Dataset::new(metadata, data);
915
916        let mapped = dataset.map(|row| {
917            let mut new_row = row.clone();
918            if let Some(obj) = new_row.as_object_mut() {
919                obj.insert("doubled".to_string(), serde_json::json!(true));
920            }
921            new_row
922        });
923
924        assert_eq!(mapped.row_count(), 2);
925        assert!(mapped.data[0].get("doubled").is_some());
926    }
927
928    #[test]
929    fn test_dataset_debug() {
930        let metadata = DatasetMetadata {
931            name: "debug_test".to_string(),
932            ..Default::default()
933        };
934        let dataset = Dataset::new(metadata, vec![]);
935        let debug_str = format!("{:?}", dataset);
936        assert!(debug_str.contains("metadata"));
937    }
938
939    // =========================================================================
940    // DatasetCollection tests
941    // =========================================================================
942
943    #[test]
944    fn test_dataset_collection_new() {
945        let collection = DatasetCollection::new();
946        assert_eq!(collection.size(), 0);
947    }
948
949    #[test]
950    fn test_dataset_collection_default() {
951        let collection = DatasetCollection::default();
952        assert_eq!(collection.size(), 0);
953    }
954
955    #[test]
956    fn test_dataset_collection_add_dataset() {
957        let mut collection = DatasetCollection::new();
958        let dataset = Dataset::new(
959            DatasetMetadata {
960                name: "test1".to_string(),
961                ..Default::default()
962            },
963            vec![],
964        );
965        collection.add_dataset(dataset).unwrap();
966        assert_eq!(collection.size(), 1);
967    }
968
969    #[test]
970    fn test_dataset_collection_get_dataset() {
971        let mut collection = DatasetCollection::new();
972        let dataset = Dataset::new(
973            DatasetMetadata {
974                name: "findme".to_string(),
975                ..Default::default()
976            },
977            vec![serde_json::json!({"id": 1})],
978        );
979        collection.add_dataset(dataset).unwrap();
980
981        let found = collection.get_dataset("findme");
982        assert!(found.is_some());
983        assert_eq!(found.unwrap().row_count(), 1);
984    }
985
986    #[test]
987    fn test_dataset_collection_get_dataset_not_found() {
988        let collection = DatasetCollection::new();
989        assert!(collection.get_dataset("nonexistent").is_none());
990    }
991
992    #[test]
993    fn test_dataset_collection_remove_dataset() {
994        let mut collection = DatasetCollection::new();
995        let dataset = Dataset::new(
996            DatasetMetadata {
997                name: "removable".to_string(),
998                ..Default::default()
999            },
1000            vec![],
1001        );
1002        collection.add_dataset(dataset).unwrap();
1003
1004        let removed = collection.remove_dataset("removable");
1005        assert!(removed.is_some());
1006        assert_eq!(collection.size(), 0);
1007    }
1008
1009    #[test]
1010    fn test_dataset_collection_list_datasets() {
1011        let mut collection = DatasetCollection::new();
1012        collection
1013            .add_dataset(Dataset::new(
1014                DatasetMetadata {
1015                    name: "a".to_string(),
1016                    ..Default::default()
1017                },
1018                vec![],
1019            ))
1020            .unwrap();
1021        collection
1022            .add_dataset(Dataset::new(
1023                DatasetMetadata {
1024                    name: "b".to_string(),
1025                    ..Default::default()
1026                },
1027                vec![],
1028            ))
1029            .unwrap();
1030
1031        let names = collection.list_datasets();
1032        assert_eq!(names.len(), 2);
1033        assert!(names.contains(&"a".to_string()));
1034        assert!(names.contains(&"b".to_string()));
1035    }
1036
1037    #[test]
1038    fn test_dataset_collection_size() {
1039        let mut collection = DatasetCollection::new();
1040        assert_eq!(collection.size(), 0);
1041
1042        collection
1043            .add_dataset(Dataset::new(
1044                DatasetMetadata {
1045                    name: "x".to_string(),
1046                    ..Default::default()
1047                },
1048                vec![],
1049            ))
1050            .unwrap();
1051        assert_eq!(collection.size(), 1);
1052    }
1053
1054    #[test]
1055    fn test_dataset_collection_statistics() {
1056        let mut collection = DatasetCollection::new();
1057        collection
1058            .add_dataset(Dataset::new(
1059                DatasetMetadata {
1060                    name: "ds1".to_string(),
1061                    schema_name: "Schema1".to_string(),
1062                    ..Default::default()
1063                },
1064                vec![serde_json::json!({}), serde_json::json!({})],
1065            ))
1066            .unwrap();
1067        collection
1068            .add_dataset(Dataset::new(
1069                DatasetMetadata {
1070                    name: "ds2".to_string(),
1071                    schema_name: "Schema2".to_string(),
1072                    ..Default::default()
1073                },
1074                vec![serde_json::json!({})],
1075            ))
1076            .unwrap();
1077
1078        let stats = collection.statistics();
1079        assert_eq!(stats.get("total_datasets").and_then(|v| v.as_u64()), Some(2));
1080        assert_eq!(stats.get("total_rows").and_then(|v| v.as_u64()), Some(3));
1081    }
1082
1083    #[test]
1084    fn test_dataset_collection_debug() {
1085        let collection = DatasetCollection::new();
1086        let debug_str = format!("{:?}", collection);
1087        assert!(debug_str.contains("datasets"));
1088    }
1089}