// mockforge_data/dataset.rs
//! Dataset management and persistence
//!
//! This module is being refactored into sub-modules for better organization:
//! - core: Core dataset structures and basic operations
//! - collection: Dataset collection management and organization
//! - metadata: Dataset metadata tracking and management
//! - validation: Dataset validation and integrity checking
//! - persistence: Dataset storage, loading, and file operations
//!
//! NOTE(review): only `core` is declared below; the other sub-modules listed
//! above do not appear in this file yet — confirm the refactor status.

10// Re-export sub-modules for backward compatibility
11pub mod core;
12
13// Re-export commonly used types
14pub use core::*;
15
16// Legacy imports for compatibility
17use crate::{DataConfig, GenerationResult, OutputFormat, SchemaDefinition};
18use crate::{Error, Result};
19use serde::{Deserialize, Serialize};
20use std::collections::HashMap;
21use std::path::Path;
22use tokio::fs;
23
/// Dataset validation result
///
/// Aggregated outcome of validating a [`Dataset`] against a schema, as
/// produced by `utils::validate_dataset_with_details`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetValidationResult {
    /// Whether the dataset is valid (true exactly when `errors` is empty)
    pub valid: bool,
    /// Validation errors, one message per failed row/field check
    pub errors: Vec<String>,
    /// Validation warnings (currently always empty; reserved for non-fatal findings)
    pub warnings: Vec<String>,
    /// Total number of rows validated
    pub total_rows_validated: usize,
}
36
/// Dataset metadata
///
/// Describes a generated dataset: provenance (schema name, generation
/// config), shape (row count), timing, and storage details (format, size).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetMetadata {
    /// Dataset name (used as the lookup key in [`DatasetCollection`])
    pub name: String,
    /// Optional human-readable dataset description
    pub description: Option<String>,
    /// Schema name used to generate this dataset
    pub schema_name: String,
    /// Number of rows
    pub row_count: usize,
    /// Generation configuration used to produce the data
    pub config: DataConfig,
    /// Creation timestamp (UTC)
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// Generation time in milliseconds
    pub generation_time_ms: u128,
    /// File format used when the dataset is saved to disk
    pub format: OutputFormat,
    /// File size in bytes, if known
    pub file_size_bytes: Option<u64>,
    /// Additional free-form metadata tags
    pub tags: HashMap<String, String>,
}
61
62impl Default for DatasetMetadata {
63    fn default() -> Self {
64        Self {
65            name: String::new(),
66            description: None,
67            schema_name: String::new(),
68            row_count: 0,
69            config: DataConfig::default(),
70            created_at: chrono::Utc::now(),
71            generation_time_ms: 0,
72            format: OutputFormat::Json,
73            file_size_bytes: None,
74            tags: HashMap::new(),
75        }
76    }
77}
78
79impl DatasetMetadata {
80    /// Create new metadata
81    pub fn new(
82        name: String,
83        schema_name: String,
84        result: &GenerationResult,
85        config: DataConfig,
86    ) -> Self {
87        Self {
88            name,
89            description: None,
90            schema_name,
91            row_count: result.count,
92            config,
93            created_at: chrono::Utc::now(),
94            generation_time_ms: result.generation_time_ms,
95            format: OutputFormat::Json,
96            file_size_bytes: None,
97            tags: HashMap::new(),
98        }
99    }
100
101    /// Set description
102    pub fn with_description(mut self, description: String) -> Self {
103        self.description = Some(description);
104        self
105    }
106
107    /// Add a tag
108    pub fn with_tag(mut self, key: String, value: String) -> Self {
109        self.tags.insert(key, value);
110        self
111    }
112
113    /// Set file size
114    pub fn with_file_size(mut self, size: u64) -> Self {
115        self.file_size_bytes = Some(size);
116        self
117    }
118}
119
/// Dataset representation
///
/// Pairs generated rows (arbitrary JSON values, typically objects) with
/// their [`DatasetMetadata`].
#[derive(Debug)]
pub struct Dataset {
    /// Dataset metadata
    pub metadata: DatasetMetadata,
    /// Dataset rows as JSON values
    pub data: Vec<serde_json::Value>,
}
128
129impl Dataset {
130    /// Create a new dataset from generation result
131    pub fn new(metadata: DatasetMetadata, data: Vec<serde_json::Value>) -> Self {
132        Self { metadata, data }
133    }
134
135    /// Create dataset from generation result
136    pub fn from_generation_result(
137        name: String,
138        schema_name: String,
139        result: GenerationResult,
140        config: DataConfig,
141    ) -> Self {
142        let metadata = DatasetMetadata::new(name, schema_name, &result, config);
143        Self::new(metadata, result.data)
144    }
145
146    /// Get dataset as JSON string
147    pub fn to_json_string(&self) -> Result<String> {
148        serde_json::to_string_pretty(&self.data)
149            .map_err(|e| Error::generic(format!("Failed to serialize dataset: {}", e)))
150    }
151
152    /// Get dataset as JSON Lines string
153    pub fn to_jsonl_string(&self) -> Result<String> {
154        let lines: Result<Vec<String>> = self
155            .data
156            .iter()
157            .map(|value| {
158                serde_json::to_string(value)
159                    .map_err(|e| Error::generic(format!("JSON serialization error: {}", e)))
160            })
161            .collect();
162
163        lines.map(|lines| lines.join("\n"))
164    }
165
166    /// Get dataset as CSV string (basic implementation)
167    pub fn to_csv_string(&self) -> Result<String> {
168        if self.data.is_empty() {
169            return Ok(String::new());
170        }
171
172        let mut csv_output = String::new();
173
174        // Extract headers from first object
175        if let Some(first_row) = self.data.first() {
176            if let Some(obj) = first_row.as_object() {
177                let headers: Vec<String> = obj.keys().cloned().collect();
178                csv_output.push_str(&headers.join(","));
179                csv_output.push('\n');
180
181                // Add data rows
182                for row in &self.data {
183                    if let Some(obj) = row.as_object() {
184                        let values: Vec<String> = headers
185                            .iter()
186                            .map(|header| {
187                                obj.get(header)
188                                    .map(|v| v.to_string().trim_matches('"').to_string())
189                                    .unwrap_or_default()
190                            })
191                            .collect();
192                        csv_output.push_str(&values.join(","));
193                        csv_output.push('\n');
194                    }
195                }
196            }
197        }
198
199        Ok(csv_output)
200    }
201
202    /// Get dataset as YAML string
203    pub fn to_yaml_string(&self) -> Result<String> {
204        serde_yaml::to_string(&self.data)
205            .map_err(|e| Error::generic(format!("Failed to serialize dataset: {}", e)))
206    }
207
208    /// Save dataset to file
209    pub async fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
210        let content = match self.metadata.format {
211            OutputFormat::Json => self.to_json_string()?,
212            OutputFormat::JsonLines => self.to_jsonl_string()?,
213            OutputFormat::Csv => self.to_csv_string()?,
214            OutputFormat::Yaml => self.to_yaml_string()?,
215        };
216
217        fs::write(path, content)
218            .await
219            .map_err(|e| Error::generic(format!("Failed to write dataset file: {}", e)))
220    }
221
222    /// Load dataset from file
223    pub async fn load_from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
224        let content = fs::read_to_string(path)
225            .await
226            .map_err(|e| Error::generic(format!("Failed to read dataset file: {}", e)))?;
227
228        // Try to parse as JSON array first
229        if let Ok(data) = serde_json::from_str::<Vec<serde_json::Value>>(&content) {
230            let metadata = DatasetMetadata {
231                name: "loaded_dataset".to_string(),
232                description: None,
233                schema_name: "unknown".to_string(),
234                row_count: data.len(),
235                config: DataConfig::default(),
236                created_at: chrono::Utc::now(),
237                generation_time_ms: 0,
238                format: OutputFormat::Json,
239                file_size_bytes: Some(content.len() as u64),
240                tags: HashMap::new(),
241            };
242
243            return Ok(Self::new(metadata, data));
244        }
245
246        Err(Error::generic("Unsupported file format or invalid content"))
247    }
248
249    /// Get row count
250    pub fn row_count(&self) -> usize {
251        self.data.len()
252    }
253
254    /// Get sample rows
255    pub fn sample(&self, count: usize) -> &[serde_json::Value] {
256        let sample_count = count.min(self.data.len());
257        &self.data[..sample_count]
258    }
259
260    /// Filter dataset by predicate
261    pub fn filter<F>(&self, predicate: F) -> Dataset
262    where
263        F: Fn(&serde_json::Value) -> bool,
264    {
265        let filtered_data: Vec<serde_json::Value> =
266            self.data.iter().filter(|row| predicate(row)).cloned().collect();
267
268        let mut metadata = self.metadata.clone();
269        metadata.row_count = filtered_data.len();
270
271        Self::new(metadata, filtered_data)
272    }
273
274    /// Transform dataset with a mapping function
275    pub fn map<F>(&self, mapper: F) -> Dataset
276    where
277        F: Fn(&serde_json::Value) -> serde_json::Value,
278    {
279        let mapped_data: Vec<serde_json::Value> = self.data.iter().map(mapper).collect();
280
281        let metadata = self.metadata.clone();
282        Self::new(metadata, mapped_data)
283    }
284
285    /// Validate this dataset against a schema
286    pub fn validate_against_schema(&self, schema: &SchemaDefinition) -> Result<Vec<String>> {
287        utils::validate_dataset_against_schema(self, schema)
288    }
289
290    /// Validate this dataset with detailed results
291    pub fn validate_with_details(&self, schema: &SchemaDefinition) -> DatasetValidationResult {
292        utils::validate_dataset_with_details(self, schema)
293    }
294}
295
/// Dataset collection for managing multiple datasets
///
/// Datasets are keyed by their metadata `name`; inserting a dataset whose
/// name is already present replaces the existing entry.
#[derive(Debug)]
pub struct DatasetCollection {
    /// Datasets indexed by name
    datasets: HashMap<String, Dataset>,
}
302
303impl DatasetCollection {
304    /// Create a new dataset collection
305    pub fn new() -> Self {
306        Self {
307            datasets: HashMap::new(),
308        }
309    }
310
311    /// Add a dataset to the collection
312    pub fn add_dataset(&mut self, dataset: Dataset) -> Result<()> {
313        let name = dataset.metadata.name.clone();
314        self.datasets.insert(name, dataset);
315        Ok(())
316    }
317
318    /// Get a dataset by name
319    pub fn get_dataset(&self, name: &str) -> Option<&Dataset> {
320        self.datasets.get(name)
321    }
322
323    /// Remove a dataset
324    pub fn remove_dataset(&mut self, name: &str) -> Option<Dataset> {
325        self.datasets.remove(name)
326    }
327
328    /// List all dataset names
329    pub fn list_datasets(&self) -> Vec<String> {
330        self.datasets.keys().cloned().collect()
331    }
332
333    /// Get collection size
334    pub fn size(&self) -> usize {
335        self.datasets.len()
336    }
337
338    /// Save entire collection to directory
339    pub async fn save_to_directory<P: AsRef<Path>>(&self, dir_path: P) -> Result<()> {
340        fs::create_dir_all(&dir_path)
341            .await
342            .map_err(|e| Error::generic(format!("Failed to create directory: {}", e)))?;
343
344        for (name, dataset) in &self.datasets {
345            let file_path = dir_path.as_ref().join(format!("{}.json", name));
346            dataset.save_to_file(file_path).await?;
347        }
348
349        Ok(())
350    }
351
352    /// Load collection from directory
353    pub async fn load_from_directory<P: AsRef<Path>>(dir_path: P) -> Result<Self> {
354        let mut collection = Self::new();
355        let mut entries = fs::read_dir(dir_path)
356            .await
357            .map_err(|e| Error::generic(format!("Failed to read directory: {}", e)))?;
358
359        while let Some(entry) = entries
360            .next_entry()
361            .await
362            .map_err(|e| Error::generic(format!("Failed to read directory entry: {}", e)))?
363        {
364            let path = entry.path();
365            if path.extension().and_then(|s| s.to_str()) == Some("json") {
366                if let Some(_file_name) = path.file_stem().and_then(|s| s.to_str()) {
367                    let dataset = Dataset::load_from_file(&path).await?;
368                    collection.add_dataset(dataset)?;
369                }
370            }
371        }
372
373        Ok(collection)
374    }
375
376    /// Get collection statistics
377    pub fn statistics(&self) -> HashMap<String, serde_json::Value> {
378        let mut stats = HashMap::new();
379
380        stats.insert("total_datasets".to_string(), self.size().into());
381        stats.insert(
382            "total_rows".to_string(),
383            self.datasets.values().map(|d| d.row_count()).sum::<usize>().into(),
384        );
385
386        let dataset_info: Vec<serde_json::Value> = self
387            .datasets
388            .values()
389            .map(|d| {
390                serde_json::json!({
391                    "name": d.metadata.name,
392                    "schema": d.metadata.schema_name,
393                    "rows": d.row_count(),
394                    "format": format!("{:?}", d.metadata.format),
395                })
396            })
397            .collect();
398
399        stats.insert("datasets".to_string(), dataset_info.into());
400
401        stats
402    }
403}
404
impl Default for DatasetCollection {
    /// Equivalent to [`DatasetCollection::new`]: an empty collection.
    fn default() -> Self {
        Self::new()
    }
}
410
411/// Dataset utilities
412pub mod utils {
413    use super::*;
414
415    /// Create a sample dataset collection with common schemas
416    pub async fn create_sample_collection() -> Result<DatasetCollection> {
417        let mut collection = DatasetCollection::new();
418
419        // Create user dataset
420        let users_result = crate::generator::utils::generate_users(50).await?;
421        let users_dataset = Dataset::from_generation_result(
422            "users".to_string(),
423            "User".to_string(),
424            users_result,
425            DataConfig {
426                rows: 50,
427                ..Default::default()
428            },
429        );
430        collection.add_dataset(users_dataset)?;
431
432        // Create product dataset
433        let products_result = crate::generator::utils::generate_products(25).await?;
434        let products_dataset = Dataset::from_generation_result(
435            "products".to_string(),
436            "Product".to_string(),
437            products_result,
438            DataConfig {
439                rows: 25,
440                ..Default::default()
441            },
442        );
443        collection.add_dataset(products_dataset)?;
444
445        Ok(collection)
446    }
447
448    /// Export dataset to different formats
449    pub async fn export_dataset(
450        dataset: &Dataset,
451        format: OutputFormat,
452        output_path: &Path,
453    ) -> Result<()> {
454        let content = match format {
455            OutputFormat::Json => dataset.to_json_string()?,
456            OutputFormat::JsonLines => dataset.to_jsonl_string()?,
457            OutputFormat::Csv => dataset.to_csv_string()?,
458            OutputFormat::Yaml => dataset.to_yaml_string()?,
459        };
460
461        fs::write(output_path, content)
462            .await
463            .map_err(|e| Error::generic(format!("Failed to export dataset: {}", e)))
464    }
465
466    /// Validate dataset against schema
467    pub fn validate_dataset_against_schema(
468        dataset: &Dataset,
469        schema: &SchemaDefinition,
470    ) -> Result<Vec<String>> {
471        let mut errors = Vec::new();
472
473        // Validate each row in the dataset
474        for (row_index, row) in dataset.data.iter().enumerate() {
475            match row {
476                serde_json::Value::Object(row_obj) => {
477                    // Validate each field in the schema
478                    for field in &schema.fields {
479                        let field_name = &field.name;
480
481                        if let Some(field_value) = row_obj.get(field_name) {
482                            // Validate the field value
483                            if let Err(validation_error) = field.validate_value(field_value) {
484                                errors.push(format!(
485                                    "Row {}: Field '{}': {}",
486                                    row_index + 1,
487                                    field_name,
488                                    validation_error
489                                ));
490                            }
491                        } else if field.required {
492                            errors.push(format!(
493                                "Row {}: Required field '{}' is missing",
494                                row_index + 1,
495                                field_name
496                            ));
497                        }
498                    }
499
500                    // Check for unexpected fields
501                    for (key, _) in row_obj {
502                        let field_exists_in_schema = schema.fields.iter().any(|f| f.name == *key);
503                        if !field_exists_in_schema {
504                            errors.push(format!(
505                                "Row {}: Unexpected field '{}' not defined in schema",
506                                row_index + 1,
507                                key
508                            ));
509                        }
510                    }
511                }
512                _ => {
513                    errors.push(format!("Row {}: Expected object, got {}", row_index + 1, row));
514                }
515            }
516        }
517
518        // Validate dataset-level constraints
519        if let Err(count_error) = validate_dataset_size(dataset, schema) {
520            errors.push(count_error.to_string());
521        }
522
523        Ok(errors)
524    }
525
526    /// Validate dataset size constraints
527    fn validate_dataset_size(dataset: &Dataset, schema: &SchemaDefinition) -> Result<()> {
528        // Check if there are any size constraints in schema metadata
529        if let Some(min_rows) = schema.metadata.get("min_rows") {
530            if let Some(min_count) = min_rows.as_u64() {
531                if dataset.data.len() < min_count as usize {
532                    return Err(Error::validation(format!(
533                        "Dataset has {} rows, but schema requires at least {} rows",
534                        dataset.data.len(),
535                        min_count
536                    )));
537                }
538            }
539        }
540
541        if let Some(max_rows) = schema.metadata.get("max_rows") {
542            if let Some(max_count) = max_rows.as_u64() {
543                if dataset.data.len() > max_count as usize {
544                    return Err(Error::validation(format!(
545                        "Dataset has {} rows, but schema allows at most {} rows",
546                        dataset.data.len(),
547                        max_count
548                    )));
549                }
550            }
551        }
552
553        Ok(())
554    }
555
556    /// Validate dataset and return detailed result
557    pub fn validate_dataset_with_details(
558        dataset: &Dataset,
559        schema: &SchemaDefinition,
560    ) -> DatasetValidationResult {
561        let errors = validate_dataset_against_schema(dataset, schema);
562
563        match errors {
564            Ok(validation_errors) => {
565                let warnings = Vec::new(); // Could add warnings for deprecated fields, etc.
566                DatasetValidationResult {
567                    valid: validation_errors.is_empty(),
568                    errors: validation_errors,
569                    warnings,
570                    total_rows_validated: dataset.data.len(),
571                }
572            }
573            Err(e) => DatasetValidationResult {
574                valid: false,
575                errors: vec![format!("Validation failed: {}", e)],
576                warnings: Vec::new(),
577                total_rows_validated: dataset.data.len(),
578            },
579        }
580    }
581}
582
583#[cfg(test)]
584mod tests {
585    use super::*;
586
587    // =========================================================================
588    // DatasetValidationResult tests
589    // =========================================================================
590
591    #[test]
592    fn test_dataset_validation_result_creation() {
593        let result = DatasetValidationResult {
594            valid: true,
595            errors: vec![],
596            warnings: vec![],
597            total_rows_validated: 100,
598        };
599
600        assert!(result.valid);
601        assert_eq!(result.total_rows_validated, 100);
602    }
603
604    #[test]
605    fn test_dataset_validation_result_with_errors() {
606        let result = DatasetValidationResult {
607            valid: false,
608            errors: vec!["Error 1".to_string(), "Error 2".to_string()],
609            warnings: vec![],
610            total_rows_validated: 50,
611        };
612
613        assert!(!result.valid);
614        assert_eq!(result.errors.len(), 2);
615    }
616
617    #[test]
618    fn test_dataset_validation_result_with_warnings() {
619        let result = DatasetValidationResult {
620            valid: true,
621            errors: vec![],
622            warnings: vec!["Warning 1".to_string()],
623            total_rows_validated: 75,
624        };
625
626        assert!(result.valid);
627        assert_eq!(result.warnings.len(), 1);
628    }
629
630    #[test]
631    fn test_dataset_validation_result_clone() {
632        let result = DatasetValidationResult {
633            valid: true,
634            errors: vec!["err".to_string()],
635            warnings: vec!["warn".to_string()],
636            total_rows_validated: 50,
637        };
638        let cloned = result.clone();
639        assert_eq!(cloned.total_rows_validated, 50);
640        assert_eq!(cloned.errors.len(), 1);
641    }
642
643    #[test]
644    fn test_dataset_validation_result_serialize() {
645        let result = DatasetValidationResult {
646            valid: true,
647            errors: vec![],
648            warnings: vec![],
649            total_rows_validated: 25,
650        };
651        let json = serde_json::to_string(&result).unwrap();
652        assert!(json.contains("true"));
653        assert!(json.contains("25"));
654    }
655
656    #[test]
657    fn test_dataset_validation_result_deserialize() {
658        let json =
659            r#"{"valid": false, "errors": ["e1"], "warnings": [], "total_rows_validated": 10}"#;
660        let result: DatasetValidationResult = serde_json::from_str(json).unwrap();
661        assert!(!result.valid);
662        assert_eq!(result.errors.len(), 1);
663    }
664
665    #[test]
666    fn test_dataset_validation_result_debug() {
667        let result = DatasetValidationResult {
668            valid: true,
669            errors: vec![],
670            warnings: vec![],
671            total_rows_validated: 0,
672        };
673        let debug_str = format!("{:?}", result);
674        assert!(debug_str.contains("valid"));
675    }
676
677    // =========================================================================
678    // DatasetMetadata tests
679    // =========================================================================
680
681    #[test]
682    fn test_dataset_metadata_creation() {
683        let config = DataConfig::default();
684        let metadata = DatasetMetadata {
685            name: "TestDataset".to_string(),
686            description: Some("Test description".to_string()),
687            schema_name: "TestSchema".to_string(),
688            row_count: 100,
689            config,
690            created_at: chrono::Utc::now(),
691            generation_time_ms: 1000,
692            format: OutputFormat::Json,
693            file_size_bytes: Some(1024),
694            tags: HashMap::new(),
695        };
696
697        assert_eq!(metadata.name, "TestDataset");
698        assert_eq!(metadata.row_count, 100);
699        assert!(metadata.description.is_some());
700        assert_eq!(metadata.generation_time_ms, 1000);
701    }
702
703    #[test]
704    fn test_dataset_metadata_default() {
705        let metadata = DatasetMetadata::default();
706        assert!(metadata.name.is_empty());
707        assert!(metadata.description.is_none());
708        assert_eq!(metadata.row_count, 0);
709        assert!(metadata.tags.is_empty());
710    }
711
712    #[test]
713    fn test_dataset_metadata_new() {
714        let result = GenerationResult {
715            data: vec![serde_json::json!({"id": 1}), serde_json::json!({"id": 2})],
716            count: 2,
717            generation_time_ms: 100,
718            warnings: vec![],
719        };
720        let config = DataConfig::default();
721        let metadata = DatasetMetadata::new(
722            "my_dataset".to_string(),
723            "TestSchema".to_string(),
724            &result,
725            config,
726        );
727
728        assert_eq!(metadata.name, "my_dataset");
729        assert_eq!(metadata.schema_name, "TestSchema");
730        assert_eq!(metadata.row_count, 2);
731        assert_eq!(metadata.generation_time_ms, 100);
732    }
733
734    #[test]
735    fn test_dataset_metadata_with_description() {
736        let metadata = DatasetMetadata::default().with_description("A test dataset".to_string());
737        assert_eq!(metadata.description, Some("A test dataset".to_string()));
738    }
739
740    #[test]
741    fn test_dataset_metadata_with_tag() {
742        let metadata = DatasetMetadata::default()
743            .with_tag("env".to_string(), "test".to_string())
744            .with_tag("version".to_string(), "1.0".to_string());
745        assert_eq!(metadata.tags.get("env"), Some(&"test".to_string()));
746        assert_eq!(metadata.tags.get("version"), Some(&"1.0".to_string()));
747    }
748
749    #[test]
750    fn test_dataset_metadata_with_file_size() {
751        let metadata = DatasetMetadata::default().with_file_size(2048);
752        assert_eq!(metadata.file_size_bytes, Some(2048));
753    }
754
755    #[test]
756    fn test_dataset_metadata_clone() {
757        let metadata = DatasetMetadata {
758            name: "cloneable".to_string(),
759            ..Default::default()
760        };
761        let cloned = metadata.clone();
762        assert_eq!(cloned.name, "cloneable");
763    }
764
765    #[test]
766    fn test_dataset_metadata_serialize() {
767        let metadata = DatasetMetadata::default();
768        let json = serde_json::to_string(&metadata).unwrap();
769        assert!(json.contains("name"));
770        assert!(json.contains("row_count"));
771    }
772
773    // =========================================================================
774    // Dataset tests
775    // =========================================================================
776
777    #[test]
778    fn test_dataset_new() {
779        let metadata = DatasetMetadata::default();
780        let data = vec![
781            serde_json::json!({"id": 1, "name": "Alice"}),
782            serde_json::json!({"id": 2, "name": "Bob"}),
783        ];
784        let dataset = Dataset::new(metadata, data);
785        assert_eq!(dataset.row_count(), 2);
786    }
787
788    #[test]
789    fn test_dataset_from_generation_result() {
790        let result = GenerationResult {
791            data: vec![serde_json::json!({"id": 1})],
792            count: 1,
793            generation_time_ms: 50,
794            warnings: vec![],
795        };
796        let config = DataConfig::default();
797        let dataset = Dataset::from_generation_result(
798            "test_dataset".to_string(),
799            "TestSchema".to_string(),
800            result,
801            config,
802        );
803        assert_eq!(dataset.metadata.name, "test_dataset");
804        assert_eq!(dataset.row_count(), 1);
805    }
806
807    #[test]
808    fn test_dataset_to_json_string() {
809        let metadata = DatasetMetadata::default();
810        let data = vec![serde_json::json!({"id": 1}), serde_json::json!({"id": 2})];
811        let dataset = Dataset::new(metadata, data);
812        let json = dataset.to_json_string().unwrap();
813        assert!(json.contains("id"));
814        assert!(json.contains("1"));
815        assert!(json.contains("2"));
816    }
817
818    #[test]
819    fn test_dataset_to_jsonl_string() {
820        let metadata = DatasetMetadata::default();
821        let data = vec![serde_json::json!({"id": 1}), serde_json::json!({"id": 2})];
822        let dataset = Dataset::new(metadata, data);
823        let jsonl = dataset.to_jsonl_string().unwrap();
824        let lines: Vec<&str> = jsonl.split('\n').collect();
825        assert_eq!(lines.len(), 2);
826    }
827
828    #[test]
829    fn test_dataset_to_csv_string() {
830        let metadata = DatasetMetadata::default();
831        let data = vec![
832            serde_json::json!({"id": 1, "name": "Alice"}),
833            serde_json::json!({"id": 2, "name": "Bob"}),
834        ];
835        let dataset = Dataset::new(metadata, data);
836        let csv = dataset.to_csv_string().unwrap();
837        assert!(csv.contains("id") || csv.contains("name")); // Headers
838        assert!(csv.contains("Alice") || csv.contains("Bob")); // Data
839    }
840
841    #[test]
842    fn test_dataset_to_csv_string_empty() {
843        let metadata = DatasetMetadata::default();
844        let dataset = Dataset::new(metadata, vec![]);
845        let csv = dataset.to_csv_string().unwrap();
846        assert!(csv.is_empty());
847    }
848
849    #[test]
850    fn test_dataset_to_yaml_string() {
851        let metadata = DatasetMetadata::default();
852        let data = vec![serde_json::json!({"id": 1})];
853        let dataset = Dataset::new(metadata, data);
854        let yaml = dataset.to_yaml_string().unwrap();
855        assert!(yaml.contains("id"));
856    }
857
858    #[test]
859    fn test_dataset_row_count() {
860        let metadata = DatasetMetadata::default();
861        let data = vec![
862            serde_json::json!({}),
863            serde_json::json!({}),
864            serde_json::json!({}),
865        ];
866        let dataset = Dataset::new(metadata, data);
867        assert_eq!(dataset.row_count(), 3);
868    }
869
870    #[test]
871    fn test_dataset_sample() {
872        let metadata = DatasetMetadata::default();
873        let data: Vec<serde_json::Value> = (0..10).map(|i| serde_json::json!({"id": i})).collect();
874        let dataset = Dataset::new(metadata, data);
875
876        let sample = dataset.sample(3);
877        assert_eq!(sample.len(), 3);
878
879        let big_sample = dataset.sample(100);
880        assert_eq!(big_sample.len(), 10); // Capped at dataset size
881    }
882
883    #[test]
884    fn test_dataset_filter() {
885        let metadata = DatasetMetadata {
886            name: "filterable".to_string(),
887            ..Default::default()
888        };
889        let data = vec![
890            serde_json::json!({"id": 1, "active": true}),
891            serde_json::json!({"id": 2, "active": false}),
892            serde_json::json!({"id": 3, "active": true}),
893        ];
894        let dataset = Dataset::new(metadata, data);
895
896        let filtered =
897            dataset.filter(|row| row.get("active").and_then(|v| v.as_bool()).unwrap_or(false));
898
899        assert_eq!(filtered.row_count(), 2);
900        assert_eq!(filtered.metadata.row_count, 2);
901    }
902
903    #[test]
904    fn test_dataset_map() {
905        let metadata = DatasetMetadata::default();
906        let data = vec![
907            serde_json::json!({"value": 1}),
908            serde_json::json!({"value": 2}),
909        ];
910        let dataset = Dataset::new(metadata, data);
911
912        let mapped = dataset.map(|row| {
913            let mut new_row = row.clone();
914            if let Some(obj) = new_row.as_object_mut() {
915                obj.insert("doubled".to_string(), serde_json::json!(true));
916            }
917            new_row
918        });
919
920        assert_eq!(mapped.row_count(), 2);
921        assert!(mapped.data[0].get("doubled").is_some());
922    }
923
924    #[test]
925    fn test_dataset_debug() {
926        let metadata = DatasetMetadata {
927            name: "debug_test".to_string(),
928            ..Default::default()
929        };
930        let dataset = Dataset::new(metadata, vec![]);
931        let debug_str = format!("{:?}", dataset);
932        assert!(debug_str.contains("metadata"));
933    }
934
935    // =========================================================================
936    // DatasetCollection tests
937    // =========================================================================
938
939    #[test]
940    fn test_dataset_collection_new() {
941        let collection = DatasetCollection::new();
942        assert_eq!(collection.size(), 0);
943    }
944
945    #[test]
946    fn test_dataset_collection_default() {
947        let collection = DatasetCollection::default();
948        assert_eq!(collection.size(), 0);
949    }
950
951    #[test]
952    fn test_dataset_collection_add_dataset() {
953        let mut collection = DatasetCollection::new();
954        let dataset = Dataset::new(
955            DatasetMetadata {
956                name: "test1".to_string(),
957                ..Default::default()
958            },
959            vec![],
960        );
961        collection.add_dataset(dataset).unwrap();
962        assert_eq!(collection.size(), 1);
963    }
964
965    #[test]
966    fn test_dataset_collection_get_dataset() {
967        let mut collection = DatasetCollection::new();
968        let dataset = Dataset::new(
969            DatasetMetadata {
970                name: "findme".to_string(),
971                ..Default::default()
972            },
973            vec![serde_json::json!({"id": 1})],
974        );
975        collection.add_dataset(dataset).unwrap();
976
977        let found = collection.get_dataset("findme");
978        assert!(found.is_some());
979        assert_eq!(found.unwrap().row_count(), 1);
980    }
981
982    #[test]
983    fn test_dataset_collection_get_dataset_not_found() {
984        let collection = DatasetCollection::new();
985        assert!(collection.get_dataset("nonexistent").is_none());
986    }
987
988    #[test]
989    fn test_dataset_collection_remove_dataset() {
990        let mut collection = DatasetCollection::new();
991        let dataset = Dataset::new(
992            DatasetMetadata {
993                name: "removable".to_string(),
994                ..Default::default()
995            },
996            vec![],
997        );
998        collection.add_dataset(dataset).unwrap();
999
1000        let removed = collection.remove_dataset("removable");
1001        assert!(removed.is_some());
1002        assert_eq!(collection.size(), 0);
1003    }
1004
1005    #[test]
1006    fn test_dataset_collection_list_datasets() {
1007        let mut collection = DatasetCollection::new();
1008        collection
1009            .add_dataset(Dataset::new(
1010                DatasetMetadata {
1011                    name: "a".to_string(),
1012                    ..Default::default()
1013                },
1014                vec![],
1015            ))
1016            .unwrap();
1017        collection
1018            .add_dataset(Dataset::new(
1019                DatasetMetadata {
1020                    name: "b".to_string(),
1021                    ..Default::default()
1022                },
1023                vec![],
1024            ))
1025            .unwrap();
1026
1027        let names = collection.list_datasets();
1028        assert_eq!(names.len(), 2);
1029        assert!(names.contains(&"a".to_string()));
1030        assert!(names.contains(&"b".to_string()));
1031    }
1032
1033    #[test]
1034    fn test_dataset_collection_size() {
1035        let mut collection = DatasetCollection::new();
1036        assert_eq!(collection.size(), 0);
1037
1038        collection
1039            .add_dataset(Dataset::new(
1040                DatasetMetadata {
1041                    name: "x".to_string(),
1042                    ..Default::default()
1043                },
1044                vec![],
1045            ))
1046            .unwrap();
1047        assert_eq!(collection.size(), 1);
1048    }
1049
1050    #[test]
1051    fn test_dataset_collection_statistics() {
1052        let mut collection = DatasetCollection::new();
1053        collection
1054            .add_dataset(Dataset::new(
1055                DatasetMetadata {
1056                    name: "ds1".to_string(),
1057                    schema_name: "Schema1".to_string(),
1058                    ..Default::default()
1059                },
1060                vec![serde_json::json!({}), serde_json::json!({})],
1061            ))
1062            .unwrap();
1063        collection
1064            .add_dataset(Dataset::new(
1065                DatasetMetadata {
1066                    name: "ds2".to_string(),
1067                    schema_name: "Schema2".to_string(),
1068                    ..Default::default()
1069                },
1070                vec![serde_json::json!({})],
1071            ))
1072            .unwrap();
1073
1074        let stats = collection.statistics();
1075        assert_eq!(stats.get("total_datasets").and_then(|v| v.as_u64()), Some(2));
1076        assert_eq!(stats.get("total_rows").and_then(|v| v.as_u64()), Some(3));
1077    }
1078
1079    #[test]
1080    fn test_dataset_collection_debug() {
1081        let collection = DatasetCollection::new();
1082        let debug_str = format!("{:?}", collection);
1083        assert!(debug_str.contains("datasets"));
1084    }
1085}