mockforge_data/dataset/core.rs

//! Core dataset structures and basic operations
//!
//! This module provides the fundamental data structures for datasets,
//! including dataset definitions, rows, and basic operations.
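//!
//! # Examples
//!
//! A minimal usage sketch (not compiled as a doc-test; the import path of these
//! types and the example field names are illustrative assumptions):
//!
//! ```ignore
//! use std::collections::HashMap;
//!
//! // Build an empty dataset and add a single row.
//! let mut dataset = Dataset::default();
//! let mut data = HashMap::new();
//! data.insert("name".to_string(), serde_json::json!("example"));
//! dataset.add_row(DatasetRow::new("row-1".to_string(), data));
//!
//! assert_eq!(dataset.size(), 1);
//! println!("{}", dataset.summary());
//! ```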

use crate::{DataConfig, OutputFormat};
use crate::{Error, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// Dataset validation result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetValidationResult {
    /// Whether the dataset is valid
    pub valid: bool,
    /// Validation errors
    pub errors: Vec<String>,
    /// Validation warnings
    pub warnings: Vec<String>,
    /// Total number of rows validated
    pub total_rows_validated: usize,
}

/// Dataset metadata
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetMetadata {
    /// Dataset name
    pub name: String,
    /// Dataset description
    pub description: Option<String>,
    /// Schema name used to generate this dataset
    pub schema_name: String,
    /// Number of rows
    pub row_count: usize,
    /// Generation configuration
    pub config: DataConfig,
    /// Creation timestamp
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// Generation time in milliseconds
    pub generation_time_ms: u128,
    /// File format
    pub format: OutputFormat,
    /// File size in bytes
    pub file_size_bytes: Option<u64>,
    /// Additional metadata
    pub tags: HashMap<String, String>,
}

impl Default for DatasetMetadata {
    fn default() -> Self {
        Self {
            name: String::new(),
            description: None,
            schema_name: String::new(),
            row_count: 0,
            config: DataConfig::default(),
            created_at: chrono::Utc::now(),
            generation_time_ms: 0,
            format: OutputFormat::Json,
            file_size_bytes: None,
            tags: HashMap::new(),
        }
    }
}

impl DatasetMetadata {
    /// Create new dataset metadata
    pub fn new(
        name: String,
        schema_name: String,
        config: DataConfig,
        format: OutputFormat,
    ) -> Self {
        Self {
            name,
            schema_name,
            config,
            format,
            created_at: chrono::Utc::now(),
            ..Default::default()
        }
    }

    /// Update generation time
    pub fn set_generation_time(&mut self, time_ms: u128) {
        self.generation_time_ms = time_ms;
    }

    /// Set file size
    pub fn set_file_size(&mut self, size_bytes: u64) {
        self.file_size_bytes = Some(size_bytes);
    }

    /// Add tag
    pub fn add_tag(&mut self, key: String, value: String) {
        self.tags.insert(key, value);
    }

    /// Get tag value
    pub fn get_tag(&self, key: &str) -> Option<&String> {
        self.tags.get(key)
    }

    /// Remove tag
    pub fn remove_tag(&mut self, key: &str) -> Option<String> {
        self.tags.remove(key)
    }

    /// Get total size in bytes (estimated)
    pub fn estimated_size_bytes(&self) -> u64 {
        self.file_size_bytes.unwrap_or_else(|| {
            // Rough estimate: each row ~1KB
            (self.row_count * 1024) as u64
        })
    }

    /// Check if dataset is empty
    pub fn is_empty(&self) -> bool {
        self.row_count == 0
    }

    /// Get human-readable size
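    ///
    /// # Examples
    ///
    /// A minimal sketch (not compiled as a doc-test); with no recorded file size,
    /// the size is estimated at roughly 1 KB per row:
    ///
    /// ```ignore
    /// let mut metadata = DatasetMetadata::default();
    /// metadata.row_count = 2048; // 2048 rows * ~1 KB = ~2 MB estimated
    /// assert_eq!(metadata.human_readable_size(), "2.0 MB");
    /// ```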
    pub fn human_readable_size(&self) -> String {
        let bytes = self.estimated_size_bytes();
        if bytes < 1024 {
            format!("{} B", bytes)
        } else if bytes < 1024 * 1024 {
            format!("{:.1} KB", bytes as f64 / 1024.0)
        } else if bytes < 1024 * 1024 * 1024 {
            format!("{:.1} MB", bytes as f64 / (1024.0 * 1024.0))
        } else {
            format!("{:.1} GB", bytes as f64 / (1024.0 * 1024.0 * 1024.0))
        }
    }
}

/// Single row of data in a dataset
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetRow {
    /// Row ID
    pub id: String,
    /// Row data as key-value pairs
    pub data: HashMap<String, serde_json::Value>,
    /// Row metadata
    pub metadata: HashMap<String, String>,
    /// Creation timestamp
    pub created_at: chrono::DateTime<chrono::Utc>,
}

impl DatasetRow {
    /// Create a new dataset row
    pub fn new(id: String, data: HashMap<String, serde_json::Value>) -> Self {
        Self {
            id,
            data,
            metadata: HashMap::new(),
            created_at: chrono::Utc::now(),
        }
    }

    /// Add metadata to the row
    pub fn add_metadata(&mut self, key: String, value: String) {
        self.metadata.insert(key, value);
    }

    /// Get metadata value
    pub fn get_metadata(&self, key: &str) -> Option<&String> {
        self.metadata.get(key)
    }

    /// Remove metadata
    pub fn remove_metadata(&mut self, key: &str) -> Option<String> {
        self.metadata.remove(key)
    }

    /// Get field value
    pub fn get_field(&self, field_name: &str) -> Option<&serde_json::Value> {
        self.data.get(field_name)
    }

    /// Set field value
    pub fn set_field(&mut self, field_name: String, value: serde_json::Value) {
        self.data.insert(field_name, value);
    }

    /// Check if row contains a field
    pub fn has_field(&self, field_name: &str) -> bool {
        self.data.contains_key(field_name)
    }

    /// Get all field names
    pub fn field_names(&self) -> Vec<&String> {
        self.data.keys().collect()
    }

    /// Get row as JSON value
    pub fn to_json(&self) -> serde_json::Value {
        serde_json::json!({
            "id": self.id,
            "data": self.data,
            "metadata": self.metadata,
            "created_at": self.created_at,
        })
    }
}

/// Dataset statistics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetStats {
    /// Total number of rows
    pub row_count: usize,
    /// Number of columns/fields
    pub column_count: usize,
    /// Total size in bytes
    pub total_size_bytes: u64,
    /// Average row size in bytes
    pub average_row_size_bytes: f64,
    /// Smallest row size in bytes
    pub min_row_size_bytes: u64,
    /// Largest row size in bytes
    pub max_row_size_bytes: u64,
    /// Field name statistics
    pub field_stats: HashMap<String, FieldStats>,
    /// Generation timestamp
    pub generated_at: chrono::DateTime<chrono::Utc>,
}

/// Field statistics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldStats {
    /// Field name
    pub field_name: String,
    /// Field type
    pub field_type: String,
    /// Number of non-null values
    pub non_null_count: usize,
    /// Number of null values
    pub null_count: usize,
    /// Number of unique values
    pub unique_count: usize,
    /// Minimum value (if numeric)
    pub min_value: Option<serde_json::Value>,
    /// Maximum value (if numeric)
    pub max_value: Option<serde_json::Value>,
    /// Average value (if numeric)
    pub average_value: Option<f64>,
    /// Most common values
    pub most_common_values: Vec<(serde_json::Value, usize)>,
}

/// Dataset represents a collection of generated data
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Dataset {
    /// Dataset metadata
    pub metadata: DatasetMetadata,
    /// Dataset rows
    pub rows: Vec<DatasetRow>,
    /// Dataset statistics
    pub stats: Option<DatasetStats>,
}

impl Dataset {
    /// Create a new empty dataset
    pub fn new(
        name: String,
        schema_name: String,
        config: DataConfig,
        format: OutputFormat,
    ) -> Self {
        Self {
            metadata: DatasetMetadata::new(name, schema_name, config, format),
            rows: Vec::new(),
            stats: None,
        }
    }

    /// Create a dataset with pre-existing rows
    pub fn with_rows(
        name: String,
        schema_name: String,
        config: DataConfig,
        format: OutputFormat,
        rows: Vec<DatasetRow>,
    ) -> Self {
        let mut dataset = Self::new(name, schema_name, config, format);
        dataset.rows = rows;
        dataset.metadata.row_count = dataset.rows.len();
        dataset
    }

    /// Add a row to the dataset
    pub fn add_row(&mut self, row: DatasetRow) {
        self.rows.push(row);
        self.metadata.row_count = self.rows.len();
    }

    /// Add multiple rows to the dataset
    pub fn add_rows(&mut self, rows: Vec<DatasetRow>) {
        self.rows.extend(rows);
        self.metadata.row_count = self.rows.len();
    }

    /// Get row by ID
    pub fn get_row(&self, id: &str) -> Option<&DatasetRow> {
        self.rows.iter().find(|row| row.id == id)
    }

    /// Get row by ID (mutable)
    pub fn get_row_mut(&mut self, id: &str) -> Option<&mut DatasetRow> {
        self.rows.iter_mut().find(|row| row.id == id)
    }

    /// Remove row by ID
    pub fn remove_row(&mut self, id: &str) -> Option<DatasetRow> {
        if let Some(pos) = self.rows.iter().position(|row| row.id == id) {
            let row = self.rows.remove(pos);
            self.metadata.row_count = self.rows.len();
            Some(row)
        } else {
            None
        }
    }

    /// Get rows by metadata key-value
    pub fn get_rows_by_metadata(&self, key: &str, value: &str) -> Vec<&DatasetRow> {
        self.rows
            .iter()
            .filter(|row| row.get_metadata(key).map(|v| v == value).unwrap_or(false))
            .collect()
    }

    /// Get all row IDs
    pub fn row_ids(&self) -> Vec<&String> {
        self.rows.iter().map(|row| &row.id).collect()
    }

    /// Check if dataset is empty
    pub fn is_empty(&self) -> bool {
        self.rows.is_empty()
    }

    /// Get dataset size
    pub fn size(&self) -> usize {
        self.rows.len()
    }

    /// Get field names from the first row (if available)
    pub fn field_names(&self) -> Vec<&String> {
        if let Some(first_row) = self.rows.first() {
            first_row.field_names()
        } else {
            Vec::new()
        }
    }

    /// Calculate dataset statistics
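    ///
    /// Populates `self.stats` with per-field statistics derived from the current rows.
    ///
    /// # Examples
    ///
    /// A minimal sketch (not compiled as a doc-test; the "score" field is an
    /// illustrative assumption):
    ///
    /// ```ignore
    /// let mut dataset = Dataset::default();
    /// let mut data = std::collections::HashMap::new();
    /// data.insert("score".to_string(), serde_json::json!(10));
    /// dataset.add_row(DatasetRow::new("1".to_string(), data));
    ///
    /// dataset.calculate_stats().expect("stats calculation should succeed");
    /// let stats = dataset.stats.as_ref().expect("stats are populated after calculation");
    /// assert_eq!(stats.row_count, 1);
    /// assert_eq!(stats.field_stats["score"].field_type, "number");
    /// ```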
    pub fn calculate_stats(&mut self) -> Result<()> {
        if self.rows.is_empty() {
            self.stats = Some(DatasetStats {
                row_count: 0,
                column_count: 0,
                total_size_bytes: 0,
                average_row_size_bytes: 0.0,
                min_row_size_bytes: 0,
                max_row_size_bytes: 0,
                field_stats: HashMap::new(),
                generated_at: chrono::Utc::now(),
            });
            return Ok(());
        }

        let mut total_size = 0u64;
        let mut row_sizes = Vec::new();

        // Temporary structure for collecting field statistics
        #[derive(Default)]
        struct TempFieldStats {
            field_type: Option<String>,
            non_null_count: usize,
            null_count: usize,
            unique_values: std::collections::HashSet<serde_json::Value>,
            numeric_values: Vec<f64>,
            frequency: std::collections::HashMap<serde_json::Value, usize>,
        }

        let mut temp_field_stats: HashMap<String, TempFieldStats> = HashMap::new();

        // Get field names from first row
        let field_names = self.field_names();
        for field_name in &field_names {
            temp_field_stats.insert(field_name.to_string(), TempFieldStats::default());
        }

        // Process each row
        for row in &self.rows {
            let row_json = row.to_json();
            let row_size = serde_json::to_string(&row_json)
                .map_err(|e| Error::generic(format!("Failed to serialize row: {}", e)))?
                .len() as u64;

            total_size += row_size;
            row_sizes.push(row_size);

            // Update field statistics
            for (field_name, field_value) in &row.data {
                if let Some(temp_stats) = temp_field_stats.get_mut(field_name) {
                    match field_value {
                        serde_json::Value::Null => temp_stats.null_count += 1,
                        _ => {
                            temp_stats.non_null_count += 1;

                            // Type detection
                            let value_type = match field_value {
                                serde_json::Value::Bool(_) => "boolean",
                                serde_json::Value::Number(_) => "number",
                                serde_json::Value::String(_) => "string",
                                serde_json::Value::Array(_) => "array",
                                serde_json::Value::Object(_) => "object",
                                serde_json::Value::Null => unreachable!(),
                            };

                            if temp_stats.field_type.is_none() {
                                temp_stats.field_type = Some(value_type.to_string());
                            } else if temp_stats.field_type.as_ref()
                                != Some(&value_type.to_string())
                            {
                                temp_stats.field_type = Some("mixed".to_string());
                            }

                            // Collect unique values
                            temp_stats.unique_values.insert(field_value.clone());

                            // Collect numeric values for min/max/avg
                            if let serde_json::Value::Number(num) = field_value {
                                if let Some(f) = num.as_f64() {
                                    temp_stats.numeric_values.push(f);
                                }
                            }

                            // Update frequency
                            *temp_stats.frequency.entry(field_value.clone()).or_insert(0) += 1;
                        }
                    }
                }
            }
        }

        // Convert temporary stats to final FieldStats
        let mut field_stats: HashMap<String, FieldStats> = HashMap::new();
        for (field_name, temp_stats) in temp_field_stats {
            let field_type = temp_stats.field_type.unwrap_or_else(|| "unknown".to_string());

            let (min_value, max_value, average_value) = if field_type == "number"
                && !temp_stats.numeric_values.is_empty()
            {
                let min = temp_stats.numeric_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
                let max =
                    temp_stats.numeric_values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
                let sum: f64 = temp_stats.numeric_values.iter().sum();
                let avg = sum / temp_stats.numeric_values.len() as f64;
                (
                    Some(serde_json::Value::Number(
                        serde_json::Number::from_f64(min).unwrap_or(serde_json::Number::from(0)),
                    )),
                    Some(serde_json::Value::Number(
                        serde_json::Number::from_f64(max).unwrap_or(serde_json::Number::from(0)),
                    )),
                    Some(avg),
                )
            } else {
                (None, None, None)
            };

            // Get most common values (top 5)
            let mut most_common: Vec<(serde_json::Value, usize)> =
                temp_stats.frequency.into_iter().collect();
            most_common.sort_by(|a, b| b.1.cmp(&a.1));
            most_common.truncate(5);

            field_stats.insert(
                field_name.clone(),
                FieldStats {
                    field_name,
                    field_type,
                    non_null_count: temp_stats.non_null_count,
                    null_count: temp_stats.null_count,
                    unique_count: temp_stats.unique_values.len(),
                    min_value,
                    max_value,
                    average_value,
                    most_common_values: most_common,
                },
            );
        }

        let row_count = self.rows.len();
        let average_row_size = if row_count > 0 {
            total_size as f64 / row_count as f64
        } else {
            0.0
        };

        let min_row_size = row_sizes.iter().min().unwrap_or(&0);
        let max_row_size = row_sizes.iter().max().unwrap_or(&0);

        self.stats = Some(DatasetStats {
            row_count,
            column_count: field_names.len(),
            total_size_bytes: total_size,
            average_row_size_bytes: average_row_size,
            min_row_size_bytes: *min_row_size,
            max_row_size_bytes: *max_row_size,
            field_stats,
            generated_at: chrono::Utc::now(),
        });

        Ok(())
    }

    /// Validate dataset integrity
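    ///
    /// The result is valid only when no errors were found; warnings are reported
    /// but do not affect validity.
    ///
    /// # Examples
    ///
    /// A minimal sketch (not compiled as a doc-test):
    ///
    /// ```ignore
    /// let dataset = Dataset::default();
    /// let result = dataset.validate();
    /// assert!(result.valid);
    /// assert!(result.errors.is_empty());
    /// ```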
    pub fn validate(&self) -> DatasetValidationResult {
        let mut errors = Vec::new();
        let mut warnings = Vec::new();

        // Check metadata
        if self.metadata.name.is_empty() {
            errors.push("Dataset name cannot be empty".to_string());
        }

        if self.metadata.schema_name.is_empty() {
            errors.push("Schema name cannot be empty".to_string());
        }

        // Check rows
        for (index, row) in self.rows.iter().enumerate() {
            if row.id.is_empty() {
                errors.push(format!("Row {} has empty ID", index));
            }

            if row.data.is_empty() {
                warnings.push(format!("Row {} has no data", index));
            }
        }

        DatasetValidationResult {
            valid: errors.is_empty(),
            errors,
            warnings,
            total_rows_validated: self.rows.len(),
        }
    }

    /// Export dataset to JSON
    pub fn to_json(&self) -> Result<String> {
        serde_json::to_string_pretty(self)
            .map_err(|e| Error::generic(format!("Failed to serialize dataset: {}", e)))
    }

    /// Export dataset rows to JSON array
    pub fn rows_to_json(&self) -> Result<String> {
        let rows_json: Vec<_> = self.rows.iter().map(|row| row.to_json()).collect();
        serde_json::to_string_pretty(&rows_json)
            .map_err(|e| Error::generic(format!("Failed to serialize dataset rows: {}", e)))
    }

    /// Get dataset summary
    pub fn summary(&self) -> String {
        format!(
            "Dataset '{}' - {} rows, {} columns, {}",
            self.metadata.name,
            self.rows.len(),
            self.field_names().len(),
            self.metadata.human_readable_size()
        )
    }
}

impl Default for Dataset {
    fn default() -> Self {
        Self::new(
            "Untitled Dataset".to_string(),
            "Unknown Schema".to_string(),
            DataConfig::default(),
            OutputFormat::Json,
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_dataset_new() {
        let dataset = Dataset::new(
            "TestDataset".to_string(),
            "TestSchema".to_string(),
            DataConfig::default(),
            OutputFormat::Json,
        );

        assert_eq!(dataset.metadata.name, "TestDataset");
        assert_eq!(dataset.metadata.schema_name, "TestSchema");
        assert_eq!(dataset.rows.len(), 0);
    }
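
    // Illustrative round-trip sketch for row insertion, lookup, and removal;
    // the row ID and field name are arbitrary test values.
    #[test]
    fn test_dataset_add_get_remove_row() {
        let mut dataset = Dataset::default();
        let mut data = HashMap::new();
        data.insert("name".to_string(), serde_json::json!("alice"));
        dataset.add_row(DatasetRow::new("row-1".to_string(), data));

        assert_eq!(dataset.size(), 1);
        assert_eq!(dataset.metadata.row_count, 1);
        assert!(dataset.get_row("row-1").is_some());

        let removed = dataset.remove_row("row-1");
        assert!(removed.is_some());
        assert!(dataset.is_empty());
        assert_eq!(dataset.metadata.row_count, 0);
    }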

    #[test]
    fn test_dataset_default() {
        let dataset = Dataset::default();

        assert_eq!(dataset.metadata.name, "Untitled Dataset");
        assert_eq!(dataset.metadata.schema_name, "Unknown Schema");
    }
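
    // Illustrative sketch of statistics over a small numeric column;
    // the "score" field and its values are arbitrary test data.
    #[test]
    fn test_dataset_calculate_stats() {
        let mut dataset = Dataset::default();
        for (id, score) in vec![("1", 10.0), ("2", 20.0)] {
            let mut data = HashMap::new();
            data.insert("score".to_string(), serde_json::json!(score));
            dataset.add_row(DatasetRow::new(id.to_string(), data));
        }

        dataset.calculate_stats().expect("stats calculation should succeed");
        let stats = dataset.stats.as_ref().expect("stats should be populated");
        assert_eq!(stats.row_count, 2);
        assert_eq!(stats.column_count, 1);

        let score_stats = &stats.field_stats["score"];
        assert_eq!(score_stats.field_type, "number");
        assert_eq!(score_stats.non_null_count, 2);
        assert_eq!(score_stats.average_value, Some(15.0));
    }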

    #[test]
    fn test_dataset_row_new() {
        let mut data = HashMap::new();
        data.insert("name".to_string(), serde_json::json!("test"));

        let row = DatasetRow::new("1".to_string(), data.clone());

        assert_eq!(row.id, "1");
        assert_eq!(row.data.len(), 1);
        assert!(row.metadata.is_empty());
    }
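
    // Illustrative check of the field accessors on a row; the "name" and "email"
    // fields are arbitrary examples.
    #[test]
    fn test_dataset_row_fields() {
        let mut data = HashMap::new();
        data.insert("name".to_string(), serde_json::json!("test"));

        let mut row = DatasetRow::new("1".to_string(), data);
        assert!(row.has_field("name"));
        assert_eq!(row.get_field("name"), Some(&serde_json::json!("test")));
        assert!(row.get_field("missing").is_none());

        row.set_field("email".to_string(), serde_json::json!("test@example.com"));
        assert_eq!(row.field_names().len(), 2);
    }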

    #[test]
    fn test_dataset_row_metadata() {
        let mut data = HashMap::new();
        data.insert("name".to_string(), serde_json::json!("test"));

        let mut row = DatasetRow::new("1".to_string(), data);
        row.metadata.insert("source".to_string(), "test".to_string());

        assert_eq!(row.metadata.len(), 1);
    }
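
    // Illustrative validation sketch: clearing the dataset name should surface
    // exactly one validation error.
    #[test]
    fn test_dataset_validate() {
        let mut dataset = Dataset::default();
        assert!(dataset.validate().valid);

        dataset.metadata.name = String::new();
        let result = dataset.validate();
        assert!(!result.valid);
        assert_eq!(result.errors.len(), 1);
        assert_eq!(result.total_rows_validated, 0);
    }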
}