1use crate::{DataConfig, OutputFormat};
7use crate::{Error, Result};
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10
/// Outcome of validating a [`Dataset`] via [`Dataset::validate`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetValidationResult {
    /// True when no errors were found; warnings do not affect validity.
    pub valid: bool,
    /// Hard failures (e.g. empty dataset name, rows with empty IDs).
    pub errors: Vec<String>,
    /// Non-fatal findings (e.g. rows with no data).
    pub warnings: Vec<String>,
    /// Number of rows inspected during validation.
    pub total_rows_validated: usize,
}
23
/// Descriptive and bookkeeping information attached to a [`Dataset`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetMetadata {
    /// Human-readable dataset name.
    pub name: String,
    /// Optional free-form description.
    pub description: Option<String>,
    /// Name of the schema this dataset was generated from.
    pub schema_name: String,
    /// Cached number of rows; kept in sync by `Dataset`'s row mutators.
    pub row_count: usize,
    /// Generation configuration used to produce the data.
    pub config: DataConfig,
    /// Creation timestamp (UTC), set when the metadata is constructed.
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// Wall-clock generation time in milliseconds.
    pub generation_time_ms: u128,
    /// Output serialization format.
    pub format: OutputFormat,
    /// Actual serialized file size in bytes; `None` until recorded.
    pub file_size_bytes: Option<u64>,
    /// Arbitrary key/value tags.
    pub tags: HashMap<String, String>,
}
48
49impl Default for DatasetMetadata {
50 fn default() -> Self {
51 Self {
52 name: String::new(),
53 description: None,
54 schema_name: String::new(),
55 row_count: 0,
56 config: DataConfig::default(),
57 created_at: chrono::Utc::now(),
58 generation_time_ms: 0,
59 format: OutputFormat::Json,
60 file_size_bytes: None,
61 tags: HashMap::new(),
62 }
63 }
64}
65
66impl DatasetMetadata {
67 pub fn new(
69 name: String,
70 schema_name: String,
71 config: DataConfig,
72 format: OutputFormat,
73 ) -> Self {
74 Self {
75 name,
76 schema_name,
77 config,
78 format,
79 created_at: chrono::Utc::now(),
80 ..Default::default()
81 }
82 }
83
84 pub fn set_generation_time(&mut self, time_ms: u128) {
86 self.generation_time_ms = time_ms;
87 }
88
89 pub fn set_file_size(&mut self, size_bytes: u64) {
91 self.file_size_bytes = Some(size_bytes);
92 }
93
94 pub fn add_tag(&mut self, key: String, value: String) {
96 self.tags.insert(key, value);
97 }
98
99 pub fn get_tag(&self, key: &str) -> Option<&String> {
101 self.tags.get(key)
102 }
103
104 pub fn remove_tag(&mut self, key: &str) -> Option<String> {
106 self.tags.remove(key)
107 }
108
109 pub fn estimated_size_bytes(&self) -> u64 {
111 self.file_size_bytes.unwrap_or_else(|| {
112 (self.row_count * 1024) as u64
114 })
115 }
116
117 pub fn is_empty(&self) -> bool {
119 self.row_count == 0
120 }
121
122 pub fn human_readable_size(&self) -> String {
124 let bytes = self.estimated_size_bytes();
125 if bytes < 1024 {
126 format!("{} B", bytes)
127 } else if bytes < 1024 * 1024 {
128 format!("{:.1} KB", bytes as f64 / 1024.0)
129 } else if bytes < 1024 * 1024 * 1024 {
130 format!("{:.1} MB", bytes as f64 / (1024.0 * 1024.0))
131 } else {
132 format!("{:.1} GB", bytes as f64 / (1024.0 * 1024.0 * 1024.0))
133 }
134 }
135}
136
/// A single generated record: an ID, field values, and row-level metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetRow {
    /// Row identifier (used for lookups in [`Dataset`]).
    pub id: String,
    /// Field name → JSON value for this row.
    pub data: HashMap<String, serde_json::Value>,
    /// Arbitrary row-level string metadata.
    pub metadata: HashMap<String, String>,
    /// Creation timestamp (UTC), set when the row is constructed.
    pub created_at: chrono::DateTime<chrono::Utc>,
}
149
150impl DatasetRow {
151 pub fn new(id: String, data: HashMap<String, serde_json::Value>) -> Self {
153 Self {
154 id,
155 data,
156 metadata: HashMap::new(),
157 created_at: chrono::Utc::now(),
158 }
159 }
160
161 pub fn add_metadata(&mut self, key: String, value: String) {
163 self.metadata.insert(key, value);
164 }
165
166 pub fn get_metadata(&self, key: &str) -> Option<&String> {
168 self.metadata.get(key)
169 }
170
171 pub fn remove_metadata(&mut self, key: &str) -> Option<String> {
173 self.metadata.remove(key)
174 }
175
176 pub fn get_field(&self, field_name: &str) -> Option<&serde_json::Value> {
178 self.data.get(field_name)
179 }
180
181 pub fn set_field(&mut self, field_name: String, value: serde_json::Value) {
183 self.data.insert(field_name, value);
184 }
185
186 pub fn has_field(&self, field_name: &str) -> bool {
188 self.data.contains_key(field_name)
189 }
190
191 pub fn field_names(&self) -> Vec<&String> {
193 self.data.keys().collect()
194 }
195
196 pub fn to_json(&self) -> serde_json::Value {
198 serde_json::json!({
199 "id": self.id,
200 "data": self.data,
201 "metadata": self.metadata,
202 "created_at": self.created_at,
203 })
204 }
205}
206
/// Aggregate statistics for a dataset, computed by [`Dataset::calculate_stats`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetStats {
    /// Number of rows at the time stats were computed.
    pub row_count: usize,
    /// Number of distinct columns considered.
    pub column_count: usize,
    /// Sum of serialized (JSON) row sizes, in bytes.
    pub total_size_bytes: u64,
    /// `total_size_bytes / row_count` (0.0 for an empty dataset).
    pub average_row_size_bytes: f64,
    /// Smallest serialized row size, in bytes.
    pub min_row_size_bytes: u64,
    /// Largest serialized row size, in bytes.
    pub max_row_size_bytes: u64,
    /// Per-field statistics, keyed by field name.
    pub field_stats: HashMap<String, FieldStats>,
    /// When these statistics were computed (UTC).
    pub generated_at: chrono::DateTime<chrono::Utc>,
}
227
/// Per-field statistics gathered while scanning a dataset's rows.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldStats {
    /// Name of the field these stats describe.
    pub field_name: String,
    /// Detected JSON type ("boolean", "number", "string", "array",
    /// "object"), "mixed" if values disagree, or "unknown" if only
    /// nulls were seen.
    pub field_type: String,
    /// Count of non-null values.
    pub non_null_count: usize,
    /// Count of explicit JSON nulls.
    pub null_count: usize,
    /// Count of distinct non-null values.
    pub unique_count: usize,
    /// Minimum numeric value (populated for "number" fields only).
    pub min_value: Option<serde_json::Value>,
    /// Maximum numeric value (populated for "number" fields only).
    pub max_value: Option<serde_json::Value>,
    /// Mean of numeric values (populated for "number" fields only).
    pub average_value: Option<f64>,
    /// Up to the 5 most frequent values with their occurrence counts,
    /// sorted by descending count.
    pub most_common_values: Vec<(serde_json::Value, usize)>,
}
250
/// A generated dataset: metadata, rows, and (optionally) computed stats.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Dataset {
    /// Bookkeeping information (name, schema, config, timings, ...).
    pub metadata: DatasetMetadata,
    /// The rows themselves, in insertion order.
    pub rows: Vec<DatasetRow>,
    /// Computed statistics; `None` until `calculate_stats` is called.
    pub stats: Option<DatasetStats>,
}
261
262impl Dataset {
263 pub fn new(
265 name: String,
266 schema_name: String,
267 config: DataConfig,
268 format: OutputFormat,
269 ) -> Self {
270 Self {
271 metadata: DatasetMetadata::new(name, schema_name, config, format),
272 rows: Vec::new(),
273 stats: None,
274 }
275 }
276
277 pub fn with_rows(
279 name: String,
280 schema_name: String,
281 config: DataConfig,
282 format: OutputFormat,
283 rows: Vec<DatasetRow>,
284 ) -> Self {
285 let mut dataset = Self::new(name, schema_name, config, format);
286 dataset.rows = rows;
287 dataset.metadata.row_count = dataset.rows.len();
288 dataset
289 }
290
291 pub fn add_row(&mut self, row: DatasetRow) {
293 self.rows.push(row);
294 self.metadata.row_count = self.rows.len();
295 }
296
297 pub fn add_rows(&mut self, rows: Vec<DatasetRow>) {
299 self.rows.extend(rows);
300 self.metadata.row_count = self.rows.len();
301 }
302
303 pub fn get_row(&self, id: &str) -> Option<&DatasetRow> {
305 self.rows.iter().find(|row| row.id == id)
306 }
307
308 pub fn get_row_mut(&mut self, id: &str) -> Option<&mut DatasetRow> {
310 self.rows.iter_mut().find(|row| row.id == id)
311 }
312
313 pub fn remove_row(&mut self, id: &str) -> Option<DatasetRow> {
315 if let Some(pos) = self.rows.iter().position(|row| row.id == id) {
316 let row = self.rows.remove(pos);
317 self.metadata.row_count = self.rows.len();
318 Some(row)
319 } else {
320 None
321 }
322 }
323
324 pub fn get_rows_by_metadata(&self, key: &str, value: &str) -> Vec<&DatasetRow> {
326 self.rows
327 .iter()
328 .filter(|row| row.get_metadata(key).map(|v| v == value).unwrap_or(false))
329 .collect()
330 }
331
332 pub fn row_ids(&self) -> Vec<&String> {
334 self.rows.iter().map(|row| &row.id).collect()
335 }
336
337 pub fn is_empty(&self) -> bool {
339 self.rows.is_empty()
340 }
341
342 pub fn size(&self) -> usize {
344 self.rows.len()
345 }
346
347 pub fn field_names(&self) -> Vec<&String> {
349 if let Some(first_row) = self.rows.first() {
350 first_row.field_names()
351 } else {
352 Vec::new()
353 }
354 }
355
356 pub fn calculate_stats(&mut self) -> Result<()> {
358 if self.rows.is_empty() {
359 self.stats = Some(DatasetStats {
360 row_count: 0,
361 column_count: 0,
362 total_size_bytes: 0,
363 average_row_size_bytes: 0.0,
364 min_row_size_bytes: 0,
365 max_row_size_bytes: 0,
366 field_stats: HashMap::new(),
367 generated_at: chrono::Utc::now(),
368 });
369 return Ok(());
370 }
371
372 let mut total_size = 0u64;
373 let mut row_sizes = Vec::new();
374
375 #[derive(Default)]
377 struct TempFieldStats {
378 field_type: Option<String>,
379 non_null_count: usize,
380 null_count: usize,
381 unique_values: std::collections::HashSet<serde_json::Value>,
382 numeric_values: Vec<f64>,
383 frequency: std::collections::HashMap<serde_json::Value, usize>,
384 }
385
386 let mut temp_field_stats: HashMap<String, TempFieldStats> = HashMap::new();
387
388 let field_names = self.field_names();
390 for field_name in &field_names {
391 temp_field_stats.insert(field_name.to_string(), TempFieldStats::default());
392 }
393
394 for row in &self.rows {
396 let row_json = row.to_json();
397 let row_size = serde_json::to_string(&row_json)
398 .map_err(|e| Error::generic(format!("Failed to serialize row: {}", e)))?
399 .len() as u64;
400
401 total_size += row_size;
402 row_sizes.push(row_size);
403
404 for (field_name, field_value) in &row.data {
406 if let Some(temp_stats) = temp_field_stats.get_mut(field_name) {
407 match field_value {
408 serde_json::Value::Null => temp_stats.null_count += 1,
409 _ => {
410 temp_stats.non_null_count += 1;
411
412 let value_type = match field_value {
414 serde_json::Value::Bool(_) => "boolean",
415 serde_json::Value::Number(_) => "number",
416 serde_json::Value::String(_) => "string",
417 serde_json::Value::Array(_) => "array",
418 serde_json::Value::Object(_) => "object",
419 serde_json::Value::Null => unreachable!(),
420 };
421
422 if temp_stats.field_type.is_none() {
423 temp_stats.field_type = Some(value_type.to_string());
424 } else if temp_stats.field_type.as_ref()
425 != Some(&value_type.to_string())
426 {
427 temp_stats.field_type = Some("mixed".to_string());
428 }
429
430 temp_stats.unique_values.insert(field_value.clone());
432
433 if let serde_json::Value::Number(num) = field_value {
435 if let Some(f) = num.as_f64() {
436 temp_stats.numeric_values.push(f);
437 }
438 }
439
440 *temp_stats.frequency.entry(field_value.clone()).or_insert(0) += 1;
442 }
443 }
444 }
445 }
446 }
447
448 let mut field_stats: HashMap<String, FieldStats> = HashMap::new();
450 for (field_name, temp_stats) in temp_field_stats {
451 let field_type = temp_stats.field_type.unwrap_or_else(|| "unknown".to_string());
452
453 let (min_value, max_value, average_value) = if field_type == "number"
454 && !temp_stats.numeric_values.is_empty()
455 {
456 let min = temp_stats.numeric_values.iter().fold(f64::INFINITY, |a, &b| a.min(b));
457 let max =
458 temp_stats.numeric_values.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
459 let sum: f64 = temp_stats.numeric_values.iter().sum();
460 let avg = sum / temp_stats.numeric_values.len() as f64;
461 (
462 Some(serde_json::Value::Number(
463 serde_json::Number::from_f64(min).unwrap_or(serde_json::Number::from(0)),
464 )),
465 Some(serde_json::Value::Number(
466 serde_json::Number::from_f64(max).unwrap_or(serde_json::Number::from(0)),
467 )),
468 Some(avg),
469 )
470 } else {
471 (None, None, None)
472 };
473
474 let mut most_common: Vec<(serde_json::Value, usize)> =
476 temp_stats.frequency.into_iter().collect();
477 most_common.sort_by(|a, b| b.1.cmp(&a.1));
478 most_common.truncate(5);
479
480 field_stats.insert(
481 field_name.clone(),
482 FieldStats {
483 field_name,
484 field_type,
485 non_null_count: temp_stats.non_null_count,
486 null_count: temp_stats.null_count,
487 unique_count: temp_stats.unique_values.len(),
488 min_value,
489 max_value,
490 average_value,
491 most_common_values: most_common,
492 },
493 );
494 }
495
496 let row_count = self.rows.len();
497 let average_row_size = if row_count > 0 {
498 total_size as f64 / row_count as f64
499 } else {
500 0.0
501 };
502
503 let min_row_size = row_sizes.iter().min().unwrap_or(&0);
504 let max_row_size = row_sizes.iter().max().unwrap_or(&0);
505
506 self.stats = Some(DatasetStats {
507 row_count,
508 column_count: field_names.len(),
509 total_size_bytes: total_size,
510 average_row_size_bytes: average_row_size,
511 min_row_size_bytes: *min_row_size,
512 max_row_size_bytes: *max_row_size,
513 field_stats,
514 generated_at: chrono::Utc::now(),
515 });
516
517 Ok(())
518 }
519
520 pub fn validate(&self) -> DatasetValidationResult {
522 let mut errors = Vec::new();
523 let mut warnings = Vec::new();
524
525 if self.metadata.name.is_empty() {
527 errors.push("Dataset name cannot be empty".to_string());
528 }
529
530 if self.metadata.schema_name.is_empty() {
531 errors.push("Schema name cannot be empty".to_string());
532 }
533
534 for (index, row) in self.rows.iter().enumerate() {
536 if row.id.is_empty() {
537 errors.push(format!("Row {} has empty ID", index));
538 }
539
540 if row.data.is_empty() {
541 warnings.push(format!("Row {} has no data", index));
542 }
543 }
544
545 DatasetValidationResult {
546 valid: errors.is_empty(),
547 errors,
548 warnings,
549 total_rows_validated: self.rows.len(),
550 }
551 }
552
553 pub fn to_json(&self) -> Result<String> {
555 serde_json::to_string_pretty(self)
556 .map_err(|e| Error::generic(format!("Failed to serialize dataset: {}", e)))
557 }
558
559 pub fn rows_to_json(&self) -> Result<String> {
561 let rows_json: Vec<_> = self.rows.iter().map(|row| row.to_json()).collect();
562 serde_json::to_string_pretty(&rows_json)
563 .map_err(|e| Error::generic(format!("Failed to serialize dataset rows: {}", e)))
564 }
565
566 pub fn summary(&self) -> String {
568 format!(
569 "Dataset '{}' - {} rows, {} columns, {}",
570 self.metadata.name,
571 self.rows.len(),
572 self.field_names().len(),
573 self.metadata.human_readable_size()
574 )
575 }
576}
577
578impl Default for Dataset {
579 fn default() -> Self {
580 Self::new(
581 "Untitled Dataset".to_string(),
582 "Unknown Schema".to_string(),
583 DataConfig::default(),
584 OutputFormat::Json,
585 )
586 }
587}
588
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_dataset_new() {
        let ds = Dataset::new(
            "TestDataset".into(),
            "TestSchema".into(),
            DataConfig::default(),
            OutputFormat::Json,
        );

        assert_eq!(ds.metadata.name, "TestDataset");
        assert_eq!(ds.metadata.schema_name, "TestSchema");
        assert!(ds.rows.is_empty());
    }

    #[test]
    fn test_dataset_default() {
        let ds = Dataset::default();

        assert_eq!(ds.metadata.name, "Untitled Dataset");
        assert_eq!(ds.metadata.schema_name, "Unknown Schema");
    }

    #[test]
    fn test_dataset_row_new() {
        let mut fields = HashMap::new();
        fields.insert("name".to_string(), serde_json::json!("test"));

        let row = DatasetRow::new("1".into(), fields.clone());

        assert_eq!(row.id, "1");
        assert_eq!(row.data.len(), 1);
        assert!(row.metadata.is_empty());
    }

    #[test]
    fn test_dataset_row_metadata() {
        let mut fields = HashMap::new();
        fields.insert("name".to_string(), serde_json::json!("test"));

        let mut row = DatasetRow::new("1".into(), fields);
        row.metadata.insert("source".to_string(), "test".to_string());

        assert_eq!(row.metadata.len(), 1);
    }
}