1pub mod core;
12
13pub use core::*;
15
16use crate::{DataConfig, GenerationResult, OutputFormat, SchemaDefinition};
18use crate::{Error, Result};
19use serde::{Deserialize, Serialize};
20use std::collections::HashMap;
21use std::path::Path;
22use tokio::fs;
23
/// Outcome of validating a [`Dataset`] against a schema definition.
///
/// Produced by `utils::validate_dataset_with_details`; `valid` is true
/// exactly when `errors` is empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetValidationResult {
    /// True when no validation errors were found.
    pub valid: bool,
    /// Human-readable error messages, one per failed check.
    pub errors: Vec<String>,
    /// Non-fatal issues noticed during validation.
    pub warnings: Vec<String>,
    /// Number of rows that were examined.
    pub total_rows_validated: usize,
}
36
/// Descriptive metadata attached to a [`Dataset`]: provenance, sizing,
/// timing, and free-form tags.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetMetadata {
    /// Dataset name; also used as the key in a `DatasetCollection`.
    pub name: String,
    /// Optional human-readable description.
    pub description: Option<String>,
    /// Name of the schema the data was generated from.
    pub schema_name: String,
    /// Number of rows in the dataset.
    pub row_count: usize,
    /// Generation configuration that produced the data.
    pub config: DataConfig,
    /// UTC timestamp of when this metadata record was created.
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// Wall-clock generation time in milliseconds.
    pub generation_time_ms: u128,
    /// Serialization format used by `Dataset::save_to_file`.
    pub format: OutputFormat,
    /// On-disk size in bytes, if known.
    pub file_size_bytes: Option<u64>,
    /// Arbitrary key/value labels.
    pub tags: HashMap<String, String>,
}
61
62impl Default for DatasetMetadata {
63 fn default() -> Self {
64 Self {
65 name: String::new(),
66 description: None,
67 schema_name: String::new(),
68 row_count: 0,
69 config: DataConfig::default(),
70 created_at: chrono::Utc::now(),
71 generation_time_ms: 0,
72 format: OutputFormat::Json,
73 file_size_bytes: None,
74 tags: HashMap::new(),
75 }
76 }
77}
78
79impl DatasetMetadata {
80 pub fn new(
82 name: String,
83 schema_name: String,
84 result: &GenerationResult,
85 config: DataConfig,
86 ) -> Self {
87 Self {
88 name,
89 description: None,
90 schema_name,
91 row_count: result.count,
92 config,
93 created_at: chrono::Utc::now(),
94 generation_time_ms: result.generation_time_ms,
95 format: OutputFormat::Json,
96 file_size_bytes: None,
97 tags: HashMap::new(),
98 }
99 }
100
101 pub fn with_description(mut self, description: String) -> Self {
103 self.description = Some(description);
104 self
105 }
106
107 pub fn with_tag(mut self, key: String, value: String) -> Self {
109 self.tags.insert(key, value);
110 self
111 }
112
113 pub fn with_file_size(mut self, size: u64) -> Self {
115 self.file_size_bytes = Some(size);
116 self
117 }
118}
119
/// A generated dataset: JSON rows plus the metadata describing how they
/// were produced.
#[derive(Debug)]
pub struct Dataset {
    /// Provenance and sizing information for the rows below.
    pub metadata: DatasetMetadata,
    /// The rows themselves; each element is expected to be a JSON object.
    pub data: Vec<serde_json::Value>,
}
128
129impl Dataset {
130 pub fn new(metadata: DatasetMetadata, data: Vec<serde_json::Value>) -> Self {
132 Self { metadata, data }
133 }
134
135 pub fn from_generation_result(
137 name: String,
138 schema_name: String,
139 result: GenerationResult,
140 config: DataConfig,
141 ) -> Self {
142 let metadata = DatasetMetadata::new(name, schema_name, &result, config);
143 Self::new(metadata, result.data)
144 }
145
146 pub fn to_json_string(&self) -> Result<String> {
148 serde_json::to_string_pretty(&self.data)
149 .map_err(|e| crate::Error::generic(format!("Failed to serialize dataset: {}", e)))
150 }
151
152 pub fn to_jsonl_string(&self) -> Result<String> {
154 let lines: Result<Vec<String>> = self
155 .data
156 .iter()
157 .map(|value| {
158 serde_json::to_string(value)
159 .map_err(|e| crate::Error::generic(format!("JSON serialization error: {}", e)))
160 })
161 .collect();
162
163 lines.map(|lines| lines.join("\n"))
164 }
165
166 pub fn to_csv_string(&self) -> Result<String> {
168 if self.data.is_empty() {
169 return Ok(String::new());
170 }
171
172 let mut csv_output = String::new();
173
174 if let Some(first_row) = self.data.first() {
176 if let Some(obj) = first_row.as_object() {
177 let headers: Vec<String> = obj.keys().cloned().collect();
178 csv_output.push_str(&headers.join(","));
179 csv_output.push('\n');
180
181 for row in &self.data {
183 if let Some(obj) = row.as_object() {
184 let values: Vec<String> = headers
185 .iter()
186 .map(|header| {
187 obj.get(header)
188 .map(|v| v.to_string().trim_matches('"').to_string())
189 .unwrap_or_default()
190 })
191 .collect();
192 csv_output.push_str(&values.join(","));
193 csv_output.push('\n');
194 }
195 }
196 }
197 }
198
199 Ok(csv_output)
200 }
201
202 pub fn to_yaml_string(&self) -> Result<String> {
204 serde_yaml::to_string(&self.data)
205 .map_err(|e| crate::Error::generic(format!("Failed to serialize dataset: {}", e)))
206 }
207
208 pub async fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
210 let content = match self.metadata.format {
211 OutputFormat::Json => self.to_json_string()?,
212 OutputFormat::JsonLines => self.to_jsonl_string()?,
213 OutputFormat::Csv => self.to_csv_string()?,
214 OutputFormat::Yaml => self.to_yaml_string()?,
215 };
216
217 fs::write(path, content)
218 .await
219 .map_err(|e| crate::Error::generic(format!("Failed to write dataset file: {}", e)))
220 }
221
222 pub async fn load_from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
224 let content = fs::read_to_string(path)
225 .await
226 .map_err(|e| crate::Error::generic(format!("Failed to read dataset file: {}", e)))?;
227
228 if let Ok(data) = serde_json::from_str::<Vec<serde_json::Value>>(&content) {
230 let metadata = DatasetMetadata {
231 name: "loaded_dataset".to_string(),
232 description: None,
233 schema_name: "unknown".to_string(),
234 row_count: data.len(),
235 config: DataConfig::default(),
236 created_at: chrono::Utc::now(),
237 generation_time_ms: 0,
238 format: OutputFormat::Json,
239 file_size_bytes: Some(content.len() as u64),
240 tags: HashMap::new(),
241 };
242
243 return Ok(Self::new(metadata, data));
244 }
245
246 Err(crate::Error::generic("Unsupported file format or invalid content"))
247 }
248
249 pub fn row_count(&self) -> usize {
251 self.data.len()
252 }
253
254 pub fn sample(&self, count: usize) -> &[serde_json::Value] {
256 let sample_count = count.min(self.data.len());
257 &self.data[..sample_count]
258 }
259
260 pub fn filter<F>(&self, predicate: F) -> Dataset
262 where
263 F: Fn(&serde_json::Value) -> bool,
264 {
265 let filtered_data: Vec<serde_json::Value> =
266 self.data.iter().filter(|row| predicate(row)).cloned().collect();
267
268 let mut metadata = self.metadata.clone();
269 metadata.row_count = filtered_data.len();
270
271 Self::new(metadata, filtered_data)
272 }
273
274 pub fn map<F>(&self, mapper: F) -> Dataset
276 where
277 F: Fn(&serde_json::Value) -> serde_json::Value,
278 {
279 let mapped_data: Vec<serde_json::Value> = self.data.iter().map(mapper).collect();
280
281 let metadata = self.metadata.clone();
282 Self::new(metadata, mapped_data)
283 }
284
285 pub fn validate_against_schema(&self, schema: &SchemaDefinition) -> Result<Vec<String>> {
287 utils::validate_dataset_against_schema(self, schema)
288 }
289
290 pub fn validate_with_details(&self, schema: &SchemaDefinition) -> DatasetValidationResult {
292 utils::validate_dataset_with_details(self, schema)
293 }
294}
295
/// An in-memory group of datasets, keyed by their metadata name.
#[derive(Debug)]
pub struct DatasetCollection {
    /// Datasets indexed by `Dataset.metadata.name`.
    datasets: HashMap<String, Dataset>,
    /// Collection-level key/value metadata; currently unused.
    #[allow(dead_code)]
    metadata: HashMap<String, String>,
}
305
306impl DatasetCollection {
307 pub fn new() -> Self {
309 Self {
310 datasets: HashMap::new(),
311 metadata: HashMap::new(),
312 }
313 }
314
315 pub fn add_dataset(&mut self, dataset: Dataset) -> Result<()> {
317 let name = dataset.metadata.name.clone();
318 self.datasets.insert(name, dataset);
319 Ok(())
320 }
321
322 pub fn get_dataset(&self, name: &str) -> Option<&Dataset> {
324 self.datasets.get(name)
325 }
326
327 pub fn remove_dataset(&mut self, name: &str) -> Option<Dataset> {
329 self.datasets.remove(name)
330 }
331
332 pub fn list_datasets(&self) -> Vec<String> {
334 self.datasets.keys().cloned().collect()
335 }
336
337 pub fn size(&self) -> usize {
339 self.datasets.len()
340 }
341
342 pub async fn save_to_directory<P: AsRef<Path>>(&self, dir_path: P) -> Result<()> {
344 fs::create_dir_all(&dir_path)
345 .await
346 .map_err(|e| crate::Error::generic(format!("Failed to create directory: {}", e)))?;
347
348 for (name, dataset) in &self.datasets {
349 let file_path = dir_path.as_ref().join(format!("{}.json", name));
350 dataset.save_to_file(file_path).await?;
351 }
352
353 Ok(())
354 }
355
356 pub async fn load_from_directory<P: AsRef<Path>>(dir_path: P) -> Result<Self> {
358 let mut collection = Self::new();
359 let mut entries = fs::read_dir(dir_path)
360 .await
361 .map_err(|e| crate::Error::generic(format!("Failed to read directory: {}", e)))?;
362
363 while let Some(entry) = entries
364 .next_entry()
365 .await
366 .map_err(|e| crate::Error::generic(format!("Failed to read directory entry: {}", e)))?
367 {
368 let path = entry.path();
369 if path.extension().and_then(|s| s.to_str()) == Some("json") {
370 if let Some(_file_name) = path.file_stem().and_then(|s| s.to_str()) {
371 let dataset = Dataset::load_from_file(&path).await?;
372 collection.add_dataset(dataset)?;
373 }
374 }
375 }
376
377 Ok(collection)
378 }
379
380 pub fn statistics(&self) -> HashMap<String, serde_json::Value> {
382 let mut stats = HashMap::new();
383
384 stats.insert("total_datasets".to_string(), self.size().into());
385 stats.insert(
386 "total_rows".to_string(),
387 self.datasets.values().map(|d| d.row_count()).sum::<usize>().into(),
388 );
389
390 let dataset_info: Vec<serde_json::Value> = self
391 .datasets
392 .values()
393 .map(|d| {
394 serde_json::json!({
395 "name": d.metadata.name,
396 "schema": d.metadata.schema_name,
397 "rows": d.row_count(),
398 "format": format!("{:?}", d.metadata.format),
399 })
400 })
401 .collect();
402
403 stats.insert("datasets".to_string(), dataset_info.into());
404
405 stats
406 }
407}
408
409impl Default for DatasetCollection {
410 fn default() -> Self {
411 Self::new()
412 }
413}
414
415pub mod utils {
417 use super::*;
418
419 pub async fn create_sample_collection() -> Result<DatasetCollection> {
421 let mut collection = DatasetCollection::new();
422
423 let users_result = crate::generator::utils::generate_users(50).await?;
425 let users_dataset = Dataset::from_generation_result(
426 "users".to_string(),
427 "User".to_string(),
428 users_result,
429 DataConfig {
430 rows: 50,
431 ..Default::default()
432 },
433 );
434 collection.add_dataset(users_dataset)?;
435
436 let products_result = crate::generator::utils::generate_products(25).await?;
438 let products_dataset = Dataset::from_generation_result(
439 "products".to_string(),
440 "Product".to_string(),
441 products_result,
442 DataConfig {
443 rows: 25,
444 ..Default::default()
445 },
446 );
447 collection.add_dataset(products_dataset)?;
448
449 Ok(collection)
450 }
451
452 pub async fn export_dataset(
454 dataset: &Dataset,
455 format: OutputFormat,
456 output_path: &Path,
457 ) -> Result<()> {
458 let content = match format {
459 OutputFormat::Json => dataset.to_json_string()?,
460 OutputFormat::JsonLines => dataset.to_jsonl_string()?,
461 OutputFormat::Csv => dataset.to_csv_string()?,
462 OutputFormat::Yaml => dataset.to_yaml_string()?,
463 };
464
465 fs::write(output_path, content)
466 .await
467 .map_err(|e| crate::Error::generic(format!("Failed to export dataset: {}", e)))
468 }
469
470 pub fn validate_dataset_against_schema(
472 dataset: &Dataset,
473 schema: &SchemaDefinition,
474 ) -> Result<Vec<String>> {
475 let mut errors = Vec::new();
476
477 for (row_index, row) in dataset.data.iter().enumerate() {
479 match row {
480 serde_json::Value::Object(row_obj) => {
481 for field in &schema.fields {
483 let field_name = &field.name;
484
485 if let Some(field_value) = row_obj.get(field_name) {
486 if let Err(validation_error) = field.validate_value(field_value) {
488 errors.push(format!(
489 "Row {}: Field '{}': {}",
490 row_index + 1,
491 field_name,
492 validation_error
493 ));
494 }
495 } else if field.required {
496 errors.push(format!(
497 "Row {}: Required field '{}' is missing",
498 row_index + 1,
499 field_name
500 ));
501 }
502 }
503
504 for (key, _) in row_obj {
506 let field_exists_in_schema = schema.fields.iter().any(|f| f.name == *key);
507 if !field_exists_in_schema {
508 errors.push(format!(
509 "Row {}: Unexpected field '{}' not defined in schema",
510 row_index + 1,
511 key
512 ));
513 }
514 }
515 }
516 _ => {
517 errors.push(format!("Row {}: Expected object, got {}", row_index + 1, row));
518 }
519 }
520 }
521
522 if let Err(count_error) = validate_dataset_size(dataset, schema) {
524 errors.push(count_error.to_string());
525 }
526
527 Ok(errors)
528 }
529
530 fn validate_dataset_size(dataset: &Dataset, schema: &SchemaDefinition) -> crate::Result<()> {
532 if let Some(min_rows) = schema.metadata.get("min_rows") {
534 if let Some(min_count) = min_rows.as_u64() {
535 if dataset.data.len() < min_count as usize {
536 return Err(Error::validation(format!(
537 "Dataset has {} rows, but schema requires at least {} rows",
538 dataset.data.len(),
539 min_count
540 )));
541 }
542 }
543 }
544
545 if let Some(max_rows) = schema.metadata.get("max_rows") {
546 if let Some(max_count) = max_rows.as_u64() {
547 if dataset.data.len() > max_count as usize {
548 return Err(Error::validation(format!(
549 "Dataset has {} rows, but schema allows at most {} rows",
550 dataset.data.len(),
551 max_count
552 )));
553 }
554 }
555 }
556
557 Ok(())
558 }
559
560 pub fn validate_dataset_with_details(
562 dataset: &Dataset,
563 schema: &SchemaDefinition,
564 ) -> DatasetValidationResult {
565 let errors = validate_dataset_against_schema(dataset, schema);
566
567 match errors {
568 Ok(validation_errors) => {
569 let warnings = Vec::new(); DatasetValidationResult {
571 valid: validation_errors.is_empty(),
572 errors: validation_errors,
573 warnings,
574 total_rows_validated: dataset.data.len(),
575 }
576 }
577 Err(e) => DatasetValidationResult {
578 valid: false,
579 errors: vec![format!("Validation failed: {}", e)],
580 warnings: Vec::new(),
581 total_rows_validated: dataset.data.len(),
582 },
583 }
584 }
585}
586
#[cfg(test)]
mod tests {
    use super::*;

    /// A clean result with no errors should report valid.
    #[test]
    fn test_dataset_validation_result_creation() {
        let outcome = DatasetValidationResult {
            valid: true,
            errors: Vec::new(),
            warnings: Vec::new(),
            total_rows_validated: 100,
        };

        assert_eq!(outcome.total_rows_validated, 100);
        assert!(outcome.valid);
    }

    /// Errors should be carried through and the result marked invalid.
    #[test]
    fn test_dataset_validation_result_with_errors() {
        let failures = vec!["Error 1".to_string(), "Error 2".to_string()];
        let outcome = DatasetValidationResult {
            valid: false,
            errors: failures,
            warnings: Vec::new(),
            total_rows_validated: 50,
        };

        assert_eq!(outcome.errors.len(), 2);
        assert!(!outcome.valid);
    }

    /// Warnings alone should not make the result invalid.
    #[test]
    fn test_dataset_validation_result_with_warnings() {
        let outcome = DatasetValidationResult {
            valid: true,
            errors: Vec::new(),
            warnings: vec!["Warning 1".to_string()],
            total_rows_validated: 75,
        };

        assert_eq!(outcome.warnings.len(), 1);
        assert!(outcome.valid);
    }

    /// Manually constructed metadata should preserve all provided fields.
    #[test]
    fn test_dataset_metadata_creation() {
        let metadata = DatasetMetadata {
            name: "TestDataset".to_string(),
            description: Some("Test description".to_string()),
            schema_name: "TestSchema".to_string(),
            row_count: 100,
            config: DataConfig::default(),
            created_at: chrono::Utc::now(),
            generation_time_ms: 1000,
            format: OutputFormat::Json,
            file_size_bytes: Some(1024),
            tags: HashMap::new(),
        };

        assert_eq!(metadata.generation_time_ms, 1000);
        assert_eq!(metadata.row_count, 100);
        assert_eq!(metadata.name, "TestDataset");
        assert!(metadata.description.is_some());
    }
}
651}