use crate::{DataConfig, OutputFormat};
use crate::{Error, Result};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
/// Outcome of [`Dataset::validate`]: hard errors, soft warnings, and the
/// number of rows that were inspected.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetValidationResult {
/// True when no errors were recorded; warnings do not affect validity.
pub valid: bool,
/// Fatal problems (empty dataset/schema name, rows with empty IDs).
pub errors: Vec<String>,
/// Non-fatal issues (e.g. rows that carry no data).
pub warnings: Vec<String>,
/// Total number of rows examined during validation.
pub total_rows_validated: usize,
}
/// Descriptive and bookkeeping information about a generated dataset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetMetadata {
/// Human-readable dataset name.
pub name: String,
/// Optional free-form description.
pub description: Option<String>,
/// Name of the schema the rows were generated from.
pub schema_name: String,
/// Cached number of rows; kept in sync by the `Dataset` mutators.
pub row_count: usize,
/// Configuration that was used when generating the dataset.
pub config: DataConfig,
/// UTC timestamp taken when the metadata was constructed.
pub created_at: chrono::DateTime<chrono::Utc>,
/// How long generation took, in milliseconds (see `set_generation_time`).
pub generation_time_ms: u128,
/// Output format the dataset is serialized to.
pub format: OutputFormat,
/// Actual serialized size in bytes when known; `None` until recorded
/// via `set_file_size` (then `estimated_size_bytes` falls back to a guess).
pub file_size_bytes: Option<u64>,
/// Free-form key/value tags.
pub tags: HashMap<String, String>,
}
impl Default for DatasetMetadata {
fn default() -> Self {
Self {
name: String::new(),
description: None,
schema_name: String::new(),
row_count: 0,
config: DataConfig::default(),
created_at: chrono::Utc::now(),
generation_time_ms: 0,
format: OutputFormat::Json,
file_size_bytes: None,
tags: HashMap::new(),
}
}
}
impl DatasetMetadata {
    /// Builds metadata for a freshly created dataset: identity fields are set
    /// from the arguments, `created_at` is stamped with the current time, and
    /// every counter starts at its `Default` value.
    pub fn new(
        name: String,
        schema_name: String,
        config: DataConfig,
        format: OutputFormat,
    ) -> Self {
        Self {
            name,
            schema_name,
            config,
            format,
            created_at: chrono::Utc::now(),
            ..Default::default()
        }
    }

    /// Records how long generation took, in milliseconds.
    pub fn set_generation_time(&mut self, time_ms: u128) {
        self.generation_time_ms = time_ms;
    }

    /// Records the actual serialized size of the dataset, in bytes.
    pub fn set_file_size(&mut self, size_bytes: u64) {
        self.file_size_bytes = Some(size_bytes);
    }

    /// Inserts (or overwrites) a free-form tag.
    pub fn add_tag(&mut self, key: String, value: String) {
        self.tags.insert(key, value);
    }

    /// Looks up a tag by key.
    pub fn get_tag(&self, key: &str) -> Option<&String> {
        self.tags.get(key)
    }

    /// Removes a tag, returning its previous value if it existed.
    pub fn remove_tag(&mut self, key: &str) -> Option<String> {
        self.tags.remove(key)
    }

    /// Returns the recorded file size, or a rough 1 KiB-per-row estimate when
    /// no actual size has been set.
    pub fn estimated_size_bytes(&self) -> u64 {
        // Widen to u64 before multiplying: `(row_count * 1024) as u64` would
        // perform the multiplication in `usize` and could wrap on 32-bit
        // targets for very large row counts. Saturating keeps it total.
        self.file_size_bytes
            .unwrap_or_else(|| (self.row_count as u64).saturating_mul(1024))
    }

    /// True when the dataset has recorded zero rows.
    pub fn is_empty(&self) -> bool {
        self.row_count == 0
    }

    /// Formats [`Self::estimated_size_bytes`] with 1024-based units
    /// (B, KB, MB, GB) and one decimal place above the byte range.
    pub fn human_readable_size(&self) -> String {
        const KB: u64 = 1024;
        const MB: u64 = KB * 1024;
        const GB: u64 = MB * 1024;
        let bytes = self.estimated_size_bytes();
        if bytes < KB {
            format!("{} B", bytes)
        } else if bytes < MB {
            format!("{:.1} KB", bytes as f64 / KB as f64)
        } else if bytes < GB {
            format!("{:.1} MB", bytes as f64 / MB as f64)
        } else {
            format!("{:.1} GB", bytes as f64 / GB as f64)
        }
    }
}
/// A single generated row: an ID, the field values, and row-level metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetRow {
/// Identifier used for row lookups within a `Dataset`.
pub id: String,
/// Field name -> JSON value payload for this row.
pub data: HashMap<String, serde_json::Value>,
/// Free-form string metadata attached to this row.
pub metadata: HashMap<String, String>,
/// UTC timestamp taken when the row was constructed.
pub created_at: chrono::DateTime<chrono::Utc>,
}
impl DatasetRow {
pub fn new(id: String, data: HashMap<String, serde_json::Value>) -> Self {
Self {
id,
data,
metadata: HashMap::new(),
created_at: chrono::Utc::now(),
}
}
pub fn add_metadata(&mut self, key: String, value: String) {
self.metadata.insert(key, value);
}
pub fn get_metadata(&self, key: &str) -> Option<&String> {
self.metadata.get(key)
}
pub fn remove_metadata(&mut self, key: &str) -> Option<String> {
self.metadata.remove(key)
}
pub fn get_field(&self, field_name: &str) -> Option<&serde_json::Value> {
self.data.get(field_name)
}
pub fn set_field(&mut self, field_name: String, value: serde_json::Value) {
self.data.insert(field_name, value);
}
pub fn has_field(&self, field_name: &str) -> bool {
self.data.contains_key(field_name)
}
pub fn field_names(&self) -> Vec<&String> {
self.data.keys().collect()
}
pub fn to_json(&self) -> serde_json::Value {
serde_json::json!({
"id": self.id,
"data": self.data,
"metadata": self.metadata,
"created_at": self.created_at,
})
}
}
/// Aggregate statistics produced by `Dataset::calculate_stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetStats {
/// Number of rows measured.
pub row_count: usize,
/// Number of columns/fields covered by the statistics.
pub column_count: usize,
/// Sum of the serialized JSON size of every row, in bytes.
pub total_size_bytes: u64,
/// Mean serialized row size in bytes (0.0 for an empty dataset).
pub average_row_size_bytes: f64,
/// Smallest serialized row size, in bytes.
pub min_row_size_bytes: u64,
/// Largest serialized row size, in bytes.
pub max_row_size_bytes: u64,
/// Per-field statistics keyed by field name.
pub field_stats: HashMap<String, FieldStats>,
/// UTC timestamp of when these statistics were computed.
pub generated_at: chrono::DateTime<chrono::Utc>,
}
/// Statistics for one field, gathered across the rows of a dataset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldStats {
/// Name of the field these statistics describe.
pub field_name: String,
/// Observed JSON type ("boolean", "number", "string", "array", "object"),
/// "mixed" when values disagree, or "unknown" when every value was null.
pub field_type: String,
/// Count of non-null values.
pub non_null_count: usize,
/// Count of null values.
pub null_count: usize,
/// Number of distinct non-null values.
pub unique_count: usize,
/// Minimum value; only populated for purely numeric fields.
pub min_value: Option<serde_json::Value>,
/// Maximum value; only populated for purely numeric fields.
pub max_value: Option<serde_json::Value>,
/// Arithmetic mean; only populated for purely numeric fields.
pub average_value: Option<f64>,
/// Up to the five most frequent values with their occurrence counts.
pub most_common_values: Vec<(serde_json::Value, usize)>,
}
/// A generated dataset: metadata, the rows themselves, and (optionally)
/// computed statistics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Dataset {
/// Bookkeeping information (name, schema, timing, sizes, tags).
pub metadata: DatasetMetadata,
/// The generated rows, in insertion order.
pub rows: Vec<DatasetRow>,
/// Statistics from the last `calculate_stats` call; `None` until computed.
pub stats: Option<DatasetStats>,
}
impl Dataset {
    /// Creates an empty dataset with freshly constructed metadata.
    pub fn new(
        name: String,
        schema_name: String,
        config: DataConfig,
        format: OutputFormat,
    ) -> Self {
        Self {
            metadata: DatasetMetadata::new(name, schema_name, config, format),
            rows: Vec::new(),
            stats: None,
        }
    }

    /// Creates a dataset pre-populated with `rows`, keeping the cached
    /// metadata row count in sync.
    pub fn with_rows(
        name: String,
        schema_name: String,
        config: DataConfig,
        format: OutputFormat,
        rows: Vec<DatasetRow>,
    ) -> Self {
        let mut dataset = Self::new(name, schema_name, config, format);
        dataset.rows = rows;
        dataset.metadata.row_count = dataset.rows.len();
        dataset
    }

    /// Appends one row and updates the cached row count.
    pub fn add_row(&mut self, row: DatasetRow) {
        self.rows.push(row);
        self.metadata.row_count = self.rows.len();
    }

    /// Appends many rows and updates the cached row count.
    pub fn add_rows(&mut self, rows: Vec<DatasetRow>) {
        self.rows.extend(rows);
        self.metadata.row_count = self.rows.len();
    }

    /// Finds a row by ID (linear scan).
    pub fn get_row(&self, id: &str) -> Option<&DatasetRow> {
        self.rows.iter().find(|row| row.id == id)
    }

    /// Finds a row by ID for mutation (linear scan).
    pub fn get_row_mut(&mut self, id: &str) -> Option<&mut DatasetRow> {
        self.rows.iter_mut().find(|row| row.id == id)
    }

    /// Removes and returns the first row with the given ID, updating the
    /// cached row count; `None` when no such row exists.
    pub fn remove_row(&mut self, id: &str) -> Option<DatasetRow> {
        let pos = self.rows.iter().position(|row| row.id == id)?;
        let row = self.rows.remove(pos);
        self.metadata.row_count = self.rows.len();
        Some(row)
    }

    /// All rows whose metadata entry `key` equals `value`.
    pub fn get_rows_by_metadata(&self, key: &str, value: &str) -> Vec<&DatasetRow> {
        self.rows
            .iter()
            .filter(|row| row.get_metadata(key).map(|v| v == value).unwrap_or(false))
            .collect()
    }

    /// IDs of all rows, in insertion order.
    pub fn row_ids(&self) -> Vec<&String> {
        self.rows.iter().map(|row| &row.id).collect()
    }

    /// True when the dataset holds no rows.
    pub fn is_empty(&self) -> bool {
        self.rows.is_empty()
    }

    /// Number of rows currently held.
    pub fn size(&self) -> usize {
        self.rows.len()
    }

    /// Field names of the FIRST row only (empty when there are no rows).
    /// This assumes rows are homogeneous; `calculate_stats` covers the
    /// union of fields across every row.
    pub fn field_names(&self) -> Vec<&String> {
        self.rows
            .first()
            .map(|first_row| first_row.field_names())
            .unwrap_or_default()
    }

    /// Computes dataset- and field-level statistics and caches them in
    /// `self.stats`.
    ///
    /// Statistics cover the union of field names appearing in *any* row.
    /// (The previous implementation seeded the accumulators from the first
    /// row only, so fields introduced by later rows were silently dropped
    /// from `field_stats` and `column_count` was wrong for heterogeneous
    /// rows.) `most_common_values` ties are broken on the JSON text of the
    /// value so the output is deterministic across runs.
    ///
    /// # Errors
    /// Returns `Error::generic` if a row fails to serialize to JSON while
    /// its byte size is being measured.
    pub fn calculate_stats(&mut self) -> Result<()> {
        if self.rows.is_empty() {
            self.stats = Some(DatasetStats {
                row_count: 0,
                column_count: 0,
                total_size_bytes: 0,
                average_row_size_bytes: 0.0,
                min_row_size_bytes: 0,
                max_row_size_bytes: 0,
                field_stats: HashMap::new(),
                generated_at: chrono::Utc::now(),
            });
            return Ok(());
        }

        // Per-field accumulators gathered in a single pass over the rows.
        #[derive(Default)]
        struct TempFieldStats {
            field_type: Option<String>,
            non_null_count: usize,
            null_count: usize,
            unique_values: HashSet<serde_json::Value>,
            numeric_values: Vec<f64>,
            frequency: HashMap<serde_json::Value, usize>,
        }

        let mut total_size = 0u64;
        let mut row_sizes = Vec::with_capacity(self.rows.len());
        let mut temp_field_stats: HashMap<String, TempFieldStats> = HashMap::new();

        for row in &self.rows {
            // Row size = length of its compact JSON serialization.
            let row_size = serde_json::to_string(&row.to_json())
                .map_err(|e| Error::generic(format!("Failed to serialize row: {}", e)))?
                .len() as u64;
            total_size += row_size;
            row_sizes.push(row_size);

            for (field_name, field_value) in &row.data {
                // Entry API registers a field the first time it appears in
                // ANY row, so heterogeneous rows are fully accounted for.
                let temp_stats = temp_field_stats
                    .entry(field_name.clone())
                    .or_default();
                match field_value {
                    serde_json::Value::Null => temp_stats.null_count += 1,
                    _ => {
                        temp_stats.non_null_count += 1;
                        let value_type = match field_value {
                            serde_json::Value::Bool(_) => "boolean",
                            serde_json::Value::Number(_) => "number",
                            serde_json::Value::String(_) => "string",
                            serde_json::Value::Array(_) => "array",
                            serde_json::Value::Object(_) => "object",
                            // Null is handled by the outer match arm.
                            serde_json::Value::Null => unreachable!(),
                        };
                        // Track the field's type; conflicting types collapse
                        // to "mixed". (&str comparison avoids the per-value
                        // String allocation the old code performed.)
                        match temp_stats.field_type.as_deref() {
                            None => temp_stats.field_type = Some(value_type.to_string()),
                            Some(t) if t != value_type && t != "mixed" => {
                                temp_stats.field_type = Some("mixed".to_string());
                            }
                            _ => {}
                        }
                        temp_stats.unique_values.insert(field_value.clone());
                        if let serde_json::Value::Number(num) = field_value {
                            if let Some(f) = num.as_f64() {
                                temp_stats.numeric_values.push(f);
                            }
                        }
                        *temp_stats.frequency.entry(field_value.clone()).or_insert(0) += 1;
                    }
                }
            }
        }

        let mut field_stats: HashMap<String, FieldStats> = HashMap::new();
        for (field_name, temp_stats) in temp_field_stats {
            // A field whose every value was null never had a type recorded.
            let field_type = temp_stats
                .field_type
                .unwrap_or_else(|| "unknown".to_string());

            // min/max/average only make sense for purely numeric fields.
            let (min_value, max_value, average_value) =
                if field_type == "number" && !temp_stats.numeric_values.is_empty() {
                    let min = temp_stats
                        .numeric_values
                        .iter()
                        .fold(f64::INFINITY, |a, &b| a.min(b));
                    let max = temp_stats
                        .numeric_values
                        .iter()
                        .fold(f64::NEG_INFINITY, |a, &b| a.max(b));
                    let sum: f64 = temp_stats.numeric_values.iter().sum();
                    let avg = sum / temp_stats.numeric_values.len() as f64;
                    (
                        Some(serde_json::Value::Number(
                            serde_json::Number::from_f64(min)
                                .unwrap_or(serde_json::Number::from(0)),
                        )),
                        Some(serde_json::Value::Number(
                            serde_json::Number::from_f64(max)
                                .unwrap_or(serde_json::Number::from(0)),
                        )),
                        Some(avg),
                    )
                } else {
                    (None, None, None)
                };

            let mut most_common: Vec<(serde_json::Value, usize)> =
                temp_stats.frequency.into_iter().collect();
            // Descending count; ties broken on the JSON text so results are
            // deterministic (HashMap iteration order is not).
            most_common.sort_by(|a, b| {
                b.1.cmp(&a.1)
                    .then_with(|| a.0.to_string().cmp(&b.0.to_string()))
            });
            most_common.truncate(5);

            field_stats.insert(
                field_name.clone(),
                FieldStats {
                    field_name,
                    field_type,
                    non_null_count: temp_stats.non_null_count,
                    null_count: temp_stats.null_count,
                    unique_count: temp_stats.unique_values.len(),
                    min_value,
                    max_value,
                    average_value,
                    most_common_values: most_common,
                },
            );
        }

        // The empty case returned early, so row_count >= 1 here and the
        // division is safe.
        let row_count = self.rows.len();
        self.stats = Some(DatasetStats {
            row_count,
            column_count: field_stats.len(),
            total_size_bytes: total_size,
            average_row_size_bytes: total_size as f64 / row_count as f64,
            min_row_size_bytes: row_sizes.iter().copied().min().unwrap_or(0),
            max_row_size_bytes: row_sizes.iter().copied().max().unwrap_or(0),
            field_stats,
            generated_at: chrono::Utc::now(),
        });
        Ok(())
    }

    /// Checks structural invariants (non-empty dataset/schema names, row IDs,
    /// row data) and reports errors and warnings without mutating anything.
    pub fn validate(&self) -> DatasetValidationResult {
        let mut errors = Vec::new();
        let mut warnings = Vec::new();
        if self.metadata.name.is_empty() {
            errors.push("Dataset name cannot be empty".to_string());
        }
        if self.metadata.schema_name.is_empty() {
            errors.push("Schema name cannot be empty".to_string());
        }
        for (index, row) in self.rows.iter().enumerate() {
            if row.id.is_empty() {
                errors.push(format!("Row {} has empty ID", index));
            }
            if row.data.is_empty() {
                warnings.push(format!("Row {} has no data", index));
            }
        }
        DatasetValidationResult {
            valid: errors.is_empty(),
            errors,
            warnings,
            total_rows_validated: self.rows.len(),
        }
    }

    /// Serializes the whole dataset (metadata, rows, stats) as pretty JSON.
    ///
    /// # Errors
    /// Returns `Error::generic` if serialization fails.
    pub fn to_json(&self) -> Result<String> {
        serde_json::to_string_pretty(self)
            .map_err(|e| Error::generic(format!("Failed to serialize dataset: {}", e)))
    }

    /// Serializes only the rows as a pretty-printed JSON array.
    ///
    /// # Errors
    /// Returns `Error::generic` if serialization fails.
    pub fn rows_to_json(&self) -> Result<String> {
        let rows_json: Vec<_> = self.rows.iter().map(|row| row.to_json()).collect();
        serde_json::to_string_pretty(&rows_json)
            .map_err(|e| Error::generic(format!("Failed to serialize dataset rows: {}", e)))
    }

    /// One-line human-readable description (name, counts, size). The column
    /// count reflects the first row only, per `field_names`.
    pub fn summary(&self) -> String {
        format!(
            "Dataset '{}' - {} rows, {} columns, {}",
            self.metadata.name,
            self.rows.len(),
            self.field_names().len(),
            self.metadata.human_readable_size()
        )
    }
}
impl Default for Dataset {
fn default() -> Self {
Self::new(
"Untitled Dataset".to_string(),
"Unknown Schema".to_string(),
DataConfig::default(),
OutputFormat::Json,
)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_dataset_new() {
        let ds = Dataset::new(
            "TestDataset".to_string(),
            "TestSchema".to_string(),
            DataConfig::default(),
            OutputFormat::Json,
        );
        assert_eq!(ds.metadata.name, "TestDataset");
        assert_eq!(ds.metadata.schema_name, "TestSchema");
        assert!(ds.rows.is_empty());
    }

    #[test]
    fn test_dataset_default() {
        let ds = Dataset::default();
        assert_eq!(ds.metadata.name, "Untitled Dataset");
        assert_eq!(ds.metadata.schema_name, "Unknown Schema");
    }

    #[test]
    fn test_dataset_row_new() {
        let mut fields = HashMap::new();
        fields.insert("name".to_string(), serde_json::json!("test"));
        let row = DatasetRow::new("1".to_string(), fields);
        assert_eq!(row.id, "1");
        assert_eq!(row.data.len(), 1);
        assert!(row.metadata.is_empty());
    }

    #[test]
    fn test_dataset_row_metadata() {
        let mut fields = HashMap::new();
        fields.insert("name".to_string(), serde_json::json!("test"));
        let mut row = DatasetRow::new("1".to_string(), fields);
        row.add_metadata("source".to_string(), "test".to_string());
        assert_eq!(row.metadata.len(), 1);
    }
}