use crate::core::cardinality::{CardinalityConstraint, ValidationSeverity};
use crate::core::column_inference::{infer_column_types, InferenceConfig};
use crate::core::compatibility::MarkCompatibilityRules;
use crate::core::data::{DataType, FieldRole};
use crate::core::encoding::{Encoding, Field};
use crate::core::field_value::{DataTable, FieldValue};
use crate::core::mark::Mark;
/// Errors reported while validating a `DataTable` against an encoding or a mark.
///
/// The `Display` implementation renders each variant as a user-facing message.
#[derive(Debug, Clone, PartialEq)]
pub enum DataValidationError {
/// The table contains no rows.
EmptyDataset,
/// Names of encoded columns that no row of the table contains.
MissingColumns(Vec<String>),
/// Reported for a quantitative or temporal field whose column holds no
/// numeric or timestamp values at all.
IncompatibleType {
/// Column that failed the check.
column: String,
/// Human-readable description of the accepted value kinds.
expected: String,
/// Human-readable summary of what the column contains instead.
found: String,
},
/// Reported for a quantitative or temporal field when some, but fewer than
/// half, of the rows hold numeric or timestamp values.
InsufficientNumericData {
/// Column that failed the check.
column: String,
/// Number of rows holding a numeric or timestamp value.
numeric_count: usize,
/// Total number of rows in the table.
total_rows: usize,
},
/// The role derived from the field's data type is not accepted by the axis.
RoleMismatch {
/// Column bound to the axis.
column: String,
/// Name of the axis the column was bound to.
axis: String,
/// Role the axis asks for (the first of the axis's required roles).
expected_role: FieldRole,
/// Role derived from the field's declared data type.
actual_role: FieldRole,
},
/// A categorical axis received more distinct values than the chart's
/// cardinality constraint allows.
HighCardinality {
/// Column bound to the axis.
column: String,
/// Cardinality inferred for the column.
cardinality: usize,
/// Maximum cardinality allowed by the constraint.
threshold: usize,
/// Advice text taken from the constraint's warning message.
suggestion: String,
},
/// The field's declared data type is not allowed on this axis for the mark.
MarkIncompatible {
/// Mark being validated.
mark: Mark,
/// Column bound to the axis.
column: String,
/// Name of the axis the column was bound to.
axis: String,
/// Data types the axis allows for this mark.
required_types: Vec<DataType>,
/// Data type declared on the field.
actual_type: DataType,
},
}
impl std::fmt::Display for DataValidationError {
    /// Writes a single-line, user-facing description of the error.
    ///
    /// Every variant interpolates its own fields; `RoleMismatch` additionally
    /// appends a hint naming the kind of column the axis expects.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::EmptyDataset => write!(f, "Dataset is empty"),
            Self::MissingColumns(cols) => write!(f, "Missing columns: {}", cols.join(", ")),
            Self::IncompatibleType {
                column,
                expected,
                found,
            } => write!(
                f,
                "Column '{}' has incompatible type: expected {}, found {}",
                column, expected, found
            ),
            Self::InsufficientNumericData {
                column,
                numeric_count,
                total_rows,
            } => {
                // The variant is public, so `total_rows` may be zero; dividing
                // would then print "NaN%" or "inf%". Report 0.0% instead.
                let percent = if *total_rows == 0 {
                    0.0
                } else {
                    (*numeric_count as f64 / *total_rows as f64) * 100.0
                };
                write!(
                    f,
                    "Column '{}' has insufficient numeric data: {}/{} rows ({:.1}%)",
                    column, numeric_count, total_rows, percent
                )
            }
            Self::RoleMismatch {
                column,
                axis,
                expected_role,
                actual_role,
            } => {
                // Plain-language name for the kind of column the axis wants.
                let wanted_kind = match expected_role {
                    FieldRole::Dimension => "categorical (text/date)",
                    FieldRole::Measure => "numeric",
                };
                write!(
                    f,
                    "{} axis expects {:?} but column '{}' is classified as {:?}. Try using a {} column instead.",
                    axis, expected_role, column, actual_role, wanted_kind
                )
            }
            Self::HighCardinality {
                column,
                cardinality,
                threshold,
                suggestion,
            } => write!(
                f,
                "Column '{}' has {} unique values (threshold: {}). {}",
                column, cardinality, threshold, suggestion
            ),
            Self::MarkIncompatible {
                mark,
                column,
                axis,
                required_types,
                actual_type,
            } => write!(
                f,
                "{:?} charts require {:?} for {} axis, but column '{}' has {:?}",
                mark, required_types, axis, column, actual_type
            ),
        }
    }
}
/// Outcome of a validation pass: collected errors and warnings plus an overall severity.
#[derive(Debug, Clone)]
pub struct ValidationResult {
/// `true` for a result created by `ok()`; set to `false` by `with_errors` and `add_error`.
pub is_valid: bool,
/// Errors recorded so far.
pub errors: Vec<DataValidationError>,
/// Free-form warning messages recorded so far.
pub warnings: Vec<String>,
/// Overall severity; raised to `Warning` by `add_warning` (only from `Ok`)
/// and to `Error` by `with_errors` / `add_error`.
pub severity: ValidationSeverity,
}
impl ValidationResult {
    /// Creates a passing result with no errors, no warnings and `Ok` severity.
    pub fn ok() -> Self {
        let errors = Vec::new();
        let warnings = Vec::new();
        Self {
            severity: ValidationSeverity::Ok,
            is_valid: true,
            errors,
            warnings,
        }
    }

    /// Creates a failing result that carries `errors` and `Error` severity.
    ///
    /// The result is marked invalid regardless of whether `errors` is empty.
    pub fn with_errors(errors: Vec<DataValidationError>) -> Self {
        let warnings = Vec::new();
        Self {
            severity: ValidationSeverity::Error,
            is_valid: false,
            errors,
            warnings,
        }
    }

    /// Records a warning message.
    ///
    /// Severity is raised from `Ok` to `Warning`; any other severity is left
    /// untouched, so a warning never downgrades an existing error.
    pub fn add_warning(&mut self, warning: String) {
        self.warnings.push(warning);
        let untouched = self.severity == ValidationSeverity::Ok;
        if untouched {
            self.severity = ValidationSeverity::Warning;
        }
    }

    /// Records an error, marks the result invalid and sets severity to `Error`.
    pub fn add_error(&mut self, error: DataValidationError) {
        self.severity = ValidationSeverity::Error;
        self.is_valid = false;
        self.errors.push(error);
    }
}
impl DataTable {
    /// Checks that the table can back `encoding`.
    ///
    /// Validation stops at the first structural problem: an empty table yields
    /// `EmptyDataset`, and encoded columns (x, y and the optional color field)
    /// that no row contains yield `MissingColumns`. Otherwise every encoded
    /// field is checked against the values stored in its column, which may add
    /// further errors or warnings to the returned result.
    pub fn validate_for_encoding(&self, encoding: &Encoding) -> ValidationResult {
        let mut result = ValidationResult::ok();
        if self.rows().is_empty() {
            result.add_error(DataValidationError::EmptyDataset);
            return result;
        }
        let total_rows = self.rows().len();

        // The fields are only read, so borrow them instead of cloning each one.
        let mut required_fields: Vec<&Field> = vec![&encoding.x, &encoding.y];
        required_fields.extend(encoding.color.as_ref());

        let missing_cols: Vec<String> = required_fields
            .iter()
            .filter(|field| !self.has_column(&field.name))
            .map(|field| field.name.clone())
            .collect();
        if !missing_cols.is_empty() {
            result.add_error(DataValidationError::MissingColumns(missing_cols));
            return result;
        }

        for field in required_fields {
            self.validate_field(field, total_rows, &mut result);
        }
        result
    }

    /// Checks the values stored under `field.name` against the field's declared data type.
    ///
    /// * Quantitative / temporal fields: no numeric or timestamp value at all is
    ///   an `IncompatibleType` error; fewer than 50% such values is an
    ///   `InsufficientNumericData` error; anything short of 100% is a warning.
    /// * Nominal / ordinal fields: a warning is added when every row is null.
    ///
    /// `total_rows` is the row count of the table, supplied by the caller.
    fn validate_field(&self, field: &Field, total_rows: usize, result: &mut ValidationResult) {
        let col_name = &field.name;
        let mut numeric_count = 0;
        let mut text_count = 0;
        let mut null_count = 0;
        let mut bool_count = 0;
        for row in self.rows() {
            match row.get(col_name) {
                Some(FieldValue::Numeric(_)) | Some(FieldValue::Timestamp(_)) => {
                    numeric_count += 1
                }
                Some(FieldValue::Text(_)) => text_count += 1,
                Some(FieldValue::Bool(_)) => bool_count += 1,
                // A row without the cell carries no value for this column, so it
                // is tallied together with explicit nulls. This keeps the
                // "all values are null" check and the `found` summary accurate
                // for sparse rows.
                Some(FieldValue::Null) | None => null_count += 1,
            }
        }

        match &field.data_type {
            DataType::Quantitative | DataType::Temporal => {
                if numeric_count == 0 {
                    result.add_error(DataValidationError::IncompatibleType {
                        column: col_name.clone(),
                        expected: "numeric or timestamp".to_string(),
                        found: format!(
                            "{} text, {} bool, {} null",
                            text_count, bool_count, null_count
                        ),
                    });
                } else if numeric_count < total_rows {
                    let percent = (numeric_count as f64 / total_rows as f64) * 100.0;
                    if percent < 50.0 {
                        result.add_error(DataValidationError::InsufficientNumericData {
                            column: col_name.clone(),
                            numeric_count,
                            total_rows,
                        });
                    } else {
                        result.add_warning(format!(
                            "Column '{}': {}/{} rows ({:.1}%) will be filtered out (non-numeric data)",
                            col_name,
                            total_rows - numeric_count,
                            total_rows,
                            100.0 - percent
                        ));
                    }
                }
            }
            DataType::Nominal | DataType::Ordinal => {
                if null_count == total_rows {
                    result.add_warning(format!("Column '{}': all values are null", col_name));
                }
            }
        }
    }

    /// Returns `true` when at least one row contains a cell named `col_name`.
    fn has_column(&self, col_name: &str) -> bool {
        self.rows().iter().any(|row| row.contains_key(col_name))
    }

    /// Validates the table for `encoding` and then against the rules of `mark`.
    ///
    /// If the encoding-level validation fails, its result is returned as is.
    /// Otherwise the x and y fields, and the color field when the mark defines
    /// a color requirement, are each checked with `validate_axis` using the
    /// cardinality reported by `infer_column_types`.
    pub fn validate_for_mark(&self, encoding: &Encoding, mark: Mark) -> ValidationResult {
        let mut result = self.validate_for_encoding(encoding);
        if !result.is_valid {
            return result;
        }

        let rules = MarkCompatibilityRules::for_mark(mark);
        let columns = vec![encoding.x.name.clone(), encoding.y.name.clone()];
        let inferred = infer_column_types(self, &columns, InferenceConfig::default());
        // NOTE(review): the indexing below assumes `infer_column_types` returns
        // one entry per requested column, in request order; confirm.
        self.validate_axis(
            &encoding.x,
            &rules.x_axis,
            inferred[0].cardinality,
            mark,
            &mut result,
        );
        self.validate_axis(
            &encoding.y,
            &rules.y_axis,
            inferred[1].cardinality,
            mark,
            &mut result,
        );

        // Color is validated only when both the encoding and the mark define it.
        if let (Some(color_field), Some(color_req)) =
            (encoding.color.as_ref(), rules.color_channel.as_ref())
        {
            let color_columns = vec![color_field.name.clone()];
            let color_inferred =
                infer_column_types(self, &color_columns, InferenceConfig::default());
            self.validate_axis(
                color_field,
                color_req,
                color_inferred[0].cardinality,
                mark,
                &mut result,
            );
        }
        result
    }

    /// Checks one encoded field against one axis requirement of `mark`.
    ///
    /// Three independent checks are run, each of which may add to `result`:
    /// the role derived from the field's data type, the data type itself, and,
    /// for categorical axes only, the column's `cardinality` against the
    /// chart's cardinality constraint (error or warning depending on severity).
    fn validate_axis(
        &self,
        field: &Field,
        axis_req: &crate::core::compatibility::AxisRequirement,
        cardinality: usize,
        mark: Mark,
        result: &mut ValidationResult,
    ) {
        let field_role = FieldRole::from_data_type(field.data_type);
        if !axis_req.accepts_role(field_role) {
            result.add_error(DataValidationError::RoleMismatch {
                column: field.name.clone(),
                axis: axis_req.axis_name.to_string(),
                // NOTE(review): assumes `required_roles` is non-empty whenever
                // `accepts_role` can return false; otherwise this panics. Confirm.
                expected_role: axis_req.required_roles[0],
                actual_role: field_role,
            });
        }

        if !axis_req.accepts_data_type(field.data_type) {
            result.add_error(DataValidationError::MarkIncompatible {
                mark,
                column: field.name.clone(),
                axis: axis_req.axis_name.to_string(),
                required_types: axis_req.allowed_data_types.clone(),
                actual_type: field.data_type,
            });
        }

        if axis_req.axis_type == crate::core::compatibility::AxisType::Categorical {
            let constraint = CardinalityConstraint::for_chart(mark, axis_req.axis_name);
            match constraint.validate(cardinality) {
                ValidationSeverity::Error => {
                    result.add_error(DataValidationError::HighCardinality {
                        column: field.name.clone(),
                        cardinality,
                        threshold: constraint.max_allowed,
                        suggestion: constraint.warning_message.clone(),
                    });
                }
                ValidationSeverity::Warning => {
                    result.add_warning(format!(
                        "High cardinality: column '{}' has {} values (recommended: <{}). {}",
                        field.name,
                        cardinality,
                        constraint.max_recommended,
                        constraint.warning_message
                    ));
                }
                ValidationSeverity::Ok => {}
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::field_value::DataRow;
    use std::collections::HashMap;

    /// Builds a row holding exactly the two given cells.
    fn two_cell_row(first: (&str, FieldValue), second: (&str, FieldValue)) -> DataRow {
        let mut cells = HashMap::new();
        cells.insert(first.0.to_string(), first.1);
        cells.insert(second.0.to_string(), second.1);
        cells
    }

    /// Builds a row with numeric `x` and `y` cells.
    fn make_row(x: f64, y: f64) -> DataRow {
        two_cell_row(("x", FieldValue::Numeric(x)), ("y", FieldValue::Numeric(y)))
    }

    /// Encoding that binds quantitative `x` and `y` columns.
    fn quantitative_xy() -> Encoding {
        Encoding::new(Field::quantitative("x"), Field::quantitative("y"))
    }

    /// Table with a text `category` column and a numeric `value` column.
    fn category_table(entries: &[(&str, f64)]) -> DataTable {
        let rows: Vec<DataRow> = entries
            .iter()
            .map(|&(cat, val)| {
                two_cell_row(
                    ("category", FieldValue::Text(cat.to_string())),
                    ("value", FieldValue::Numeric(val)),
                )
            })
            .collect();
        DataTable::new(rows)
    }

    /// Table of `count` rows whose `label_column` holds `<prefix><i>` and whose
    /// `value` column holds `i`.
    fn labelled_table(label_column: &str, prefix: &str, count: u32) -> DataTable {
        let rows: Vec<DataRow> = (0..count)
            .map(|i| {
                two_cell_row(
                    (label_column, FieldValue::Text(format!("{}{}", prefix, i))),
                    ("value", FieldValue::Numeric(f64::from(i))),
                )
            })
            .collect();
        DataTable::new(rows)
    }

    /// Returns `true` when any recorded error satisfies `pred`.
    fn any_error(
        result: &ValidationResult,
        pred: impl Fn(&DataValidationError) -> bool,
    ) -> bool {
        result.errors.iter().any(pred)
    }

    #[test]
    fn test_validate_empty_dataset() {
        let table = DataTable::new(vec![]);
        let outcome = table.validate_for_encoding(&quantitative_xy());
        assert!(!outcome.is_valid);
        assert_eq!(outcome.errors, vec![DataValidationError::EmptyDataset]);
    }

    #[test]
    fn test_validate_missing_columns() {
        let table = DataTable::new(vec![make_row(1.0, 2.0)]);
        let encoding = Encoding::new(Field::quantitative("missing_x"), Field::quantitative("y"));
        let outcome = table.validate_for_encoding(&encoding);
        assert!(!outcome.is_valid);
        match outcome.errors.first() {
            Some(DataValidationError::MissingColumns(cols)) => {
                assert!(cols.iter().any(|name| name == "missing_x"));
            }
            other => panic!("expected MissingColumns as first error, got {:?}", other),
        }
    }

    #[test]
    fn test_validate_valid_data() {
        let table = DataTable::new(vec![make_row(1.0, 2.0), make_row(3.0, 4.0)]);
        let outcome = table.validate_for_encoding(&quantitative_xy());
        assert!(outcome.is_valid);
        assert!(outcome.errors.is_empty());
    }

    #[test]
    fn test_validate_insufficient_numeric_data() {
        // Only the first three of ten rows carry a numeric `y`.
        let rows: Vec<DataRow> = (0..10u32)
            .map(|i| {
                let y_cell = if i < 3 {
                    FieldValue::Numeric(f64::from(i))
                } else {
                    FieldValue::Text("not a number".to_string())
                };
                two_cell_row(("x", FieldValue::Numeric(f64::from(i))), ("y", y_cell))
            })
            .collect();
        let table = DataTable::new(rows);
        let outcome = table.validate_for_encoding(&quantitative_xy());
        assert!(!outcome.is_valid);
        assert!(any_error(&outcome, |e| match e {
            DataValidationError::InsufficientNumericData { column, .. } => column == "y",
            _ => false,
        }));
    }

    #[test]
    fn test_validate_for_mark_bar_chart_with_quantitative_x() {
        let table = DataTable::new(vec![make_row(1.0, 10.0), make_row(2.0, 20.0)]);
        let outcome = table.validate_for_mark(&quantitative_xy(), Mark::Bar);
        assert!(!outcome.is_valid);
        assert!(any_error(&outcome, |e| match e {
            DataValidationError::RoleMismatch { column, .. } => column == "x",
            _ => false,
        }));
    }

    #[test]
    fn test_validate_for_mark_bar_chart_with_nominal_x() {
        let table = category_table(&[("A", 10.0), ("B", 20.0), ("C", 15.0)]);
        let encoding = Encoding::new(Field::nominal("category"), Field::quantitative("value"));
        let outcome = table.validate_for_mark(&encoding, Mark::Bar);
        assert!(outcome.is_valid);
        assert!(outcome.errors.is_empty());
    }

    #[test]
    fn test_validate_for_mark_bar_chart_high_cardinality_warning() {
        let table = labelled_table("category", "cat_", 75);
        let encoding = Encoding::new(Field::nominal("category"), Field::quantitative("value"));
        let outcome = table.validate_for_mark(&encoding, Mark::Bar);
        assert!(outcome.is_valid);
        assert_eq!(outcome.severity, ValidationSeverity::Warning);
        assert!(!outcome.warnings.is_empty());
        assert!(outcome.warnings[0].contains("High cardinality"));
    }

    #[test]
    fn test_validate_for_mark_bar_chart_high_cardinality_error() {
        let table = labelled_table("category", "cat_", 250);
        let encoding = Encoding::new(Field::nominal("category"), Field::quantitative("value"));
        let outcome = table.validate_for_mark(&encoding, Mark::Bar);
        assert!(!outcome.is_valid);
        assert!(any_error(&outcome, |e| match e {
            DataValidationError::HighCardinality { cardinality, .. } => *cardinality == 250,
            _ => false,
        }));
    }

    #[test]
    fn test_validate_for_mark_scatter_with_nominal_data() {
        let table = category_table(&[("A", 10.0), ("B", 20.0)]);
        let encoding = Encoding::new(Field::nominal("category"), Field::quantitative("value"));
        let outcome = table.validate_for_mark(&encoding, Mark::Point);
        assert!(!outcome.is_valid);
        assert!(any_error(&outcome, |e| match e {
            DataValidationError::MarkIncompatible { mark, .. } => *mark == Mark::Point,
            _ => false,
        }));
    }

    #[test]
    fn test_validate_for_mark_pie_chart_too_many_slices() {
        let table = labelled_table("slice", "s_", 35);
        let encoding = Encoding::new(Field::nominal("slice"), Field::quantitative("value"));
        let outcome = table.validate_for_mark(&encoding, Mark::Arc);
        assert!(!outcome.is_valid);
        assert!(any_error(&outcome, |e| match e {
            DataValidationError::HighCardinality { cardinality, .. } => *cardinality == 35,
            _ => false,
        }));
    }

    #[test]
    fn test_validate_for_mark_line_chart_with_temporal_x() {
        // Ten daily timestamps paired with a numeric value.
        let rows: Vec<DataRow> = (0..10u32)
            .map(|i| {
                let step = f64::from(i);
                two_cell_row(
                    (
                        "date",
                        FieldValue::Timestamp(1_000_000.0 + (step * 86400.0 * 1000.0)),
                    ),
                    ("value", FieldValue::Numeric(step * 10.0)),
                )
            })
            .collect();
        let table = DataTable::new(rows);
        let encoding = Encoding::new(Field::temporal("date"), Field::quantitative("value"));
        let outcome = table.validate_for_mark(&encoding, Mark::Line);
        assert!(outcome.is_valid);
        assert!(outcome.errors.is_empty());
    }
}