use std::path::Path;
use peprs_core::project::Project;
use peprs_core::utils::any_value_to_json;
use polars_jsonschema_bridge::schema_to_polars_fields;
use serde_json::Value;
use tracing::warn;
use crate::error::{EidoError, MissingFile, Result, ValidationError};
use crate::schema::EidoSchema;
/// Validates every sample of `project` against the sample section of `schema`.
///
/// Imported schemas are validated first (recursively) and their violations are
/// merged with the ones found for this schema. When the schema declares a
/// sample schema, a table-level structural pre-check runs before each sample
/// (as serialized by `Project::to_json_string`) is checked with the compiled
/// JSON-schema validator.
///
/// # Errors
///
/// * `EidoError::Validation` carrying every violation collected from imports,
///   the structural pre-check, and the per-sample validation.
/// * `EidoError::SchemaCompile` if this schema's (or an imported schema's)
///   sample schema cannot be compiled.
/// * `EidoError::Project` if the project cannot be serialized to JSON.
/// * The error produced by the `?` conversion if that JSON cannot be parsed
///   back into a list of values.
pub fn validate_samples(project: &Project, schema: &EidoSchema) -> Result<()> {
    let mut errors = Vec::new();

    for import in &schema.imports {
        match validate_samples(project, import) {
            Ok(()) => {}
            Err(EidoError::Validation(import_errors)) => errors.extend(import_errors),
            // A non-validation failure (e.g. an imported schema that does not
            // compile) means the import was never actually checked; surface it
            // instead of silently reporting success.
            Err(other) => return Err(other),
        }
    }

    let Some(sample_schema) = &schema.sample_schema else {
        return if errors.is_empty() {
            Ok(())
        } else {
            Err(EidoError::Validation(errors))
        };
    };

    if let Err(structural_errors) = structural_precheck(project, sample_schema) {
        errors.extend(structural_errors);
    }

    let validator = jsonschema::validator_for(sample_schema)
        .map_err(|e| EidoError::SchemaCompile(format!("Failed to compile sample schema: {e}")))?;

    let json_str = project.to_json_string().map_err(EidoError::Project)?;
    let samples_json: Vec<Value> = serde_json::from_str(&json_str)?;

    let sample_index = &project.sample_table_index;
    for sample_value in &samples_json {
        // Only string-valued index attributes are used as the display name;
        // anything else is reported as "<unknown>".
        let sample_name = sample_value
            .get(sample_index)
            .and_then(|v| v.as_str())
            .unwrap_or("<unknown>")
            .to_string();

        for error in validator.iter_errors(sample_value) {
            errors.push(ValidationError {
                path: error.instance_path.to_string(),
                message: format_schema_error(&error, sample_schema),
                sample_name: Some(sample_name.clone()),
            });
        }
    }

    if errors.is_empty() {
        Ok(())
    } else {
        Err(EidoError::Validation(errors))
    }
}
/// Validates the project-level configuration against the project section of
/// `schema`.
///
/// Imported schemas are validated first (recursively) and their violations are
/// merged with the ones found for this schema. The instance that is validated
/// is the raw config value stored on the project; when the project has no
/// config, or the config has no raw value, an empty JSON object is validated
/// instead.
///
/// # Errors
///
/// * `EidoError::Validation` carrying every violation collected from imports
///   and from this schema.
/// * `EidoError::SchemaCompile` if this schema's (or an imported schema's)
///   project schema cannot be compiled.
pub fn validate_project(project: &Project, schema: &EidoSchema) -> Result<()> {
    let mut errors = Vec::new();

    for import in &schema.imports {
        match validate_project(project, import) {
            Ok(()) => {}
            Err(EidoError::Validation(import_errors)) => errors.extend(import_errors),
            // A non-validation failure (e.g. an imported schema that does not
            // compile) means the import was never actually checked; surface it
            // instead of silently reporting success.
            Err(other) => return Err(other),
        }
    }

    let Some(project_schema) = &schema.project_schema else {
        return if errors.is_empty() {
            Ok(())
        } else {
            Err(EidoError::Validation(errors))
        };
    };

    // Borrow the raw config rather than cloning it; fall back to `{}` when
    // there is nothing to validate.
    let empty_config = Value::Object(serde_json::Map::new());
    let config_value = project
        .config
        .as_ref()
        .and_then(|cfg| cfg.raw.as_ref())
        .unwrap_or(&empty_config);

    let validator = jsonschema::validator_for(project_schema)
        .map_err(|e| EidoError::SchemaCompile(format!("Failed to compile project schema: {e}")))?;

    for error in validator.iter_errors(config_value) {
        errors.push(ValidationError {
            path: error.instance_path.to_string(),
            message: format_schema_error(&error, project_schema),
            sample_name: None,
        });
    }

    if errors.is_empty() {
        Ok(())
    } else {
        Err(EidoError::Validation(errors))
    }
}
pub fn validate_input_files(project: &Project, schema: &EidoSchema) -> Result<()> {
if schema.tangible.is_empty() {
return Ok(());
}
let mut missing = Vec::new();
let sample_index = &project.sample_table_index;
for sample in project.iter_samples() {
let sample_name = sample
.get(sample_index)
.map(|v| any_value_to_json(v.clone()))
.and_then(|v| v.as_str().map(String::from))
.unwrap_or_else(|| "<unknown>".to_string());
for attr in &schema.tangible {
let Some(value) = sample.get(attr) else {
missing.push(MissingFile {
sample_name: sample_name.clone(),
attribute: attr.clone(),
path: "<attribute not found>".to_string(),
});
continue;
};
let json_val = any_value_to_json(value.clone());
let paths: Vec<&str> = match &json_val {
Value::String(s) => vec![s.as_str()],
Value::Array(arr) => arr.iter().filter_map(|v| v.as_str()).collect(),
_ => continue,
};
for p in paths {
if p.is_empty() || p == "null" {
missing.push(MissingFile {
sample_name: sample_name.clone(),
attribute: attr.clone(),
path: "<empty>".to_string(),
});
} else if !Path::new(p).exists() {
missing.push(MissingFile {
sample_name: sample_name.clone(),
attribute: attr.clone(),
path: p.to_string(),
});
}
}
}
}
for sample in project.iter_samples() {
let sample_name = sample
.get(sample_index)
.map(|v| any_value_to_json(v.clone()))
.and_then(|v| v.as_str().map(String::from))
.unwrap_or_else(|| "<unknown>".to_string());
for attr in &schema.files {
if schema.tangible.contains(attr) {
continue;
}
if let Some(value) = sample.get(attr) {
let json_val = any_value_to_json(value.clone());
let paths: Vec<&str> = match &json_val {
Value::String(s) => vec![s.as_str()],
Value::Array(arr) => arr.iter().filter_map(|v| v.as_str()).collect(),
_ => continue,
};
for p in paths {
if !p.is_empty() && p != "null" && !Path::new(p).exists() {
warn!(
sample = sample_name,
attribute = attr,
path = p,
"Optional file attribute points to non-existent file"
);
}
}
}
}
}
if missing.is_empty() {
Ok(())
} else {
Err(EidoError::MissingFiles(missing))
}
}
/// Validates one already-serialized sample against the sample section of
/// `schema`.
///
/// Imported schemas are validated first (recursively) and their violations are
/// merged with the ones found for this schema. `sample_name` is attached to
/// every reported violation.
///
/// # Errors
///
/// * `EidoError::Validation` carrying every violation collected from imports
///   and from this schema.
/// * `EidoError::SchemaCompile` if this schema's (or an imported schema's)
///   sample schema cannot be compiled.
pub fn validate_single_sample(
    sample: &Value,
    schema: &EidoSchema,
    sample_name: &str,
) -> Result<()> {
    let mut errors = Vec::new();

    for import in &schema.imports {
        match validate_single_sample(sample, import, sample_name) {
            Ok(()) => {}
            Err(EidoError::Validation(import_errors)) => errors.extend(import_errors),
            // A non-validation failure (e.g. an imported schema that does not
            // compile) means the import was never actually checked; surface it
            // instead of silently reporting success.
            Err(other) => return Err(other),
        }
    }

    let Some(sample_schema) = &schema.sample_schema else {
        return if errors.is_empty() {
            Ok(())
        } else {
            Err(EidoError::Validation(errors))
        };
    };

    let validator = jsonschema::validator_for(sample_schema)
        .map_err(|e| EidoError::SchemaCompile(format!("Failed to compile sample schema: {e}")))?;

    for error in validator.iter_errors(sample) {
        errors.push(ValidationError {
            path: error.instance_path.to_string(),
            message: format_schema_error(&error, sample_schema),
            sample_name: Some(sample_name.to_string()),
        });
    }

    if errors.is_empty() {
        Ok(())
    } else {
        Err(EidoError::Validation(errors))
    }
}
/// Renders a JSON-schema validation error as a human-readable message.
///
/// Errors of kind `AnyOf` are rewritten as a "type mismatch" message listing
/// the `type` of each `anyOf` variant found in `schema` at the error's schema
/// path. Every other error, and any `AnyOf` error whose variants cannot be
/// resolved or declare no `type`, falls back to the error's own `Display`
/// output.
fn format_schema_error(error: &jsonschema::ValidationError, schema: &Value) -> String {
    use jsonschema::error::ValidationErrorKind;

    if !matches!(error.kind, ValidationErrorKind::AnyOf) {
        return error.to_string();
    }

    // Look up the `anyOf` array in the schema and gather the declared types.
    let pointer = error.schema_path.to_string();
    let expected: Vec<&str> = navigate_json_pointer(schema, &pointer)
        .and_then(|node| node.as_array())
        .map(|variants| {
            variants
                .iter()
                .filter_map(|variant| variant.get("type").and_then(|t| t.as_str()))
                .collect()
        })
        .unwrap_or_default();

    if expected.is_empty() {
        return error.to_string();
    }

    let actual = json_type_name(&error.instance);
    let location = error.instance_path.to_string();
    let field_label = match location.as_str() {
        "" => String::new(),
        field => format!(" at '{field}'"),
    };

    format!(
        "type mismatch{field_label}: got {actual}, expected {}",
        expected.join(" or ")
    )
}
/// Returns the JSON-schema type name of `value`.
///
/// Numbers are reported as `"number"` only when they are stored as a float
/// that has no `i64` representation; all other numbers are `"integer"`.
fn json_type_name(value: &Value) -> &'static str {
    match value {
        Value::Null => "null",
        Value::Bool(_) => "boolean",
        Value::String(_) => "string",
        Value::Array(_) => "array",
        Value::Object(_) => "object",
        Value::Number(n) if n.is_f64() && n.as_i64().is_none() => "number",
        Value::Number(_) => "integer",
    }
}
/// Resolves a JSON Pointer (RFC 6901) against `root`.
///
/// Delegates to `serde_json::Value::pointer`, which — unlike a plain
/// key-by-key `Value::get(&str)` walk — also steps into arrays by numeric
/// index and decodes the `~0` / `~1` escape sequences. An empty pointer
/// resolves to `root` itself; a pointer that does not resolve yields `None`.
fn navigate_json_pointer<'a>(root: &'a Value, pointer: &str) -> Option<&'a Value> {
    root.pointer(pointer)
}
/// Table-level sanity check of the sample table against `sample_schema`.
///
/// Reports, without reference to any individual sample:
///
/// * every name listed under the schema's `required` array that is not a
///   column of the sample table;
/// * every schema property whose column exists but whose dtype is not
///   considered compatible (see `dtype_str_compatible`) with the dtype
///   derived from the schema by `schema_to_polars_fields`.
///
/// The check is skipped (returns `Ok`) when the schema has no `properties`
/// or when the schema cannot be converted to polars fields; the latter case
/// is logged with `warn!`.
fn structural_precheck(
    project: &Project,
    sample_schema: &Value,
) -> std::result::Result<(), Vec<ValidationError>> {
    let Some(properties) = sample_schema.get("properties") else {
        return Ok(());
    };

    // The bridge gets a reduced schema in which each `anyOf` property has
    // been collapsed to its first variant.
    let bridge_input = serde_json::json!({
        "type": "object",
        "properties": unwrap_any_of_properties(properties),
    });
    let conversion = schema_to_polars_fields(
        &bridge_input,
        polars_jsonschema_bridge::SchemaFormat::JsonSchema,
        false,
    );
    let expected_fields = match conversion {
        Err(e) => {
            warn!(error = %e, "polars-jsonschema-bridge could not parse schema, skipping structural pre-check");
            return Ok(());
        }
        Ok(fields) => fields,
    };

    let df_schema = project.samples.schema();

    // Required columns that are absent from the table come first.
    let mut problems: Vec<ValidationError> = sample_schema
        .get("required")
        .and_then(|r| r.as_array())
        .into_iter()
        .flatten()
        .filter_map(|req| req.as_str())
        .filter(|&col_name| df_schema.get(col_name).is_none())
        .map(|col_name| ValidationError {
            path: format!("/properties/{col_name}"),
            message: format!("Required column '{col_name}' is missing from sample table"),
            sample_name: None,
        })
        .collect();

    // Then columns that exist but carry an incompatible dtype.
    for (field_name, expected_dtype_str) in &expected_fields {
        let Some(df_dtype) = df_schema.get(field_name.as_str()) else {
            continue;
        };
        if dtype_str_compatible(df_dtype, expected_dtype_str) {
            continue;
        }
        problems.push(ValidationError {
            path: format!("/properties/{field_name}"),
            message: format!(
                "Column '{field_name}' has type {df_dtype:?} but schema expects {expected_dtype_str}"
            ),
            sample_name: None,
        });
    }

    if problems.is_empty() {
        Ok(())
    } else {
        Err(problems)
    }
}
/// Returns a copy of a schema `properties` object in which every property
/// defined through a non-empty `anyOf` array is replaced by the first variant
/// of that array. Other properties are copied unchanged.
///
/// If `properties` is not a JSON object it is returned as a plain clone.
fn unwrap_any_of_properties(properties: &Value) -> Value {
    let Some(entries) = properties.as_object() else {
        return properties.clone();
    };

    let collapsed: serde_json::Map<String, Value> = entries
        .iter()
        .map(|(name, definition)| {
            // Prefer the first `anyOf` alternative; otherwise keep the
            // definition as it is.
            let chosen = definition
                .get("anyOf")
                .and_then(|alternatives| alternatives.as_array())
                .and_then(|alternatives| alternatives.first())
                .unwrap_or(definition);
            (name.clone(), chosen.clone())
        })
        .collect();

    Value::Object(collapsed)
}
/// Decides whether a column dtype is acceptable for an expected dtype name.
///
/// `expected_str` is compared against the `Debug` rendering of `actual`
/// (presumably the naming used by the schema bridge — confirm against its
/// output). The comparison is deliberately lenient:
///
/// * an exact textual match is compatible;
/// * a `String` column, or an expected type of `"String"`, is always
///   compatible;
/// * a list column is checked recursively against the inner type when the
///   expectation has the form `List[...]`;
/// * any integer or float column is compatible with any integer or float
///   expectation.
///
/// Everything else is incompatible.
fn dtype_str_compatible(actual: &polars::prelude::DataType, expected_str: &str) -> bool {
    use polars::prelude::DataType;

    if format!("{actual:?}") == expected_str {
        return true;
    }

    if expected_str == "String" || matches!(actual, DataType::String) {
        return true;
    }

    if let DataType::List(inner) = actual {
        let inner_expected = expected_str
            .strip_prefix("List[")
            .and_then(|rest| rest.strip_suffix(']'));
        if let Some(inner_expected) = inner_expected {
            return dtype_str_compatible(inner, inner_expected);
        }
    }

    // Integers and floats of any width are treated as interchangeable.
    let actual_is_numeric = matches!(
        actual,
        DataType::Int8
            | DataType::Int16
            | DataType::Int32
            | DataType::Int64
            | DataType::UInt8
            | DataType::UInt16
            | DataType::UInt32
            | DataType::UInt64
            | DataType::Float32
            | DataType::Float64
    );
    let expected_is_numeric = matches!(
        expected_str,
        "Int8"
            | "Int16"
            | "Int32"
            | "Int64"
            | "UInt8"
            | "UInt16"
            | "UInt32"
            | "UInt64"
            | "Float32"
            | "Float64"
    );

    actual_is_numeric && expected_is_numeric
}