#![allow(missing_docs)]
pub mod validation;
pub use validation::{
CoercionMode, SchemaInference, SchemaValidator, TypeCoercion, ValidationReport,
ValidationViolation,
};
use crate::error::{IoError, Result};
use chrono::{DateTime, NaiveDate, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt;
/// Logical data type of a schema field.
///
/// `FieldType::type_name` yields the textual name of each variant, and the
/// `Display` implementation prints that same name.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum FieldType {
/// Integer type named `int8`.
Int8,
/// Integer type named `int16`.
Int16,
/// Integer type named `int32`.
Int32,
/// Integer type named `int64`.
Int64,
/// Integer type named `uint32`.
UInt32,
/// Integer type named `uint64`.
UInt64,
/// Floating-point type named `float32`.
Float32,
/// Floating-point type named `float64`.
Float64,
/// Text type; note that its textual name is `string`, not `utf8`.
Utf8,
/// Boolean type named `boolean`.
Boolean,
/// Date type named `date`.
Date,
/// Timestamp type named `timestamp`.
Timestamp,
/// Binary type named `binary`.
Binary,
/// JSON type named `json`.
Json,
/// List type; the boxed value is the element type and the textual name is
/// `list<ELEM>` with `ELEM` being the element's own name.
List(Box<FieldType>),
/// Struct type carrying its member fields; the textual name is always the
/// plain word `struct`, regardless of the members.
Struct(Vec<SchemaField>),
/// Decimal type; the textual name is `decimal(precision,scale)`.
Decimal {
/// Precision component (printed first in the textual name).
precision: u8,
/// Scale component (printed second in the textual name).
scale: u8,
},
}
impl FieldType {
pub fn type_name(&self) -> String {
match self {
FieldType::Int8 => "int8".to_string(),
FieldType::Int16 => "int16".to_string(),
FieldType::Int32 => "int32".to_string(),
FieldType::Int64 => "int64".to_string(),
FieldType::UInt32 => "uint32".to_string(),
FieldType::UInt64 => "uint64".to_string(),
FieldType::Float32 => "float32".to_string(),
FieldType::Float64 => "float64".to_string(),
FieldType::Utf8 => "string".to_string(),
FieldType::Boolean => "boolean".to_string(),
FieldType::Date => "date".to_string(),
FieldType::Timestamp => "timestamp".to_string(),
FieldType::Binary => "binary".to_string(),
FieldType::Json => "json".to_string(),
FieldType::List(elem) => format!("list<{}>", elem.type_name()),
FieldType::Struct(_) => "struct".to_string(),
FieldType::Decimal { precision, scale } => {
format!("decimal({precision},{scale})")
}
}
}
pub fn is_numeric(&self) -> bool {
matches!(
self,
FieldType::Int8
| FieldType::Int16
| FieldType::Int32
| FieldType::Int64
| FieldType::UInt32
| FieldType::UInt64
| FieldType::Float32
| FieldType::Float64
| FieldType::Decimal { .. }
)
}
pub fn is_integer(&self) -> bool {
matches!(
self,
FieldType::Int8
| FieldType::Int16
| FieldType::Int32
| FieldType::Int64
| FieldType::UInt32
| FieldType::UInt64
)
}
pub fn is_float(&self) -> bool {
matches!(self, FieldType::Float32 | FieldType::Float64)
}
}
/// Prints a [`FieldType`] as the string returned by `FieldType::type_name`.
impl fmt::Display for FieldType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Emit the name verbatim; width/fill flags of the caller's format
        // spec are not applied to it.
        let rendered = self.type_name();
        f.write_str(&rendered)
    }
}
/// Constraint that can be attached to a [`SchemaField`].
///
/// The code visible here only declares the constraints and their names (see
/// `Constraint::name`); how each one is evaluated is defined elsewhere.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum Constraint {
/// Constraint named `unique`.
/// NOTE(review): presumably requires distinct values — confirm against the
/// code that evaluates constraints.
Unique,
/// Constraint named `range`, carrying two bounds as JSON values.
/// NOTE(review): whether the bounds are inclusive is not visible here.
Range {
/// Lower bound.
min: serde_json::Value,
/// Upper bound.
max: serde_json::Value,
},
/// Constraint named `regex`, carrying the pattern as a string.
Regex(String),
/// Constraint named `max_length`, carrying the limit.
MaxLength(usize),
/// Constraint named `min_length`, carrying the limit.
MinLength(usize),
/// Constraint named `one_of`, carrying a list of JSON values.
OneOf(Vec<serde_json::Value>),
/// User-defined constraint; `Constraint::name` returns its `name` field.
Custom {
/// Name reported by `Constraint::name`.
name: String,
/// Free-form description of the constraint.
description: String,
},
}
impl Constraint {
pub fn name(&self) -> &str {
match self {
Constraint::Unique => "unique",
Constraint::Range { .. } => "range",
Constraint::Regex(_) => "regex",
Constraint::MaxLength(_) => "max_length",
Constraint::MinLength(_) => "min_length",
Constraint::OneOf(_) => "one_of",
Constraint::Custom { name, .. } => name,
}
}
}
/// Description of a single field (column) of a [`Schema`].
///
/// Instances are usually created with `SchemaField::new` and refined with the
/// chainable `with_*` / `nullable` / `not_nullable` methods.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SchemaField {
/// Field name; `Schema::field` and `Schema::merge` match fields by this value.
pub name: String,
/// Declared type of the field.
pub field_type: FieldType,
/// Nullability flag; `SchemaField::new` initialises it to `true`.
pub nullable: bool,
/// Optional default value expressed as JSON; `None` unless explicitly set.
pub default: Option<serde_json::Value>,
/// Constraints attached to the field, in the order they were added.
pub constraints: Vec<Constraint>,
/// Optional human-readable description.
pub description: Option<String>,
/// Free-form string key/value metadata.
pub metadata: HashMap<String, String>,
}
impl SchemaField {
    /// Creates a field with the given name and type.
    ///
    /// The new field is nullable and has no default value, no constraints,
    /// no description and empty metadata.
    pub fn new(name: impl Into<String>, field_type: FieldType) -> Self {
        let name = name.into();
        Self {
            name,
            field_type,
            nullable: true,
            default: None,
            description: None,
            constraints: Vec::new(),
            metadata: HashMap::new(),
        }
    }

    /// Returns the field with its nullability flag cleared.
    pub fn not_nullable(self) -> Self {
        Self {
            nullable: false,
            ..self
        }
    }

    /// Returns the field with its nullability flag set.
    pub fn nullable(self) -> Self {
        Self {
            nullable: true,
            ..self
        }
    }

    /// Returns the field with `default` as its default value, replacing any
    /// previous one.
    pub fn with_default(self, default: serde_json::Value) -> Self {
        Self {
            default: Some(default),
            ..self
        }
    }

    /// Returns the field with `c` appended to its constraint list.
    pub fn with_constraint(mut self, c: Constraint) -> Self {
        let constraints = &mut self.constraints;
        constraints.push(c);
        self
    }

    /// Returns the field with its description set, replacing any previous one.
    pub fn with_description(self, desc: impl Into<String>) -> Self {
        Self {
            description: Some(desc.into()),
            ..self
        }
    }

    /// Returns the field with the metadata entry `key -> value` inserted;
    /// an existing entry under the same key is overwritten.
    pub fn with_metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
        let (entry_key, entry_value) = (key.into(), value.into());
        self.metadata.insert(entry_key, entry_value);
        self
    }
}
/// A collection of [`SchemaField`]s plus optional descriptive attributes.
///
/// The derived `Default` produces a schema with no fields, no name,
/// description or version, and empty metadata.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
pub struct Schema {
/// Fields of the schema, in insertion order.
pub fields: Vec<SchemaField>,
/// Optional schema name.
pub name: Option<String>,
/// Optional human-readable description.
pub description: Option<String>,
/// Optional version string.
pub version: Option<String>,
/// Free-form string key/value metadata.
pub metadata: HashMap<String, String>,
}
impl Schema {
    /// Creates an empty schema (equivalent to `Schema::default()`).
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns a [`SchemaBuilder`] for assembling a schema step by step.
    pub fn builder() -> SchemaBuilder {
        SchemaBuilder::default()
    }

    /// Looks up a field by exact name.
    ///
    /// Returns the first field whose name equals `name`, or `None` if there
    /// is no such field.
    pub fn field(&self, name: &str) -> Option<&SchemaField> {
        self.fields.iter().find(|f| f.name == name)
    }

    /// Returns the number of fields.
    pub fn len(&self) -> usize {
        self.fields.len()
    }

    /// Returns `true` if the schema has no fields.
    pub fn is_empty(&self) -> bool {
        self.fields.is_empty()
    }

    /// Returns the field names in field order.
    pub fn column_names(&self) -> Vec<&str> {
        self.fields.iter().map(|f| f.name.as_str()).collect()
    }

    /// Merges the fields of `other` into this schema.
    ///
    /// Fields are matched by name. A field of `other` that is not yet known
    /// is appended (cloned) in the order it appears in `other`. A field that
    /// is already known with the same type is skipped, so the existing
    /// definition (nullability, default, constraints, ...) is kept. Only
    /// `fields` is merged; `name`, `description`, `version` and `metadata`
    /// of `self` are left untouched.
    ///
    /// # Errors
    ///
    /// Returns [`IoError::ValidationError`] if a field of `other` has the same
    /// name as an already known field but a different type. The merge is
    /// all-or-nothing: when an error is returned `self` has not been modified.
    pub fn merge(&mut self, other: &Schema) -> Result<()> {
        // Phase 1: validate everything and collect the additions without
        // touching `self`, so a conflict found late cannot leave the schema
        // half-merged.
        let mut additions: Vec<&SchemaField> = Vec::new();
        for incoming in &other.fields {
            // Earlier additions count as known too, so duplicate names inside
            // `other` are checked against each other as well.
            let known = self
                .fields
                .iter()
                .chain(additions.iter().copied())
                .find(|f| f.name == incoming.name);
            match known {
                Some(existing) if existing.field_type != incoming.field_type => {
                    return Err(IoError::ValidationError(format!(
                        "Incompatible types for field '{}': {:?} vs {:?}",
                        incoming.name, existing.field_type, incoming.field_type
                    )));
                }
                Some(_) => {}
                None => additions.push(incoming),
            }
        }

        // Phase 2: commit. Nothing below can fail.
        self.fields.extend(additions.into_iter().cloned());
        Ok(())
    }

    /// Serializes the schema to pretty-printed JSON.
    ///
    /// # Errors
    ///
    /// Returns [`IoError::SerializationError`] carrying the serializer's
    /// message if serialization fails.
    pub fn to_json(&self) -> Result<String> {
        serde_json::to_string_pretty(self)
            .map_err(|e| IoError::SerializationError(e.to_string()))
    }

    /// Parses a schema from its JSON representation.
    ///
    /// # Errors
    ///
    /// Returns [`IoError::DeserializationError`] carrying the parser's message
    /// if `json` cannot be deserialized into a schema.
    pub fn from_json(json: &str) -> Result<Self> {
        serde_json::from_str(json).map_err(|e| IoError::DeserializationError(e.to_string()))
    }
}
/// Chainable builder for [`Schema`].
///
/// Obtained from `Schema::builder()` (or `SchemaBuilder::default()`); every
/// attribute starts out empty and `build` moves the collected values into a
/// `Schema`.
#[derive(Debug, Default)]
pub struct SchemaBuilder {
// Fields collected so far, in the order they were added.
fields: Vec<SchemaField>,
// Optional schema name.
name: Option<String>,
// Optional schema description.
description: Option<String>,
// Optional schema version.
version: Option<String>,
// Metadata entries collected so far.
metadata: HashMap<String, String>,
}
impl SchemaBuilder {
    /// Appends a field to the schema under construction.
    pub fn field(mut self, f: SchemaField) -> Self {
        let collected = &mut self.fields;
        collected.push(f);
        self
    }

    /// Sets the schema name, replacing any previous one.
    pub fn name(self, name: impl Into<String>) -> Self {
        Self {
            name: Some(name.into()),
            ..self
        }
    }

    /// Sets the schema description, replacing any previous one.
    pub fn description(self, desc: impl Into<String>) -> Self {
        Self {
            description: Some(desc.into()),
            ..self
        }
    }

    /// Sets the schema version, replacing any previous one.
    pub fn version(self, version: impl Into<String>) -> Self {
        Self {
            version: Some(version.into()),
            ..self
        }
    }

    /// Inserts the metadata entry `key -> value`; an existing entry under the
    /// same key is overwritten.
    pub fn metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
        let (entry_key, entry_value) = (key.into(), value.into());
        self.metadata.insert(entry_key, entry_value);
        self
    }

    /// Consumes the builder and returns the assembled [`Schema`].
    pub fn build(self) -> Schema {
        let Self {
            fields,
            name,
            description,
            version,
            metadata,
        } = self;
        Schema {
            fields,
            name,
            description,
            version,
            metadata,
        }
    }
}
/// A single value tagged with its runtime representation.
///
/// `TypedValue::inferred_type` maps each variant to a [`FieldType`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum TypedValue {
/// Absent value; `inferred_type` reports it as `FieldType::Json`.
Null,
/// Boolean value.
Boolean(bool),
/// Signed integer stored as `i64`; `inferred_type` reports `FieldType::Int64`.
Int(i64),
/// Unsigned integer stored as `u64`; `inferred_type` reports `FieldType::UInt64`.
UInt(u64),
/// Floating-point number stored as `f64`; `inferred_type` reports `FieldType::Float64`.
Float(f64),
/// Text value.
Utf8(String),
/// Calendar date without time zone (`chrono::NaiveDate`).
Date(NaiveDate),
/// Point in time expressed in UTC (`chrono::DateTime<Utc>`).
Timestamp(DateTime<Utc>),
/// Raw bytes.
Binary(Vec<u8>),
/// Arbitrary JSON value kept as-is.
Json(serde_json::Value),
}
impl TypedValue {
    /// Converts a JSON value into a [`TypedValue`] of the requested field type
    /// without any lossy coercion.
    ///
    /// Conversion rules, checked in this order:
    ///
    /// * JSON `null` becomes [`TypedValue::Null`] for every field type.
    /// * `Boolean` accepts JSON booleans.
    /// * Integer types accept JSON integers that fit the declared type:
    ///   `Int8`/`Int16`/`Int32`/`Int64` yield [`TypedValue::Int`], while
    ///   `UInt32`/`UInt64` yield [`TypedValue::UInt`]. Fractional numbers,
    ///   negative numbers for unsigned types and out-of-range values are
    ///   rejected.
    /// * `Float32`/`Float64` accept any JSON number representable as `f64`.
    /// * `Utf8` accepts JSON strings.
    /// * `Date` and `Timestamp` accept JSON strings that parse through the
    ///   `FromStr` implementations of `NaiveDate` and `DateTime<Utc>`.
    /// * `Json` accepts any JSON value and stores a clone of it.
    ///
    /// # Errors
    ///
    /// * [`IoError::ConversionError`] if the JSON value does not match the
    ///   field type, or an integer does not fit it.
    /// * [`IoError::ParseError`] if a date or timestamp string cannot be parsed.
    pub fn from_json_strict(value: &serde_json::Value, field_type: &FieldType) -> Result<Self> {
        match (field_type, value) {
            (_, serde_json::Value::Null) => Ok(TypedValue::Null),
            (FieldType::Boolean, serde_json::Value::Bool(b)) => Ok(TypedValue::Boolean(*b)),
            (ft, serde_json::Value::Number(n)) if ft.is_integer() => {
                Self::integer_from_json(n, ft)
            }
            (ft, serde_json::Value::Number(n)) if ft.is_float() => {
                let f = n.as_f64().ok_or_else(|| {
                    IoError::ConversionError(format!("Cannot convert {n} to float"))
                })?;
                Ok(TypedValue::Float(f))
            }
            (FieldType::Utf8, serde_json::Value::String(s)) => Ok(TypedValue::Utf8(s.clone())),
            (FieldType::Date, serde_json::Value::String(s)) => {
                let date = s.parse::<NaiveDate>().map_err(|e| {
                    IoError::ParseError(format!("Cannot parse date '{}': {}", s, e))
                })?;
                Ok(TypedValue::Date(date))
            }
            (FieldType::Timestamp, serde_json::Value::String(s)) => {
                let ts = s.parse::<DateTime<Utc>>().map_err(|e| {
                    IoError::ParseError(format!("Cannot parse timestamp '{}': {}", s, e))
                })?;
                Ok(TypedValue::Timestamp(ts))
            }
            (FieldType::Json, v) => Ok(TypedValue::Json(v.clone())),
            (ft, v) => Err(IoError::ConversionError(format!(
                "Cannot convert JSON {} to type {}",
                v,
                ft.type_name()
            ))),
        }
    }

    /// Converts a JSON number into an integer value that fits `field_type`.
    ///
    /// Signed types produce `TypedValue::Int`, unsigned types produce
    /// `TypedValue::UInt`; anything outside the declared width is an error.
    fn integer_from_json(n: &serde_json::Number, field_type: &FieldType) -> Result<Self> {
        // A number stored as a float is "not an integer"; an integral number
        // that simply does not fit gets a range error naming the target type.
        let failure = || {
            if n.is_f64() {
                IoError::ConversionError(format!("Cannot convert {n} to integer"))
            } else {
                IoError::ConversionError(format!(
                    "Value {n} is out of range for type {}",
                    field_type.type_name()
                ))
            }
        };
        let signed = |min: i64, max: i64| -> Result<Self> {
            match n.as_i64() {
                Some(i) if (min..=max).contains(&i) => Ok(TypedValue::Int(i)),
                _ => Err(failure()),
            }
        };
        // `as_u64` is `None` for negative numbers, which rejects them here.
        let unsigned = |max: u64| -> Result<Self> {
            match n.as_u64() {
                Some(u) if u <= max => Ok(TypedValue::UInt(u)),
                _ => Err(failure()),
            }
        };

        match field_type {
            FieldType::Int8 => signed(i8::MIN.into(), i8::MAX.into()),
            FieldType::Int16 => signed(i16::MIN.into(), i16::MAX.into()),
            FieldType::Int32 => signed(i32::MIN.into(), i32::MAX.into()),
            FieldType::Int64 => signed(i64::MIN, i64::MAX),
            FieldType::UInt32 => unsigned(u32::MAX.into()),
            FieldType::UInt64 => unsigned(u64::MAX),
            // Only reachable if a caller passes a non-integer type.
            other => Err(IoError::ConversionError(format!(
                "Cannot convert JSON {} to type {}",
                n,
                other.type_name()
            ))),
        }
    }

    /// Returns the [`FieldType`] that corresponds to this value's variant.
    ///
    /// Integers and floats map to their widest types (`Int64`, `UInt64`,
    /// `Float64`); `Null` carries no type information and maps to `Json`.
    pub fn inferred_type(&self) -> FieldType {
        match self {
            TypedValue::Null => FieldType::Json,
            TypedValue::Boolean(_) => FieldType::Boolean,
            TypedValue::Int(_) => FieldType::Int64,
            TypedValue::UInt(_) => FieldType::UInt64,
            TypedValue::Float(_) => FieldType::Float64,
            TypedValue::Utf8(_) => FieldType::Utf8,
            TypedValue::Date(_) => FieldType::Date,
            TypedValue::Timestamp(_) => FieldType::Timestamp,
            TypedValue::Binary(_) => FieldType::Binary,
            TypedValue::Json(_) => FieldType::Json,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a schema holding exactly one field with default settings.
    fn single_field_schema(name: &str, field_type: FieldType) -> Schema {
        Schema::builder()
            .field(SchemaField::new(name, field_type))
            .build()
    }

    /// A schema assembled with the builder survives a JSON round trip.
    #[test]
    fn test_schema_builder_roundtrip() {
        let id = SchemaField::new("id", FieldType::Int64).not_nullable();
        let label = SchemaField::new("label", FieldType::Utf8).nullable();
        let schema = Schema::builder()
            .name("test_table")
            .field(id)
            .field(label)
            .build();

        assert_eq!(schema.len(), 2);
        assert_eq!(schema.name, Some("test_table".to_string()));

        let json = schema.to_json().expect("to_json failed");
        let restored = Schema::from_json(&json).expect("from_json failed");
        assert_eq!(schema, restored);
    }

    /// Scalar, list and decimal types render their expected names.
    #[test]
    fn test_field_type_display() {
        let list_of_int32 = FieldType::List(Box::new(FieldType::Int32));
        let decimal = FieldType::Decimal {
            precision: 10,
            scale: 4,
        };

        assert_eq!(FieldType::Float64.type_name(), "float64");
        assert_eq!(list_of_int32.type_name(), "list<int32>");
        assert_eq!(decimal.type_name(), "decimal(10,4)");
    }

    /// Merging schemas with disjoint field names appends the new field.
    #[test]
    fn test_schema_merge() {
        let mut target = single_field_schema("id", FieldType::Int64);
        let extra = single_field_schema("value", FieldType::Float64);

        target.merge(&extra).expect("merge failed");
        assert_eq!(target.len(), 2);
    }

    /// Merging a same-named field with a different type is rejected.
    #[test]
    fn test_schema_merge_type_conflict() {
        let mut target = single_field_schema("id", FieldType::Int64);
        let conflicting = single_field_schema("id", FieldType::Utf8);

        assert!(target.merge(&conflicting).is_err());
    }

    /// Strict conversion maps a JSON integer and a JSON string to typed values.
    #[test]
    fn test_typed_value_from_json_strict() {
        let number = serde_json::json!(42i64);
        let as_int = TypedValue::from_json_strict(&number, &FieldType::Int64).unwrap();
        assert_eq!(as_int, TypedValue::Int(42));

        let text = serde_json::json!("hello");
        let as_text = TypedValue::from_json_strict(&text, &FieldType::Utf8).unwrap();
        assert_eq!(as_text, TypedValue::Utf8("hello".to_string()));
    }
}