use crate::ingest::{
IngestDType, IngestJsonNameKeyedStructure, IngestJsonStructure, IngestJsonStructureList,
IngestSchema, IngestStructureProperty,
};
use serde_json_schema::Schema;
use serde_json_schema::property::{Property, PropertyInstance};
use std::collections::{BTreeMap, BTreeSet};
/// Errors that can occur while loading a JSON Schema and converting it for ingestion.
#[derive(Debug, thiserror::Error)]
pub enum JSONSchemaLoadError {
/// The input could not be parsed; wraps the error reported by `serde_json_schema`.
#[error("json schema parse error: {0}")]
Parse(#[from] serde_json_schema::error::Error),
/// The parsed schema could not be converted; the payload is a human-readable reason.
#[error("json schema conversion error: {0}")]
Convert(String),
}
impl From<serde_json::Error> for JSONSchemaLoadError {
fn from(e: serde_json::Error) -> Self {
Self::Parse(e.into())
}
}
/// Newtype around a parsed `serde_json_schema` [`Schema`]; the wrapped schema is public.
#[derive(Debug, Clone)]
pub struct JSONSchemaIngestSchema(pub Schema);
impl JSONSchemaIngestSchema {
pub fn from_json_utf8(bytes: &[u8]) -> Result<Self, JSONSchemaLoadError> {
Ok(Self(serde_json::from_slice(bytes)?))
}
pub fn from_json_str(s: &str) -> Result<Self, JSONSchemaLoadError> {
Self::from_json_utf8(s.as_bytes())
}
}
/// Maps a scalar JSON Schema property instance to an ingest dtype.
///
/// * `String`, `Integer`, `Number` and `Boolean` become `IngestDType::String` carrying
///   `"str"`, `"int"`, `"float"` and `"bool"` respectively.
/// * `Null` yields `Ok(None)` (no dtype).
/// * `Object` and `Array` are rejected with an explanatory error message.
fn infer_dtype_from_property_instance(
    pi: &PropertyInstance,
) -> Result<Option<IngestDType>, String> {
    // Pick the dtype name for the supported scalar kinds; every other kind returns early.
    let type_name = match pi {
        PropertyInstance::String => "str",
        PropertyInstance::Integer { .. } => "int",
        PropertyInstance::Number { .. } => "float",
        PropertyInstance::Boolean => "bool",
        PropertyInstance::Null => return Ok(None),
        PropertyInstance::Object { .. } => {
            return Err("object properties are not supported in this conversion".to_string());
        }
        PropertyInstance::Array { .. } => {
            return Err("array properties are not supported in this conversion".to_string());
        }
    };
    Ok(Some(IngestDType::String(type_name.into())))
}
/// Determines the ingest dtype of a property, following a reference if necessary.
///
/// Inline values are handed straight to [`infer_dtype_from_property_instance`]. References
/// are first resolved against `schema`; a reference that cannot be resolved produces an
/// error message instead of a dtype.
fn infer_dtype_from_property(
    prop: &Property,
    schema: &Schema,
) -> Result<Option<IngestDType>, String> {
    match prop {
        Property::Value(instance) => infer_dtype_from_property_instance(instance),
        Property::Ref(reference) => match reference.deref(schema) {
            Some(target) => infer_dtype_from_property_instance(target),
            None => Err("unresolvable property reference".to_string()),
        },
    }
}
impl TryFrom<JSONSchemaIngestSchema> for IngestSchema {
type Error = JSONSchemaLoadError;
fn try_from(v: JSONSchemaIngestSchema) -> Result<Self, Self::Error> {
let schema = v.0;
let props = schema.properties().ok_or_else(|| {
JSONSchemaLoadError::Convert(
"root JSON Schema must be an object with 'properties'".to_string(),
)
})?;
let required: BTreeSet<String> = schema
.required_properties()
.cloned()
.unwrap_or_default()
.into_iter()
.collect();
let mut structure_props = BTreeMap::new();
for (name, prop) in props {
let dtype = infer_dtype_from_property(prop, &schema).map_err(|message| {
JSONSchemaLoadError::Convert(format!(
"unsupported JSON Schema for property {name:?}: {message}"
))
})?;
let description = if required.contains(name) {
Some("required field".to_string())
} else {
None
};
structure_props.insert(
name.clone(),
IngestStructureProperty {
choices: None,
description,
value: None,
dtype,
validator: None,
threshold: None,
},
);
}
let structure_name = schema
.id()
.map(|id| id.to_string())
.unwrap_or_else(|| "Model".to_string());
let props_value = serde_json::to_value(structure_props).map_err(|e| {
JSONSchemaLoadError::Convert(format!(
"failed to materialize converted structure properties: {e}"
))
})?;
let mut keyed = BTreeMap::new();
keyed.insert(structure_name, props_value);
Ok(IngestSchema {
entities: None,
json_structures: Some(IngestJsonStructureList::Single(
IngestJsonStructure::JsonNameKeyedStructure(IngestJsonNameKeyedStructure(keyed)),
)),
classifications: None,
relations: None,
})
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `raw` as a JSON Schema (panicking if parsing fails) and runs the conversion.
    fn convert(raw: &str) -> Result<IngestSchema, JSONSchemaLoadError> {
        let parsed = JSONSchemaIngestSchema::from_json_str(raw).unwrap();
        IngestSchema::try_from(parsed)
    }

    /// The schema id becomes the structure key and required properties are annotated.
    #[test]
    fn json_schema_uses_id_and_required() {
        let raw = r#"
{
"$id": "BusinessRecord",
"type": "object",
"required": ["business_name"],
"properties": {
"business_name": { "type": "string" },
"status": { "type": "string" }
}
}
"#;
        let ingest = convert(raw).unwrap();
        let structures = ingest.json_structures.unwrap();
        let IngestJsonStructureList::Single(IngestJsonStructure::JsonNameKeyedStructure(keyed)) =
            structures
        else {
            panic!("expected one name-keyed structure");
        };

        let entry = keyed.0.get("BusinessRecord").expect("businessrecord entry");
        let decoded: BTreeMap<String, IngestStructureProperty> =
            serde_json::from_value(entry.clone()).expect("properties");

        assert_eq!(
            decoded["business_name"].description.as_deref(),
            Some("required field")
        );
        assert_eq!(decoded["status"].description, None);
    }

    /// A nested object property must be reported as a conversion error.
    #[test]
    fn json_schema_rejects_non_scalar_property_types() {
        let raw = r#"
{
"type": "object",
"properties": {
"nested": { "type": "object", "properties": { "a": { "type": "string" } } }
}
}
"#;
        match convert(raw).expect_err("nested object should be rejected") {
            JSONSchemaLoadError::Convert(message) => {
                assert!(message.contains("unsupported JSON Schema"));
            }
            unexpected => panic!("unexpected error: {unexpected:?}"),
        }
    }
}