// ie-schema 0.1.5
//
// A flexible schema specification and parser for information extraction tasks.
// Documentation
use crate::ingest::{
    IngestDType, IngestJsonNameKeyedStructure, IngestJsonStructure, IngestJsonStructureList,
    IngestSchema, IngestStructureProperty,
};
use serde_json_schema::Schema;
use serde_json_schema::property::{Property, PropertyInstance};
use std::collections::{BTreeMap, BTreeSet};

/// Errors raised while loading a JSON Schema document or converting it into
/// an ingest schema.
#[derive(Debug, thiserror::Error)]
pub enum JSONSchemaLoadError {
    /// The input could not be parsed; wraps the underlying
    /// `serde_json_schema` error, which `#[from]` also makes convertible
    /// into this variant.
    #[error("json schema parse error: {0}")]
    Parse(#[from] serde_json_schema::error::Error),
    /// The schema could not be converted; carries a human-readable message
    /// describing what was unsupported or what failed.
    #[error("json schema conversion error: {0}")]
    Convert(String),
}

impl From<serde_json::Error> for JSONSchemaLoadError {
    fn from(e: serde_json::Error) -> Self {
        Self::Parse(e.into())
    }
}

/// Newtype wrapper around a parsed JSON [`Schema`].
///
/// The wrapped schema is public, and the wrapper can be turned into an
/// [`IngestSchema`] through its `TryFrom` implementation.
#[derive(Debug, Clone)]
pub struct JSONSchemaIngestSchema(pub Schema);

impl JSONSchemaIngestSchema {
    /// Parse JSON Schema from UTF-8 JSON bytes without an intermediate [`serde_json::Value`].
    pub fn from_json_utf8(bytes: &[u8]) -> Result<Self, JSONSchemaLoadError> {
        Ok(Self(serde_json::from_slice(bytes)?))
    }

    pub fn from_json_str(s: &str) -> Result<Self, JSONSchemaLoadError> {
        Self::from_json_utf8(s.as_bytes())
    }
}

/// Map a concrete JSON Schema property instance onto an ingest dtype.
///
/// Scalar kinds become `IngestDType::String` carrying a short type name
/// (`"str"`, `"int"`, `"float"`, `"bool"`). A `Null` instance yields
/// `Ok(None)`.
///
/// # Errors
///
/// Returns a message when the instance is an object or an array, which this
/// conversion does not support.
fn infer_dtype_from_property_instance(
    pi: &PropertyInstance,
) -> Result<Option<IngestDType>, String> {
    // Handle the non-scalar outcomes up front; what remains is a type name.
    let type_name = match pi {
        PropertyInstance::Null => return Ok(None),
        PropertyInstance::Object { .. } => {
            return Err("object properties are not supported in this conversion".to_string());
        }
        PropertyInstance::Array { .. } => {
            return Err("array properties are not supported in this conversion".to_string());
        }
        PropertyInstance::String => "str",
        PropertyInstance::Integer { .. } => "int",
        PropertyInstance::Number { .. } => "float",
        PropertyInstance::Boolean => "bool",
    };

    Ok(Some(IngestDType::String(type_name.into())))
}

/// Resolve a property to its ingest dtype.
///
/// An inline value is converted directly. A reference is first dereferenced
/// against `schema`, and the instance it points to is converted.
///
/// # Errors
///
/// Returns a message when a reference cannot be resolved, or when the
/// resolved instance is rejected by [`infer_dtype_from_property_instance`].
fn infer_dtype_from_property(
    prop: &Property,
    schema: &Schema,
) -> Result<Option<IngestDType>, String> {
    match prop {
        Property::Value(instance) => infer_dtype_from_property_instance(instance),
        Property::Ref(reference) => match reference.deref(schema) {
            Some(target) => infer_dtype_from_property_instance(target),
            None => Err("unresolvable property reference".to_string()),
        },
    }
}

impl TryFrom<JSONSchemaIngestSchema> for IngestSchema {
    type Error = JSONSchemaLoadError;

    fn try_from(v: JSONSchemaIngestSchema) -> Result<Self, Self::Error> {
        let schema = v.0;
        let props = schema.properties().ok_or_else(|| {
            JSONSchemaLoadError::Convert(
                "root JSON Schema must be an object with 'properties'".to_string(),
            )
        })?;
        let required: BTreeSet<String> = schema
            .required_properties()
            .cloned()
            .unwrap_or_default()
            .into_iter()
            .collect();

        let mut structure_props = BTreeMap::new();
        for (name, prop) in props {
            let dtype = infer_dtype_from_property(prop, &schema).map_err(|message| {
                JSONSchemaLoadError::Convert(format!(
                    "unsupported JSON Schema for property {name:?}: {message}"
                ))
            })?;
            let description = if required.contains(name) {
                Some("required field".to_string())
            } else {
                None
            };

            structure_props.insert(
                name.clone(),
                IngestStructureProperty {
                    choices: None,
                    description,
                    value: None,
                    dtype,
                    validator: None,
                    threshold: None,
                },
            );
        }

        let structure_name = schema
            .id()
            .map(|id| id.to_string())
            .unwrap_or_else(|| "Model".to_string());
        let props_value = serde_json::to_value(structure_props).map_err(|e| {
            JSONSchemaLoadError::Convert(format!(
                "failed to materialize converted structure properties: {e}"
            ))
        })?;
        let mut keyed = BTreeMap::new();
        keyed.insert(structure_name, props_value);
        Ok(IngestSchema {
            entities: None,
            json_structures: Some(IngestJsonStructureList::Single(
                IngestJsonStructure::JsonNameKeyedStructure(IngestJsonNameKeyedStructure(keyed)),
            )),
            classifications: None,
            relations: None,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `raw` as a JSON Schema (panicking if parsing fails) and attempt
    /// the conversion into an [`IngestSchema`].
    fn convert(raw: &str) -> Result<IngestSchema, JSONSchemaLoadError> {
        let parsed = JSONSchemaIngestSchema::from_json_str(raw).unwrap();
        IngestSchema::try_from(parsed)
    }

    /// The `$id` names the structure and required properties are flagged in
    /// their description.
    #[test]
    fn json_schema_uses_id_and_required() {
        let raw = r#"
        {
            "$id": "BusinessRecord",
            "type": "object",
            "required": ["business_name"],
            "properties": {
                "business_name": { "type": "string" },
                "status": { "type": "string" }
            }
        }
        "#;

        let ingest = convert(raw).unwrap();
        let keyed = match ingest.json_structures.unwrap() {
            IngestJsonStructureList::Single(IngestJsonStructure::JsonNameKeyedStructure(keyed)) => {
                keyed
            }
            _ => panic!("expected one name-keyed structure"),
        };

        let entry = keyed.0.get("BusinessRecord").expect("businessrecord entry");
        let props: BTreeMap<String, IngestStructureProperty> =
            serde_json::from_value(entry.clone()).expect("properties");

        let business_name = &props["business_name"];
        assert_eq!(business_name.description.as_deref(), Some("required field"));
        let status = &props["status"];
        assert_eq!(status.description, None);
    }

    /// Nested object properties must surface as a conversion error.
    #[test]
    fn json_schema_rejects_non_scalar_property_types() {
        let raw = r#"
        {
            "type": "object",
            "properties": {
                "nested": { "type": "object", "properties": { "a": { "type": "string" } } }
            }
        }
        "#;

        let err = convert(raw).expect_err("nested object should be rejected");
        if let JSONSchemaLoadError::Convert(msg) = &err {
            assert!(msg.contains("unsupported JSON Schema"));
        } else {
            panic!("unexpected error: {err:?}");
        }
    }
}