ie-schema 0.1.5

A flexible schema specification and parser for information extraction tasks.
Documentation
use crate::normalized::{
    Classification, DType, EntitySpec, ExpandedName, JsonStructure, NamedStructure,
    NormalizedSchema, Relation, RelationAcquired, StructureProperty, Validator,
};
use serde::Serialize;
use std::collections::BTreeMap;
use std::convert::TryFrom;

// Expanded schema.
//
// Goals:
// - slug-only names
// - defaults filled
// - no top-level unions
// - shared concepts use one canonical representation
//
// (Kept as a plain comment: an outer `///` doc here would attach to the
// `Description` alias below rather than to the module.)

/// Free-form description text; used for the optional `description` fields of
/// the expanded entity, structure-property and relation types in this file.
pub type Description = String;
/// String alias for a regular-expression pattern.
///
/// NOTE(review): not referenced anywhere else in the visible part of this
/// file — presumably consumed by validator-related code; confirm.
pub type Regex = String;
/// Numeric threshold (`f64`); used for the optional `threshold` fields of the
/// expanded entity, structure-property and classification types in this file.
///
/// NOTE(review): the valid range and exact meaning of the value are not
/// established in this file — confirm against the consumer.
pub type Threshold = f64;

/// A single entity definition in expanded form.
///
/// Built from a normalized `EntitySpec` by `entity2_to_3`, which moves every
/// field across unchanged. Besides the top-level entity list, this type is also
/// used for structure-property choices, classification tasks/labels and the
/// head/tail of relations.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct ExpandedEntity {
    /// Name of the entity.
    pub name: ExpandedName,
    /// Optional data type of the entity.
    pub dtype: Option<DType>,
    /// Optional validator attached to the entity.
    pub validator: Option<Validator>,
    /// Optional numeric threshold (`f64`).
    pub threshold: Option<Threshold>,
    /// Optional free-text description.
    pub description: Option<Description>,
}

/// One property of an [`ExpandedJsonStructure`].
///
/// Built either from a normalized `StructureProperty` (all fields carried
/// over, choices expanded one by one) or from a bare `EntitySpec` that appeared
/// in an entity list (in which case `choices` is empty and `value` is `None`).
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct ExpandedStructureProperty {
    /// Entities listed as choices for this property; empty when the property
    /// was derived from a bare entity spec.
    pub choices: Vec<ExpandedEntity>,
    /// Optional free-text description.
    pub description: Option<Description>,
    /// Optional string value carried over from the normalized property;
    /// always `None` for properties derived from a bare entity spec.
    pub value: Option<String>,
    /// Optional data type of the property.
    pub dtype: Option<DType>,
    /// Optional validator attached to the property.
    pub validator: Option<Validator>,
    /// Optional numeric threshold (`f64`).
    pub threshold: Option<Threshold>,
}

/// A JSON structure in expanded form: a name plus a map of named properties.
///
/// Every normalized `JsonStructure` variant is flattened into this single
/// shape by `json_structure2_to_3`.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct ExpandedJsonStructure {
    /// Name of the structure. Structures that came from a bare entity list
    /// receive the synthetic name `unnamed_{index}`.
    pub name: ExpandedName,
    /// Properties keyed by property name (ordered, since this is a `BTreeMap`).
    pub props: BTreeMap<ExpandedName, ExpandedStructureProperty>,
}

/// A classification task in expanded form.
///
/// Built from a normalized `Classification` by `classification2_to_3`; the
/// task, every label and every label description are expanded as entities,
/// while `threshold` and `multi_label` are copied as-is.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct ExpandedClassification {
    /// The classification task itself, represented as an entity.
    pub task: ExpandedEntity,
    /// The labels of the task, each represented as an entity.
    pub labels: Vec<ExpandedEntity>,
    /// Optional numeric threshold (`f64`) for the classification as a whole.
    pub threshold: Option<Threshold>,
    /// Multi-label flag copied from the normalized classification.
    /// NOTE(review): presumably "more than one label may be assigned" — confirm.
    pub multi_label: bool,
    /// Per-name entity entries carried over from the normalized
    /// `label_descriptions` map.
    /// NOTE(review): keys are presumably label names — confirm.
    pub label_descriptions: BTreeMap<ExpandedName, ExpandedEntity>,
}

/// A relation in expanded form.
///
/// Each variant mirrors one `RelationAcquired` form of the normalized
/// relation; a normalized relation with no acquired form cannot be represented
/// here and is rejected by `relation2_to_3`.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub enum ExpandedRelation {
    /// Relation whose acquired form is `RelationAcquired::Empty`: only a name
    /// and an optional description, no head/tail entities.
    EmptyAcquired {
        /// Name of the relation.
        name: ExpandedName,
        /// Optional free-text description.
        description: Option<Description>,
    },
    /// Relation whose acquired form is `RelationAcquired::Entity`: carries the
    /// expanded head and tail entities.
    EntityAcquired {
        /// Name of the relation.
        name: ExpandedName,
        /// Optional free-text description.
        description: Option<Description>,
        /// Head entity of the relation.
        head: Box<ExpandedEntity>,
        /// Tail entity of the relation.
        tail: Box<ExpandedEntity>,
    },
}

/// The fully expanded schema: four flat lists, one per kind of item.
///
/// Obtained from a `NormalizedSchema` through the `TryFrom` implementation in
/// this file. The derived `Default` yields a schema with all four lists empty.
#[derive(Debug, Clone, PartialEq, Serialize, Default)]
pub struct ExpandedSchema {
    /// Top-level entities.
    pub entities: Vec<ExpandedEntity>,
    /// JSON structures, in the same order as in the normalized schema.
    pub json_structures: Vec<ExpandedJsonStructure>,
    /// Relations, in the same order as in the normalized schema.
    pub relations: Vec<ExpandedRelation>,
    /// Classification tasks.
    pub classifications: Vec<ExpandedClassification>,
}

/// Errors that can occur while expanding a `NormalizedSchema` into an
/// [`ExpandedSchema`].
#[derive(Debug, thiserror::Error)]
pub enum SchemaExpandError {
    /// A relation had no acquired form (`acquired` was `None`), so neither
    /// [`ExpandedRelation`] variant can represent it.
    ///
    /// `index` is the position of the relation in the normalized schema's
    /// relation list and `name` is the relation's name rendered as a string.
    #[error("relation without acquired form cannot be expanded at index {index}: {name}")]
    RelationWithoutAcquired { index: usize, name: String },
}

fn entity2_to_3(v: EntitySpec) -> ExpandedEntity {
    ExpandedEntity {
        name: v.name,
        dtype: v.dtype,
        validator: v.validator,
        threshold: v.threshold,
        description: v.description,
    }
}

fn entity_spec_to_structure_property(spec: EntitySpec) -> ExpandedStructureProperty {
    ExpandedStructureProperty {
        choices: Vec::new(),
        description: spec.description,
        value: None,
        dtype: spec.dtype,
        validator: spec.validator,
        threshold: spec.threshold,
    }
}

fn structure_property2_to_3(v: StructureProperty) -> ExpandedStructureProperty {
    ExpandedStructureProperty {
        choices: v.choices.into_iter().map(entity2_to_3).collect(),
        description: v.description,
        value: v.value,
        dtype: v.dtype,
        validator: v.validator,
        threshold: v.threshold,
    }
}

fn json_structure2_to_3(
    v: JsonStructure,
    index: usize,
) -> Result<ExpandedJsonStructure, SchemaExpandError> {
    match v {
        JsonStructure::NamedStructure(NamedStructure { name, props }) => {
            Ok(ExpandedJsonStructure {
                name,
                props: props
                    .into_iter()
                    .map(|(k, v)| (k, structure_property2_to_3(v)))
                    .collect(),
            })
        }
        JsonStructure::NameKeyedStructure { name, props } => Ok(ExpandedJsonStructure {
            name,
            props: props
                .into_iter()
                .map(|(k, v)| (k, structure_property2_to_3(v)))
                .collect(),
        }),
        JsonStructure::EntityList(list) => {
            let props = list
                .into_iter()
                .map(|spec| {
                    let key = spec.name.clone();
                    (key, entity_spec_to_structure_property(spec))
                })
                .collect();
            Ok(ExpandedJsonStructure {
                name: ExpandedName::new(format!("unnamed_{index}")),
                props,
            })
        }
    }
}

fn classification2_to_3(v: Classification) -> ExpandedClassification {
    ExpandedClassification {
        task: entity2_to_3(v.task),
        labels: v.labels.into_iter().map(entity2_to_3).collect(),
        threshold: v.threshold,
        multi_label: v.multi_label,
        label_descriptions: v
            .label_descriptions
            .into_iter()
            .map(|(k, v)| (k, entity2_to_3(v)))
            .collect(),
    }
}

fn relation2_to_3(v: Relation, index: usize) -> Result<ExpandedRelation, SchemaExpandError> {
    match v.acquired {
        Some(RelationAcquired::Empty) => Ok(ExpandedRelation::EmptyAcquired {
            name: v.name,
            description: v.description,
        }),
        Some(RelationAcquired::Entity { head, tail }) => Ok(ExpandedRelation::EntityAcquired {
            name: v.name,
            description: v.description,
            head: Box::new(entity2_to_3(*head)),
            tail: Box::new(entity2_to_3(*tail)),
        }),
        None => Err(SchemaExpandError::RelationWithoutAcquired {
            index,
            name: v.name.to_string(),
        }),
    }
}

impl TryFrom<NormalizedSchema> for ExpandedSchema {
    type Error = SchemaExpandError;

    fn try_from(v: NormalizedSchema) -> Result<Self, Self::Error> {
        let entities = v.entities.into_iter().map(entity2_to_3).collect();

        let json_structures = v
            .json_structures
            .into_iter()
            .enumerate()
            .map(|(i, js)| json_structure2_to_3(js, i))
            .collect::<Result<Vec<_>, _>>()?;

        let relations = v
            .relations
            .into_iter()
            .enumerate()
            .map(|(i, rel)| relation2_to_3(rel, i))
            .collect::<Result<Vec<_>, _>>()?;

        let classifications = v
            .classifications
            .into_iter()
            .map(classification2_to_3)
            .collect();

        Ok(Self {
            entities,
            json_structures,
            relations,
            classifications,
        })
    }
}

// Unit tests for the normalized -> expanded conversion. Every test parses a
// JSON schema with `NormalizedSchema::from_json_str` and then expands it with
// `ExpandedSchema::try_from`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::normalized::NormalizedSchema;

    /// A schema with one named structure and one head/tail relation expands to
    /// one structure and one relation; the structure given as "Patient Record"
    /// is expected under the name `patient_record`.
    #[test]
    fn expanded_expands_named_structure() {
        let s = r#"
        {
            "json_structures": [
                {
                    "name": "Patient Record",
                    "id": { "description": "identifier", "dtype": "str" }
                }
            ],
            "relations": [
                { "contains": { "head": "patient", "tail": "record" } }
            ]
        }
        "#;

        let s2 = NormalizedSchema::from_json_str(s).unwrap();
        let s3 = ExpandedSchema::try_from(s2).unwrap();

        assert_eq!(s3.json_structures.len(), 1);
        assert_eq!(s3.relations.len(), 1);
        assert_eq!(s3.json_structures[0].name.as_str(), "patient_record");
    }

    /// A relation given only as a bare name normalizes successfully but must
    /// fail to expand.
    #[test]
    fn expanded_rejects_relation_without_acquired_form() {
        let s = r#"
        {
            "relations": ["interacts_with"]
        }
        "#;

        let s2 = NormalizedSchema::from_json_str(s).unwrap();
        let err = ExpandedSchema::try_from(s2).unwrap_err();

        // The error enum has a single variant, so this match is exhaustive;
        // the payload (index / name) is not inspected here.
        match err {
            SchemaExpandError::RelationWithoutAcquired { .. } => {}
        }
    }

    /// A bare entity list becomes a structure named `unnamed_0` whose
    /// properties are keyed by entity name and keep each entity's dtype and
    /// threshold; no top-level entities are created.
    #[test]
    fn expanded_entity_list_becomes_unnamed_structure() {
        let s = r#"
        {
            "json_structures": [
                ["gene::str", "score::float::0.9"]
            ]
        }
        "#;

        let s2 = NormalizedSchema::from_json_str(s).unwrap();
        let s3 = ExpandedSchema::try_from(s2).unwrap();

        assert_eq!(s3.json_structures.len(), 1);
        assert_eq!(s3.json_structures[0].name.as_str(), "unnamed_0");
        assert_eq!(s3.entities.len(), 0);

        let props = &s3.json_structures[0].props;
        assert_eq!(props.len(), 2);

        let gene = props.get(&ExpandedName::new("gene".to_string())).unwrap();
        assert_eq!(gene.dtype, Some(DType::String));
        assert_eq!(gene.threshold, None);

        let score = props.get(&ExpandedName::new("score".to_string())).unwrap();
        assert_eq!(score.dtype, Some(DType::Float));
        assert_eq!(score.threshold, Some(0.9));
    }

    /// Several entity lists are named after their position in the
    /// `json_structures` array: `unnamed_0`, `unnamed_1`, `unnamed_2`.
    #[test]
    fn expanded_multiple_entity_lists_get_sequential_names() {
        let s = r#"
        {
            "json_structures": [
                ["a::str"],
                ["b::float"],
                ["c::bool"]
            ]
        }
        "#;

        let s2 = NormalizedSchema::from_json_str(s).unwrap();
        let s3 = ExpandedSchema::try_from(s2).unwrap();

        assert_eq!(s3.json_structures.len(), 3);
        assert_eq!(s3.json_structures[0].name.as_str(), "unnamed_0");
        assert_eq!(s3.json_structures[1].name.as_str(), "unnamed_1");
        assert_eq!(s3.json_structures[2].name.as_str(), "unnamed_2");
    }

    /// Entity lists and named structures can be mixed: the list keeps its
    /// positional `unnamed_0` name, the named structure keeps its own name,
    /// and top-level entities are unaffected.
    #[test]
    fn expanded_mixed_entity_list_and_named_structure() {
        let s = r#"
        {
            "entities": ["gene::str"],
            "json_structures": [
                ["patient::str"],
                {
                    "name": "Patient Record",
                    "id": { "dtype": "str" }
                }
            ]
        }
        "#;

        let s2 = NormalizedSchema::from_json_str(s).unwrap();
        let s3 = ExpandedSchema::try_from(s2).unwrap();

        assert_eq!(s3.entities.len(), 1);
        assert_eq!(s3.entities[0].name.as_str(), "gene");
        assert_eq!(s3.json_structures.len(), 2);
        assert_eq!(s3.json_structures[0].name.as_str(), "unnamed_0");
        assert_eq!(s3.json_structures[1].name.as_str(), "patient_record");
    }

    /// Entities that appear inside an entity list stay inside the structure's
    /// properties; they are not added to the top-level entity list, even when
    /// a top-level entity of the same name exists.
    #[test]
    fn expanded_entity_list_entities_not_promoted_to_top_level() {
        let s = r#"
        {
            "entities": ["gene::str"],
            "json_structures": [
                ["gene::str", "score::float::0.9"]
            ]
        }
        "#;

        let s2 = NormalizedSchema::from_json_str(s).unwrap();
        let s3 = ExpandedSchema::try_from(s2).unwrap();

        assert_eq!(s3.entities.len(), 1);
        assert_eq!(s3.entities[0].name.as_str(), "gene");
        assert_eq!(s3.json_structures.len(), 1);

        let props = &s3.json_structures[0].props;
        assert!(props.contains_key(&ExpandedName::new("gene".to_string())));
        assert!(props.contains_key(&ExpandedName::new("score".to_string())));
    }
}