Skip to main content

ie_schema/
normalized.rs

1use crate::ingest::{
2    IngestClassification, IngestDType, IngestEntity, IngestEntityAcquired, IngestEntityList,
3    IngestEntityProperty, IngestJsonNameKeyedStructure, IngestJsonStructure,
4    IngestJsonStructureList, IngestNamedStructure, IngestRelation, IngestSchema,
5    IngestStructureProperties, IngestStructureProperty, IngestValidator, IngestValidatorMode,
6};
7use crate::json_schema::JSONSchemaIngestSchema;
8use serde::Serialize;
9use std::collections::{BTreeMap, BTreeSet};
10use std::convert::TryFrom;
11use std::fmt;
12
/// Description text attached to a schema item.
pub type Description = String;
/// Regular-expression pattern kept as plain text; nothing in this file compiles it.
pub type Regex = String;
/// Threshold value (an `f64`) carried over from the ingest schema.
pub type Threshold = f64;
16
17#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Default)]
18pub struct ExpandedName(String);
19
20impl ExpandedName {
21    pub fn new(s: String) -> Self {
22        Self(s)
23    }
24
25    pub fn as_str(&self) -> &str {
26        &self.0
27    }
28
29    pub fn into_inner(self) -> String {
30        self.0
31    }
32}
33
34impl fmt::Display for ExpandedName {
35    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
36        self.0.fmt(f)
37    }
38}
39
/// Data-type tag attached to entities and structure properties.
///
/// Produced by `dtype_1_to_2` (from an ingest dtype) or by `looks_like_dtype`
/// (from a textual alias such as `"str"` or `"integer"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
pub enum DType {
    String,
    Int,
    Float,
    Bool,
}
47
/// Mode of a [`Validator`], mirrored one-to-one from `IngestValidatorMode`.
///
/// NOTE(review): presumably selects partial vs. full matching of the pattern; the
/// consumer is not visible in this file, so confirm there.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
pub enum ValidatorMode {
    Partial,
    Full,
}
53
/// Normalized validator, built by `validator_1_to_2` from an ingest validator.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct Validator {
    /// Pattern text, copied verbatim from the ingest validator.
    pub pattern: Regex,
    /// `None` when the ingest validator was a bare pattern or did not specify a mode.
    pub mode: Option<ValidatorMode>,
    /// Defaults to `false` when the ingest validator did not set it.
    pub exclude: bool,
}
60
/// Normalized description of a single entity (also used for classification tasks,
/// labels, choices and relation endpoints in this file).
#[derive(Debug, Clone, PartialEq, Serialize, Default)]
pub struct EntitySpec {
    // TODO I realize here that we should keep the original name, and our ExpandedName (really our slug)
    // could be named slug or something
    /// Normalized (slug) form of the entity name; the raw name is not retained.
    pub name: ExpandedName,
    /// Optional data type; `None` when the input did not specify one.
    pub dtype: Option<DType>,
    /// Optional validator; only dict-style ingest entities can carry one.
    pub validator: Option<Validator>,
    /// Optional threshold taken from the input.
    pub threshold: Option<Threshold>,
    /// Optional description taken from the input.
    pub description: Option<Description>,
}
71
/// Normalized property of a JSON structure.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct StructureProperty {
    /// Allowed choices; empty when the input listed none.
    pub choices: Vec<EntitySpec>,
    /// Optional description copied from the input.
    pub description: Option<Description>,
    /// Optional `value` copied from the input; always `None` for properties derived
    /// from plain entity specs.
    pub value: Option<String>,
    /// Optional data type.
    pub dtype: Option<DType>,
    /// Optional validator.
    pub validator: Option<Validator>,
    /// Optional threshold.
    pub threshold: Option<Threshold>,
}
81
/// Structure whose name is given explicitly by a `name` field in the input.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct NamedStructure {
    /// Normalized structure name.
    pub name: ExpandedName,
    /// Properties keyed by their normalized names.
    pub props: BTreeMap<ExpandedName, StructureProperty>,
}
87
/// One normalized JSON structure, in any of the shapes the ingest format accepts.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub enum JsonStructure {
    /// Structure carrying its own `name` field.
    NamedStructure(NamedStructure),
    /// Structure that appeared as `name -> properties` in a name-keyed map; one value
    /// of this variant is produced per map entry.
    NameKeyedStructure {
        name: ExpandedName,
        props: BTreeMap<ExpandedName, StructureProperty>,
    },
    /// Plain list of entity specs.
    EntityList(Vec<EntitySpec>),
}
97
/// Normalized classification task.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct Classification {
    /// The task itself, normalized like any other entity.
    pub task: EntitySpec,
    /// Candidate labels.
    pub labels: Vec<EntitySpec>,
    /// Merged from the ingest `threshold` / `cls_threshold` aliases.
    pub threshold: Option<Threshold>,
    /// `false` when the input omitted it.
    pub multi_label: bool,
    /// Label descriptions keyed by the normalized form of their input key.
    pub label_descriptions: BTreeMap<ExpandedName, EntitySpec>,
}
106
/// Head/tail information attached to a relation.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub enum RelationAcquired {
    /// Both head and tail were given as empty strings.
    Empty,
    /// Head and tail entities, each normalized into an [`EntitySpec`].
    Entity {
        head: Box<EntitySpec>,
        tail: Box<EntitySpec>,
    },
}
115
/// Normalized relation.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct Relation {
    /// Normalized relation name.
    pub name: ExpandedName,
    /// Present only for the `name -> description` input form.
    pub description: Option<Description>,
    /// Present only for the `name -> { head, tail }` input form.
    pub acquired: Option<RelationAcquired>,
}
122
/// Fully normalized schema; built from an `IngestSchema` via `TryFrom` or from JSON
/// via `from_json_bytes` / `from_json_str`. Every section is empty when the input
/// omitted it.
#[derive(Debug, Clone, PartialEq, Serialize, Default)]
pub struct NormalizedSchema {
    pub entities: Vec<EntitySpec>,
    pub json_structures: Vec<JsonStructure>,
    pub classifications: Vec<Classification>,
    pub relations: Vec<Relation>,
}
130
/// Location inside the input document, used only to label errors.
///
/// Rendered as text starting at `$`, with `.name` for fields, `[n]` for list
/// positions and `["key"]` (debug-quoted) for map keys.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Path(String);

impl Path {
    /// The document root, rendered as `$`.
    pub fn root() -> Self {
        Path(String::from("$"))
    }

    /// Child path for a named field: `<self>.<name>`.
    pub fn field(&self, name: &str) -> Self {
        let mut child = self.0.clone();
        child.push('.');
        child.push_str(name);
        Path(child)
    }

    /// Child path for a list position: `<self>[<idx>]`.
    pub fn index(&self, idx: usize) -> Self {
        Path(format!("{base}[{idx}]", base = self.0))
    }

    /// Child path for a map key: `<self>["<key>"]`, with the key debug-escaped.
    pub fn key(&self, key: &str) -> Self {
        Path(format!("{base}[{key:?}]", base = self.0))
    }

    /// Borrows the rendered path text.
    pub fn as_str(&self) -> &str {
        self.0.as_str()
    }
}
155
/// Errors raised while converting an ingest schema into a [`NormalizedSchema`].
///
/// Every variant carries `path`, the rendered [`Path`] of the offending input location.
#[derive(Debug, thiserror::Error)]
pub enum SchemaNormalizeError {
    /// A name was empty after trimming, contained an unsupported character, or
    /// consisted only of separators.
    #[error("invalid name at {path}: {value:?}")]
    InvalidName { path: String, value: String },

    /// A map that must hold exactly one entry held `found` entries instead.
    #[error("expected exactly one key at {path}, found {found}")]
    ExpectedSingleKey { path: String, found: usize },

    /// A `::`-delimited entity string could not be parsed.
    #[error("invalid colon-delimited entity at {path}: {raw:?}")]
    InvalidColonDelimitedEntity { path: String, raw: String },

    /// A `::`-delimited entity string had a segment left over after threshold, dtype
    /// and description were all assigned.
    #[error("ambiguous colon-delimited entity at {path}: {raw:?}")]
    AmbiguousColonDelimitedEntity { path: String, raw: String },

    /// Both `threshold` and `cls_threshold` were given with different values.
    #[error(
        "conflicting threshold aliases at {path}: threshold={threshold:?}, cls_threshold={cls_threshold:?}"
    )]
    ConflictingThresholdAliases {
        path: String,
        threshold: Option<f64>,
        cls_threshold: Option<f64>,
    },

    /// Two distinct input names normalized to the same `name`.
    #[error("duplicate normalized name at {path}: {name}")]
    DuplicateNormalizedName { path: String, name: String },

    /// NOTE(review): not constructed anywhere in the code visible in this file.
    #[error("invalid empty-acquired relation at {path}")]
    InvalidEmptyAcquiredRelation { path: String },

    /// A description-only entity property was given without any name to attach it to.
    #[error("invalid structure properties at {path}")]
    InvalidStructureProperties { path: String },

    /// A textual dtype was not one of the recognized aliases.
    #[error("invalid dtype at {path}: {value:?}")]
    InvalidDType { path: String, value: String },

    /// Decoding a nested JSON value (or converting a JSON Schema root) failed;
    /// `message` is the underlying error rendered as text.
    #[error("nested decode failed at {path}: {message}")]
    NestedDecode { path: String, message: String },
}
194
195fn normalize_name(raw: &str, path: &Path) -> Result<ExpandedName, SchemaNormalizeError> {
196    let trimmed = raw.trim();
197    if trimmed.is_empty() {
198        return Err(SchemaNormalizeError::InvalidName {
199            path: path.as_str().to_string(),
200            value: raw.to_string(),
201        });
202    }
203
204    let mut out = String::new();
205    let mut prev_us = false;
206
207    for ch in trimmed.chars() {
208        match ch {
209            'a'..='z' | '0'..='9' => {
210                out.push(ch);
211                prev_us = false;
212            }
213            'A'..='Z' => {
214                out.push(ch.to_ascii_lowercase());
215                prev_us = false;
216            }
217            '_' | '-' | ' ' => {
218                if !prev_us && !out.is_empty() {
219                    out.push('_');
220                    prev_us = true;
221                }
222            }
223            _ => {
224                return Err(SchemaNormalizeError::InvalidName {
225                    path: path.as_str().to_string(),
226                    value: raw.to_string(),
227                });
228            }
229        }
230    }
231
232    while out.ends_with('_') {
233        out.pop();
234    }
235
236    if out.is_empty() {
237        return Err(SchemaNormalizeError::InvalidName {
238            path: path.as_str().to_string(),
239            value: raw.to_string(),
240        });
241    }
242
243    Ok(ExpandedName::new(out))
244}
245
246fn dtype_1_to_2(v: IngestDType, path: &Path) -> Result<DType, SchemaNormalizeError> {
247    match v {
248        IngestDType::String(s) => {
249            let Some(dt) = looks_like_dtype(&s) else {
250                return Err(SchemaNormalizeError::InvalidDType {
251                    path: path.as_str().to_string(),
252                    value: s,
253                });
254            };
255            Ok(dt)
256        }
257        IngestDType::Int(_) => Ok(DType::Int),
258        IngestDType::Float(_) => Ok(DType::Float),
259        IngestDType::Bool(_) => Ok(DType::Bool),
260    }
261}
262
263fn validator_1_to_2(v: IngestValidator) -> Validator {
264    match v {
265        IngestValidator::Regex(pattern) => Validator {
266            pattern,
267            mode: None,
268            exclude: false,
269        },
270        IngestValidator::Dict(d) => Validator {
271            pattern: d.pattern,
272            mode: d.mode.map(|m| match m {
273                IngestValidatorMode::Partial => ValidatorMode::Partial,
274                IngestValidatorMode::Full => ValidatorMode::Full,
275            }),
276            exclude: d.exclude.unwrap_or(false),
277        },
278    }
279}
280
281fn looks_like_dtype(s: &str) -> Option<DType> {
282    let x = s.trim().to_ascii_lowercase();
283    match x.as_str() {
284        "str" | "string" => Some(DType::String),
285        "int" | "integer" => Some(DType::Int),
286        "float" => Some(DType::Float),
287        "bool" | "boolean" => Some(DType::Bool),
288        _ => None,
289    }
290}
291
/// Result of `parse_colon_delimited_entity`: `(name, dtype, threshold, description)`.
type ParsedColonDelimitedEntity = (ExpandedName, Option<DType>, Option<f64>, Option<String>);
293
294fn parse_colon_delimited_entity(
295    raw: &str,
296    path: &Path,
297) -> Result<ParsedColonDelimitedEntity, SchemaNormalizeError> {
298    let parts: Vec<&str> = raw.split("::").map(str::trim).collect();
299    if parts.is_empty() {
300        return Err(SchemaNormalizeError::InvalidColonDelimitedEntity {
301            path: path.as_str().to_string(),
302            raw: raw.to_string(),
303        });
304    }
305
306    let name = normalize_name(parts[0], &path.field("name"))?;
307    let mut dtype = None;
308    let mut threshold = None;
309    let mut description = None;
310
311    for seg in parts.into_iter().skip(1) {
312        if seg.is_empty() {
313            continue;
314        }
315
316        if threshold.is_none()
317            && let Ok(f) = seg.parse::<f64>()
318        {
319            threshold = Some(f);
320            continue;
321        }
322
323        if dtype.is_none()
324            && let Some(dt) = looks_like_dtype(seg)
325        {
326            dtype = Some(dt);
327            continue;
328        }
329
330        if description.is_none() {
331            description = Some(seg.to_string());
332            continue;
333        }
334
335        return Err(SchemaNormalizeError::AmbiguousColonDelimitedEntity {
336            path: path.as_str().to_string(),
337            raw: raw.to_string(),
338        });
339    }
340
341    Ok((name, dtype, threshold, description))
342}
343
344fn entity_property_to_spec(
345    name_hint: Option<&str>,
346    ep: IngestEntityProperty,
347    path: &Path,
348) -> Result<EntitySpec, SchemaNormalizeError> {
349    match ep {
350        IngestEntityProperty::Description(desc) => {
351            let Some(name_hint) = name_hint else {
352                return Err(SchemaNormalizeError::InvalidStructureProperties {
353                    path: path.as_str().to_string(),
354                });
355            };
356
357            Ok(EntitySpec {
358                name: normalize_name(name_hint, &path.field("name"))?,
359                dtype: None,
360                validator: None,
361                threshold: None,
362                description: Some(desc),
363            })
364        }
365        IngestEntityProperty::Dict(d) => {
366            let raw_name = if d.name.trim().is_empty() {
367                name_hint.unwrap_or_default()
368            } else {
369                d.name.as_str()
370            };
371
372            let dtype = match d.dtype {
373                Some(dt) => Some(dtype_1_to_2(dt, &path.field("dtype"))?),
374                None => None,
375            };
376
377            Ok(EntitySpec {
378                name: normalize_name(raw_name, &path.field("name"))?,
379                dtype,
380                validator: d.validator.map(validator_1_to_2),
381                threshold: d.threshold,
382                description: d.description,
383            })
384        }
385    }
386}
387
388fn single_entry_map<K, V>(
389    map: BTreeMap<K, V>,
390    path: &Path,
391) -> Result<(K, V), SchemaNormalizeError> {
392    if map.len() != 1 {
393        return Err(SchemaNormalizeError::ExpectedSingleKey {
394            path: path.as_str().to_string(),
395            found: map.len(),
396        });
397    }
398    Ok(map.into_iter().next().unwrap())
399}
400
401fn entity_1_to_spec(entity: IngestEntity, path: &Path) -> Result<EntitySpec, SchemaNormalizeError> {
402    match entity {
403        IngestEntity::Stringish(s) => {
404            if s.contains("::") {
405                let (name, dtype, threshold, description) = parse_colon_delimited_entity(&s, path)?;
406                Ok(EntitySpec {
407                    name,
408                    dtype,
409                    validator: None,
410                    threshold,
411                    description,
412                })
413            } else {
414                Ok(EntitySpec {
415                    name: normalize_name(&s, &path.field("name"))?,
416                    dtype: None,
417                    validator: None,
418                    threshold: None,
419                    description: None,
420                })
421            }
422        }
423        IngestEntity::SingleEntityDict(map) => {
424            let (k, v) = single_entry_map(map, path)?;
425            entity_property_to_spec(Some(&k), v, path)
426        }
427    }
428}
429
430fn entity_list_1_to_vec(
431    list: IngestEntityList,
432    path: &Path,
433) -> Result<Vec<EntitySpec>, SchemaNormalizeError> {
434    match list {
435        IngestEntityList::List(items) => items
436            .into_iter()
437            .enumerate()
438            .map(|(i, item)| entity_1_to_spec(item, &path.index(i)))
439            .collect(),
440        IngestEntityList::Dict(map) => map
441            .into_iter()
442            .map(|(k, v)| entity_property_to_spec(Some(&k), v, &path.key(&k)))
443            .collect(),
444    }
445}
446
447fn structure_property_1_to_2(
448    v: IngestStructureProperty,
449    path: &Path,
450) -> Result<StructureProperty, SchemaNormalizeError> {
451    let dtype = match v.dtype {
452        Some(dt) => Some(dtype_1_to_2(dt, &path.field("dtype"))?),
453        None => None,
454    };
455
456    Ok(StructureProperty {
457        choices: match v.choices {
458            Some(choices) => entity_list_1_to_vec(choices, &path.field("choices"))?,
459            None => Vec::new(),
460        },
461        description: v.description,
462        value: v.value,
463        dtype,
464        validator: v.validator.map(validator_1_to_2),
465        threshold: v.threshold,
466    })
467}
468
469fn ensure_unique_keys<T>(
470    map: &BTreeMap<ExpandedName, T>,
471    path: &Path,
472) -> Result<(), SchemaNormalizeError> {
473    let mut seen = BTreeSet::new();
474    for k in map.keys() {
475        if !seen.insert(k.clone()) {
476            return Err(SchemaNormalizeError::DuplicateNormalizedName {
477                path: path.as_str().to_string(),
478                name: k.to_string(),
479            });
480        }
481    }
482    Ok(())
483}
484
485fn insert_unique<T>(
486    map: &mut BTreeMap<ExpandedName, T>,
487    key: ExpandedName,
488    value: T,
489    path: &Path,
490) -> Result<(), SchemaNormalizeError> {
491    if map.insert(key.clone(), value).is_some() {
492        return Err(SchemaNormalizeError::DuplicateNormalizedName {
493            path: path.as_str().to_string(),
494            name: key.to_string(),
495        });
496    }
497    Ok(())
498}
499
500fn structure_properties_1_to_map(
501    v: IngestStructureProperties,
502    path: &Path,
503) -> Result<BTreeMap<ExpandedName, StructureProperty>, SchemaNormalizeError> {
504    let out = match v {
505        IngestStructureProperties::EntityDict(map) => {
506            let mut out = BTreeMap::new();
507            for (k, v) in map {
508                let key = normalize_name(&k, &path.key(&k))?;
509                let spec = entity_property_to_spec(Some(&k), v, &path.key(&k))?;
510                insert_unique(
511                    &mut out,
512                    key,
513                    StructureProperty {
514                        choices: Vec::new(),
515                        description: spec.description,
516                        value: None,
517                        dtype: spec.dtype,
518                        validator: spec.validator,
519                        threshold: spec.threshold,
520                    },
521                    path,
522                )?;
523            }
524            out
525        }
526        IngestStructureProperties::EntityList(list) => {
527            let specs = entity_list_1_to_vec(list, path)?;
528            let mut out = BTreeMap::new();
529            for spec in specs {
530                insert_unique(
531                    &mut out,
532                    spec.name.clone(),
533                    StructureProperty {
534                        choices: Vec::new(),
535                        description: spec.description,
536                        value: None,
537                        dtype: spec.dtype,
538                        validator: spec.validator,
539                        threshold: spec.threshold,
540                    },
541                    path,
542                )?;
543            }
544            out
545        }
546        IngestStructureProperties::StructurePropertiesDict(map) => {
547            let mut out = BTreeMap::new();
548            for (k, v) in map {
549                let key = normalize_name(&k, &path.key(&k))?;
550                let value = structure_property_1_to_2(v, &path.key(&k))?;
551                insert_unique(&mut out, key, value, path)?;
552            }
553            out
554        }
555    };
556
557    ensure_unique_keys(&out, path)?;
558    Ok(out)
559}
560
561fn try_parse_structure_properties_value(
562    v: serde_json::Value,
563    path: &Path,
564) -> Result<BTreeMap<ExpandedName, StructureProperty>, SchemaNormalizeError> {
565    let parsed: IngestStructureProperties =
566        serde_json::from_value(v).map_err(|e| SchemaNormalizeError::NestedDecode {
567            path: path.as_str().to_string(),
568            message: e.to_string(),
569        })?;
570    structure_properties_1_to_map(parsed, path)
571}
572
573fn named_structure_1_to_2(
574    ns: IngestNamedStructure,
575    path: &Path,
576) -> Result<NamedStructure, SchemaNormalizeError> {
577    let name = normalize_name(&ns.name, &path.field("name"))?;
578    let mut props = BTreeMap::new();
579
580    for (k, v) in ns.props {
581        let key = normalize_name(&k, &path.key(&k))?;
582        let value = structure_property_1_to_2(v, &path.key(&k))?;
583        insert_unique(&mut props, key, value, &path.field("props"))?;
584    }
585
586    ensure_unique_keys(&props, &path.field("props"))?;
587    Ok(NamedStructure { name, props })
588}
589
590fn json_structure_1_to_2(
591    v: IngestJsonStructure,
592    path: &Path,
593) -> Result<Vec<JsonStructure>, SchemaNormalizeError> {
594    match v {
595        IngestJsonStructure::NamedStructure(ns) => Ok(vec![JsonStructure::NamedStructure(
596            named_structure_1_to_2(ns, path)?,
597        )]),
598        IngestJsonStructure::EntityList(list) => Ok(vec![JsonStructure::EntityList(
599            entity_list_1_to_vec(list, path)?,
600        )]),
601        IngestJsonStructure::JsonNameKeyedStructure(IngestJsonNameKeyedStructure(map)) => {
602            let mut out = Vec::new();
603            for (name_raw, value) in map {
604                let name = normalize_name(&name_raw, &path.key(&name_raw))?;
605                let props = try_parse_structure_properties_value(value, &path.key(&name_raw))?;
606                out.push(JsonStructure::NameKeyedStructure { name, props });
607            }
608            Ok(out)
609        }
610    }
611}
612
613fn classification_1_to_2(
614    v: IngestClassification,
615    path: &Path,
616) -> Result<Classification, SchemaNormalizeError> {
617    let threshold = match (v.threshold, v.cls_threshold) {
618        (Some(a), Some(b)) if (a - b).abs() > f64::EPSILON => {
619            return Err(SchemaNormalizeError::ConflictingThresholdAliases {
620                path: path.as_str().to_string(),
621                threshold: Some(a),
622                cls_threshold: Some(b),
623            });
624        }
625        (Some(a), _) => Some(a),
626        (_, Some(b)) => Some(b),
627        _ => None,
628    };
629
630    let task = entity_1_to_spec(v.task, &path.field("task"))?;
631    let labels = entity_list_1_to_vec(v.labels, &path.field("labels"))?;
632
633    let mut label_descriptions = BTreeMap::new();
634    if let Some(map) = v.label_descriptions {
635        for (k, ep) in map {
636            let key = normalize_name(&k, &path.field("label_descriptions").key(&k))?;
637            let spec =
638                entity_property_to_spec(Some(&k), ep, &path.field("label_descriptions").key(&k))?;
639            label_descriptions.insert(key, spec);
640        }
641    }
642
643    Ok(Classification {
644        task,
645        labels,
646        threshold,
647        multi_label: v.multi_label.unwrap_or(false),
648        label_descriptions,
649    })
650}
651
652fn entity_acquired_to_2(
653    v: IngestEntityAcquired,
654    path: &Path,
655) -> Result<RelationAcquired, SchemaNormalizeError> {
656    match (v.head, v.tail) {
657        (IngestEntity::Stringish(s1), IngestEntity::Stringish(s2))
658            if s1.is_empty() && s2.is_empty() =>
659        {
660            Ok(RelationAcquired::Empty)
661        }
662        (h, t) => Ok(RelationAcquired::Entity {
663            head: Box::new(entity_1_to_spec(h, &path.field("head"))?),
664            tail: Box::new(entity_1_to_spec(t, &path.field("tail"))?),
665        }),
666    }
667}
668
669fn relation_1_to_2(v: IngestRelation, path: &Path) -> Result<Relation, SchemaNormalizeError> {
670    match v {
671        IngestRelation::Name(name) => Ok(Relation {
672            name: normalize_name(&name, &path.field("name"))?,
673            description: None,
674            acquired: None,
675        }),
676        IngestRelation::NameDescription(map) => {
677            let (k, desc) = single_entry_map(map, path)?;
678            Ok(Relation {
679                name: normalize_name(&k, &path.key(&k))?,
680                description: Some(desc),
681                acquired: None,
682            })
683        }
684        IngestRelation::RelationEntityAcquired(map) => {
685            let (k, acq) = single_entry_map(map, path)?;
686            Ok(Relation {
687                name: normalize_name(&k, &path.key(&k))?,
688                description: None,
689                acquired: Some(entity_acquired_to_2(acq, &path.key(&k))?),
690            })
691        }
692    }
693}
694
695impl TryFrom<IngestSchema> for NormalizedSchema {
696    type Error = SchemaNormalizeError;
697
698    fn try_from(v: IngestSchema) -> Result<Self, Self::Error> {
699        let root = Path::root();
700
701        let entities = match v.entities {
702            Some(x) => entity_list_1_to_vec(x, &root.field("entities"))?,
703            None => Vec::new(),
704        };
705
706        let mut json_structures = Vec::new();
707        match v.json_structures {
708            None => {}
709            Some(IngestJsonStructureList::Single(one)) => {
710                json_structures.extend(json_structure_1_to_2(one, &root.field("json_structures"))?);
711            }
712            Some(IngestJsonStructureList::List(list)) => {
713                for (i, item) in list.into_iter().enumerate() {
714                    json_structures.extend(json_structure_1_to_2(
715                        item,
716                        &root.field("json_structures").index(i),
717                    )?);
718                }
719            }
720        }
721
722        let classifications = match v.classifications {
723            Some(list) => list
724                .into_iter()
725                .enumerate()
726                .map(|(i, cls)| classification_1_to_2(cls, &root.field("classifications").index(i)))
727                .collect::<Result<Vec<_>, _>>()?,
728            None => Vec::new(),
729        };
730
731        let relations = match v.relations {
732            Some(list) => list
733                .into_iter()
734                .enumerate()
735                .map(|(i, rel)| relation_1_to_2(rel, &root.field("relations").index(i)))
736                .collect::<Result<Vec<_>, _>>()?,
737            None => Vec::new(),
738        };
739
740        Ok(Self {
741            entities,
742            json_structures,
743            classifications,
744            relations,
745        })
746    }
747}
748
749fn ingest_error_is_unknown_field(err: &serde_json::Error) -> bool {
750    err.to_string().contains("unknown field")
751}
752
753/// Serde reports `unknown field `foo`, expected ...` for unknown keys on `deny_unknown_fields`.
754fn unknown_field_name_in_ingest_error(err: &serde_json::Error) -> Option<String> {
755    let s = err.to_string();
756    let prefix = "unknown field `";
757    let start = s.find(prefix)? + prefix.len();
758    let rest = &s[start..];
759    let end = rest.find('`')?;
760    Some(rest[..end].to_string())
761}
762
763/// When strict IE ingest fails on an unknown field, still try JSON Schema if the field name
764/// looks like JSON Schema vocabulary (e.g. `$id`, `type`, `properties`), not a stray IE key.
765fn ingest_error_suggests_json_schema_root(err: &serde_json::Error) -> bool {
766    let Some(name) = unknown_field_name_in_ingest_error(err) else {
767        return false;
768    };
769    if name.starts_with('$') {
770        return true;
771    }
772    matches!(
773        name.as_str(),
774        "type"
775            | "properties"
776            | "required"
777            | "items"
778            | "title"
779            | "description"
780            | "definitions"
781            | "additionalProperties"
782            | "patternProperties"
783            | "allOf"
784            | "anyOf"
785            | "oneOf"
786            | "not"
787    )
788}
789
790impl NormalizedSchema {
791    /// Parse IE ingest JSON or a root JSON Schema object from UTF-8 bytes.
792    ///
793    /// Tries strict IE [`IngestSchema`] first (unknown top-level keys are rejected), then
794    /// JSON Schema via [`JSONSchemaIngestSchema`].
795    pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, SchemaLoadError> {
796        match IngestSchema::from_json_slice(bytes) {
797            Ok(phase1) => Ok(Self::try_from(phase1)?),
798            Err(json_err) => {
799                if ingest_error_is_unknown_field(&json_err)
800                    && !ingest_error_suggests_json_schema_root(&json_err)
801                {
802                    return Err(SchemaLoadError::Json(json_err));
803                }
804                let schema = JSONSchemaIngestSchema::from_json_utf8(bytes)
805                    .map_err(|_| SchemaLoadError::Json(json_err))?;
806                let ingest = IngestSchema::try_from(schema).map_err(|e| {
807                    SchemaLoadError::Normalize(SchemaNormalizeError::NestedDecode {
808                        path: "$".to_string(),
809                        message: e.to_string(),
810                    })
811                })?;
812                Ok(Self::try_from(ingest)?)
813            }
814        }
815    }
816
817    pub fn from_json_str(s: &str) -> Result<Self, SchemaLoadError> {
818        Self::from_json_bytes(s.as_bytes())
819    }
820}
821
/// Errors returned by `NormalizedSchema::from_json_bytes` / `from_json_str`.
#[derive(Debug, thiserror::Error)]
pub enum SchemaLoadError {
    /// The input could not be decoded (wraps the `serde_json` error).
    #[error("json parse error: {0}")]
    Json(#[from] serde_json::Error),

    /// The input decoded, but converting it into a normalized schema failed.
    #[error("schema normalization error: {0}")]
    Normalize(#[from] SchemaNormalizeError),
}
830
// End-to-end tests: each one feeds JSON text through `NormalizedSchema::from_json_str`.
#[cfg(test)]
mod tests {
    use super::*;

    // `name::dtype::threshold::description` is split into the matching fields.
    #[test]
    fn normalized_parses_colon_delimited_entity() {
        let s = r#"{ "entities": ["Gene::str::0.95::gene symbol"] }"#;
        let schema = NormalizedSchema::from_json_str(s).unwrap();

        assert_eq!(schema.entities.len(), 1);
        let e = &schema.entities[0];
        assert_eq!(e.name.as_str(), "gene");
        assert_eq!(e.threshold, Some(0.95));
        assert_eq!(e.description.as_deref(), Some("gene symbol"));
        assert_eq!(e.dtype, Some(DType::String));
    }

    // Hyphens and underscores both end up as `_`, and letters are lowercased.
    #[test]
    fn normalized_normalizes_hyphenated_and_camelish_names() {
        let s = r#"{ "entities": ["Gene-Name", "Other_Name"] }"#;
        let schema = NormalizedSchema::from_json_str(s).unwrap();

        assert_eq!(schema.entities[0].name.as_str(), "gene_name");
        assert_eq!(schema.entities[1].name.as_str(), "other_name");
    }

    // `threshold` and `cls_threshold` with different values must be rejected.
    #[test]
    fn normalized_rejects_conflicting_threshold_aliases() {
        let s = r#"
        {
            "classifications": [
                {
                    "task": "sentiment",
                    "labels": ["positive", "negative"],
                    "threshold": 0.4,
                    "cls_threshold": 0.7
                }
            ]
        }
        "#;

        let err = NormalizedSchema::from_json_str(s).unwrap_err();
        match err {
            SchemaLoadError::Normalize(SchemaNormalizeError::ConflictingThresholdAliases {
                ..
            }) => {}
            other => panic!("unexpected error: {other:?}"),
        }
    }

    // A root JSON Schema object is accepted through the fallback path and yields one
    // name-keyed structure named after `$id`.
    #[test]
    fn normalized_parses_json_schema_root_string() {
        let raw = r#"
        {
            "$id": "BusinessRecord",
            "type": "object",
            "required": ["business_name"],
            "properties": {
                "business_name": { "type": "string" },
                "status": { "type": "string" }
            }
        }
        "#;

        let schema = NormalizedSchema::from_json_str(raw).unwrap();
        assert_eq!(schema.json_structures.len(), 1);
        let JsonStructure::NameKeyedStructure { name, props } = &schema.json_structures[0] else {
            panic!("expected name-keyed structure");
        };
        assert_eq!(name.as_str(), "businessrecord");
        assert!(props.contains_key(&ExpandedName::new("business_name".into())));
        assert!(props.contains_key(&ExpandedName::new("status".into())));
    }

    // An unknown top-level key that is not JSON Schema vocabulary stays a JSON error.
    #[test]
    fn normalized_rejects_unknown_top_level_ie_key() {
        let s = r#"{ "entities": ["gene"], "extra_root": 1 }"#;
        let err = NormalizedSchema::from_json_str(s).expect_err("unknown IE key");
        match err {
            SchemaLoadError::Json(_) => {}
            other => panic!("unexpected error: {other:?}"),
        }
    }

    // A dict-form entity must have exactly one key.
    #[test]
    fn normalized_rejects_multi_key_single_entity_dict() {
        let s = r#"
        {
            "entities": [
                {
                    "gene": "desc",
                    "protein": "desc2"
                }
            ]
        }
        "#;

        let err = NormalizedSchema::from_json_str(s).unwrap_err();
        match err {
            SchemaLoadError::Normalize(SchemaNormalizeError::ExpectedSingleKey { .. }) => {}
            other => panic!("unexpected error: {other:?}"),
        }
    }

    // Property keys of a named structure are normalized like any other name.
    #[test]
    fn normalized_named_structure_normalizes_property_keys() {
        let s = r#"
        {
            "json_structures": [
                {
                    "name": "Patient Record",
                    "Field-A": {
                        "description": "field a",
                        "dtype": "str"
                    },
                    "Field B": {
                        "choices": ["x", "y::str::0.5::label"]
                    }
                }
            ]
        }
        "#;

        let schema = NormalizedSchema::from_json_str(s).unwrap();
        let JsonStructure::NamedStructure(ns) = &schema.json_structures[0] else {
            panic!("expected named structure");
        };

        assert_eq!(ns.name.as_str(), "patient_record");
        assert!(ns.props.keys().any(|k| k.as_str() == "field_a"));
        assert!(ns.props.keys().any(|k| k.as_str() == "field_b"));
    }

    // "Field-A" and "Field A" both normalize to `field_a`, which must be rejected.
    #[test]
    fn normalized_rejects_duplicate_normalized_structure_property_names() {
        let s = r#"
        {
            "json_structures": [
                {
                    "name": "Patient Record",
                    "Field-A": { "dtype": "str" },
                    "Field A": { "dtype": "str" }
                }
            ]
        }
        "#;

        let err = NormalizedSchema::from_json_str(s).unwrap_err();
        match err {
            SchemaLoadError::Normalize(SchemaNormalizeError::DuplicateNormalizedName {
                name,
                ..
            }) => assert_eq!(name, "field_a"),
            other => panic!("unexpected error: {other:?}"),
        }
    }
}