1use crate::ingest::{
2 IngestClassification, IngestDType, IngestEntity, IngestEntityAcquired, IngestEntityList,
3 IngestEntityProperty, IngestJsonNameKeyedStructure, IngestJsonStructure,
4 IngestJsonStructureList, IngestNamedStructure, IngestRelation, IngestSchema,
5 IngestStructureProperties, IngestStructureProperty, IngestValidator, IngestValidatorMode,
6};
7use crate::json_schema::JSONSchemaIngestSchema;
8use serde::Serialize;
9use std::collections::{BTreeMap, BTreeSet};
10use std::convert::TryFrom;
11use std::fmt;
12
/// Free-text description attached to entities, structure properties, and relations.
pub type Description = String;
/// Regular-expression source text; kept as a plain string and not compiled in this module.
pub type Regex = String;
/// Numeric threshold carried on specs and classifications.
/// NOTE(review): presumably a confidence cutoff — interpretation is left to consumers.
pub type Threshold = f64;
16
/// Newtype around a `String` that holds a schema name.
///
/// Values built through `normalize_name` in this file are lowercase ASCII
/// alphanumerics joined by single underscores; the constructor itself does
/// not validate its input.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Default)]
pub struct ExpandedName(String);
19
20impl ExpandedName {
21 pub fn new(s: String) -> Self {
22 Self(s)
23 }
24
25 pub fn as_str(&self) -> &str {
26 &self.0
27 }
28
29 pub fn into_inner(self) -> String {
30 self.0
31 }
32}
33
34impl fmt::Display for ExpandedName {
35 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
36 self.0.fmt(f)
37 }
38}
39
/// Value type tag that can be attached to an entity or structure property.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
pub enum DType {
    /// Text value (spelled `str` or `string` in ingest input).
    String,
    /// Integer value (spelled `int` or `integer`).
    Int,
    /// Floating-point value (spelled `float`).
    Float,
    /// Boolean value (spelled `bool` or `boolean`).
    Bool,
}
47
/// How a validator pattern is applied.
/// NOTE(review): presumably "match anywhere" vs "match the whole value" —
/// the matching itself is not implemented in this file; confirm with the consumer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
pub enum ValidatorMode {
    Partial,
    Full,
}
53
/// Normalized pattern-based validator.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct Validator {
    /// Pattern text, copied verbatim from the ingest form.
    pub pattern: Regex,
    /// Matching mode; `None` when the ingest form did not specify one.
    pub mode: Option<ValidatorMode>,
    /// Exclusion flag; `false` when the ingest form omitted it.
    pub exclude: bool,
}
60
/// Normalized description of a single entity (also used for labels, tasks,
/// choices, and relation endpoints).
#[derive(Debug, Clone, PartialEq, Serialize, Default)]
pub struct EntitySpec {
    /// Normalized entity name.
    pub name: ExpandedName,
    /// Optional value type.
    pub dtype: Option<DType>,
    /// Optional pattern validator.
    pub validator: Option<Validator>,
    /// Optional threshold.
    pub threshold: Option<Threshold>,
    /// Optional free-text description.
    pub description: Option<Description>,
}
71
/// Normalized property of a JSON structure.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct StructureProperty {
    /// Entity specs listed under the property's `choices`; empty when none were given.
    pub choices: Vec<EntitySpec>,
    /// Optional free-text description.
    pub description: Option<Description>,
    /// Optional `value` string, passed through unchanged from the ingest form.
    pub value: Option<String>,
    /// Optional value type.
    pub dtype: Option<DType>,
    /// Optional pattern validator.
    pub validator: Option<Validator>,
    /// Optional threshold.
    pub threshold: Option<Threshold>,
}
81
/// A structure that carries its own `name` plus a map of normalized properties.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct NamedStructure {
    /// Normalized structure name.
    pub name: ExpandedName,
    /// Properties keyed by normalized property name.
    pub props: BTreeMap<ExpandedName, StructureProperty>,
}
87
/// Normalized form of one JSON-structure entry in the schema.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub enum JsonStructure {
    /// Structure given with an explicit `name` field.
    NamedStructure(NamedStructure),
    /// Structure whose name was the key of a name-keyed map in the ingest form.
    NameKeyedStructure {
        /// Normalized structure name (from the map key).
        name: ExpandedName,
        /// Properties keyed by normalized property name.
        props: BTreeMap<ExpandedName, StructureProperty>,
    },
    /// A bare list of entity specs.
    EntityList(Vec<EntitySpec>),
}
97
/// Normalized classification task.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct Classification {
    /// The task, expressed as an entity spec.
    pub task: EntitySpec,
    /// Candidate labels.
    pub labels: Vec<EntitySpec>,
    /// Threshold merged from the `threshold` / `cls_threshold` ingest aliases.
    pub threshold: Option<Threshold>,
    /// Multi-label flag; `false` when the ingest form omitted it.
    pub multi_label: bool,
    /// Per-label specs keyed by normalized label name; empty when none were given.
    pub label_descriptions: BTreeMap<ExpandedName, EntitySpec>,
}
106
/// Endpoint information attached to a relation.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub enum RelationAcquired {
    /// Both head and tail were given as empty strings in the ingest form.
    Empty,
    /// Head and tail entity specs.
    Entity {
        head: Box<EntitySpec>,
        tail: Box<EntitySpec>,
    },
}
115
/// Normalized relation definition.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct Relation {
    /// Normalized relation name.
    pub name: ExpandedName,
    /// Description; set only for the name→description ingest form.
    pub description: Option<Description>,
    /// Head/tail information; set only for the entity-acquired ingest form.
    pub acquired: Option<RelationAcquired>,
}
122
/// Fully normalized schema; each section is empty when absent from the input.
#[derive(Debug, Clone, PartialEq, Serialize, Default)]
pub struct NormalizedSchema {
    /// Top-level entity specs.
    pub entities: Vec<EntitySpec>,
    /// JSON structures, flattened from single/list/name-keyed ingest forms.
    pub json_structures: Vec<JsonStructure>,
    /// Classification tasks.
    pub classifications: Vec<Classification>,
    /// Relation definitions.
    pub relations: Vec<Relation>,
}
130
/// Textual location inside the input document, used in error messages.
///
/// Paths start at `$` and grow with `.field`, `[index]`, and `["key"]` segments.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Path(String);

impl Path {
    /// The document root, rendered as `$`.
    pub fn root() -> Self {
        Path(String::from("$"))
    }

    /// Returns a new path with `.name` appended.
    pub fn field(&self, name: &str) -> Self {
        let mut rendered = String::with_capacity(self.0.len() + 1 + name.len());
        rendered.push_str(&self.0);
        rendered.push('.');
        rendered.push_str(name);
        Path(rendered)
    }

    /// Returns a new path with `[idx]` appended.
    pub fn index(&self, idx: usize) -> Self {
        Path(format!("{base}[{idx}]", base = self.0))
    }

    /// Returns a new path with `["key"]` appended; the key is rendered with
    /// `Debug` formatting, so it is quoted and escaped.
    pub fn key(&self, key: &str) -> Self {
        Path(format!("{base}[{key:?}]", base = self.0))
    }

    /// Borrows the rendered path.
    pub fn as_str(&self) -> &str {
        self.0.as_str()
    }
}
155
/// Errors raised while converting an ingest schema into a [`NormalizedSchema`].
///
/// Every variant carries the rendered [`Path`] of the offending location.
#[derive(Debug, thiserror::Error)]
pub enum SchemaNormalizeError {
    /// A name was empty, contained an unsupported character, or normalized to nothing.
    #[error("invalid name at {path}: {value:?}")]
    InvalidName { path: String, value: String },

    /// A map that must contain exactly one entry had `found` entries.
    #[error("expected exactly one key at {path}, found {found}")]
    ExpectedSingleKey { path: String, found: usize },

    /// A `::`-delimited entity string could not be split into segments.
    #[error("invalid colon-delimited entity at {path}: {raw:?}")]
    InvalidColonDelimitedEntity { path: String, raw: String },

    /// A `::`-delimited entity string had more segments than could be assigned.
    #[error("ambiguous colon-delimited entity at {path}: {raw:?}")]
    AmbiguousColonDelimitedEntity { path: String, raw: String },

    /// `threshold` and `cls_threshold` were both given with different values.
    #[error(
        "conflicting threshold aliases at {path}: threshold={threshold:?}, cls_threshold={cls_threshold:?}"
    )]
    ConflictingThresholdAliases {
        path: String,
        threshold: Option<f64>,
        cls_threshold: Option<f64>,
    },

    /// Two raw names collapsed to the same normalized name.
    #[error("duplicate normalized name at {path}: {name}")]
    DuplicateNormalizedName { path: String, name: String },

    /// NOTE(review): not constructed in the code visible in this file.
    #[error("invalid empty-acquired relation at {path}")]
    InvalidEmptyAcquiredRelation { path: String },

    /// A description-only entity property was given without a name to attach it to.
    #[error("invalid structure properties at {path}")]
    InvalidStructureProperties { path: String },

    /// A dtype string was not one of the recognized spellings.
    #[error("invalid dtype at {path}: {value:?}")]
    InvalidDType { path: String, value: String },

    /// Decoding or converting a nested value failed; `message` is the inner error text.
    #[error("nested decode failed at {path}: {message}")]
    NestedDecode { path: String, message: String },
}
194
195fn normalize_name(raw: &str, path: &Path) -> Result<ExpandedName, SchemaNormalizeError> {
196 let trimmed = raw.trim();
197 if trimmed.is_empty() {
198 return Err(SchemaNormalizeError::InvalidName {
199 path: path.as_str().to_string(),
200 value: raw.to_string(),
201 });
202 }
203
204 let mut out = String::new();
205 let mut prev_us = false;
206
207 for ch in trimmed.chars() {
208 match ch {
209 'a'..='z' | '0'..='9' => {
210 out.push(ch);
211 prev_us = false;
212 }
213 'A'..='Z' => {
214 out.push(ch.to_ascii_lowercase());
215 prev_us = false;
216 }
217 '_' | '-' | ' ' => {
218 if !prev_us && !out.is_empty() {
219 out.push('_');
220 prev_us = true;
221 }
222 }
223 _ => {
224 return Err(SchemaNormalizeError::InvalidName {
225 path: path.as_str().to_string(),
226 value: raw.to_string(),
227 });
228 }
229 }
230 }
231
232 while out.ends_with('_') {
233 out.pop();
234 }
235
236 if out.is_empty() {
237 return Err(SchemaNormalizeError::InvalidName {
238 path: path.as_str().to_string(),
239 value: raw.to_string(),
240 });
241 }
242
243 Ok(ExpandedName::new(out))
244}
245
246fn dtype_1_to_2(v: IngestDType, path: &Path) -> Result<DType, SchemaNormalizeError> {
247 match v {
248 IngestDType::String(s) => {
249 let Some(dt) = looks_like_dtype(&s) else {
250 return Err(SchemaNormalizeError::InvalidDType {
251 path: path.as_str().to_string(),
252 value: s,
253 });
254 };
255 Ok(dt)
256 }
257 IngestDType::Int(_) => Ok(DType::Int),
258 IngestDType::Float(_) => Ok(DType::Float),
259 IngestDType::Bool(_) => Ok(DType::Bool),
260 }
261}
262
263fn validator_1_to_2(v: IngestValidator) -> Validator {
264 match v {
265 IngestValidator::Regex(pattern) => Validator {
266 pattern,
267 mode: None,
268 exclude: false,
269 },
270 IngestValidator::Dict(d) => Validator {
271 pattern: d.pattern,
272 mode: d.mode.map(|m| match m {
273 IngestValidatorMode::Partial => ValidatorMode::Partial,
274 IngestValidatorMode::Full => ValidatorMode::Full,
275 }),
276 exclude: d.exclude.unwrap_or(false),
277 },
278 }
279}
280
281fn looks_like_dtype(s: &str) -> Option<DType> {
282 let x = s.trim().to_ascii_lowercase();
283 match x.as_str() {
284 "str" | "string" => Some(DType::String),
285 "int" | "integer" => Some(DType::Int),
286 "float" => Some(DType::Float),
287 "bool" | "boolean" => Some(DType::Bool),
288 _ => None,
289 }
290}
291
292type ParsedColonDelimitedEntity = (ExpandedName, Option<DType>, Option<f64>, Option<String>);
293
294fn parse_colon_delimited_entity(
295 raw: &str,
296 path: &Path,
297) -> Result<ParsedColonDelimitedEntity, SchemaNormalizeError> {
298 let parts: Vec<&str> = raw.split("::").map(str::trim).collect();
299 if parts.is_empty() {
300 return Err(SchemaNormalizeError::InvalidColonDelimitedEntity {
301 path: path.as_str().to_string(),
302 raw: raw.to_string(),
303 });
304 }
305
306 let name = normalize_name(parts[0], &path.field("name"))?;
307 let mut dtype = None;
308 let mut threshold = None;
309 let mut description = None;
310
311 for seg in parts.into_iter().skip(1) {
312 if seg.is_empty() {
313 continue;
314 }
315
316 if threshold.is_none()
317 && let Ok(f) = seg.parse::<f64>()
318 {
319 threshold = Some(f);
320 continue;
321 }
322
323 if dtype.is_none()
324 && let Some(dt) = looks_like_dtype(seg)
325 {
326 dtype = Some(dt);
327 continue;
328 }
329
330 if description.is_none() {
331 description = Some(seg.to_string());
332 continue;
333 }
334
335 return Err(SchemaNormalizeError::AmbiguousColonDelimitedEntity {
336 path: path.as_str().to_string(),
337 raw: raw.to_string(),
338 });
339 }
340
341 Ok((name, dtype, threshold, description))
342}
343
344fn entity_property_to_spec(
345 name_hint: Option<&str>,
346 ep: IngestEntityProperty,
347 path: &Path,
348) -> Result<EntitySpec, SchemaNormalizeError> {
349 match ep {
350 IngestEntityProperty::Description(desc) => {
351 let Some(name_hint) = name_hint else {
352 return Err(SchemaNormalizeError::InvalidStructureProperties {
353 path: path.as_str().to_string(),
354 });
355 };
356
357 Ok(EntitySpec {
358 name: normalize_name(name_hint, &path.field("name"))?,
359 dtype: None,
360 validator: None,
361 threshold: None,
362 description: Some(desc),
363 })
364 }
365 IngestEntityProperty::Dict(d) => {
366 let raw_name = if d.name.trim().is_empty() {
367 name_hint.unwrap_or_default()
368 } else {
369 d.name.as_str()
370 };
371
372 let dtype = match d.dtype {
373 Some(dt) => Some(dtype_1_to_2(dt, &path.field("dtype"))?),
374 None => None,
375 };
376
377 Ok(EntitySpec {
378 name: normalize_name(raw_name, &path.field("name"))?,
379 dtype,
380 validator: d.validator.map(validator_1_to_2),
381 threshold: d.threshold,
382 description: d.description,
383 })
384 }
385 }
386}
387
388fn single_entry_map<K, V>(
389 map: BTreeMap<K, V>,
390 path: &Path,
391) -> Result<(K, V), SchemaNormalizeError> {
392 if map.len() != 1 {
393 return Err(SchemaNormalizeError::ExpectedSingleKey {
394 path: path.as_str().to_string(),
395 found: map.len(),
396 });
397 }
398 Ok(map.into_iter().next().unwrap())
399}
400
401fn entity_1_to_spec(entity: IngestEntity, path: &Path) -> Result<EntitySpec, SchemaNormalizeError> {
402 match entity {
403 IngestEntity::Stringish(s) => {
404 if s.contains("::") {
405 let (name, dtype, threshold, description) = parse_colon_delimited_entity(&s, path)?;
406 Ok(EntitySpec {
407 name,
408 dtype,
409 validator: None,
410 threshold,
411 description,
412 })
413 } else {
414 Ok(EntitySpec {
415 name: normalize_name(&s, &path.field("name"))?,
416 dtype: None,
417 validator: None,
418 threshold: None,
419 description: None,
420 })
421 }
422 }
423 IngestEntity::SingleEntityDict(map) => {
424 let (k, v) = single_entry_map(map, path)?;
425 entity_property_to_spec(Some(&k), v, path)
426 }
427 }
428}
429
430fn entity_list_1_to_vec(
431 list: IngestEntityList,
432 path: &Path,
433) -> Result<Vec<EntitySpec>, SchemaNormalizeError> {
434 match list {
435 IngestEntityList::List(items) => items
436 .into_iter()
437 .enumerate()
438 .map(|(i, item)| entity_1_to_spec(item, &path.index(i)))
439 .collect(),
440 IngestEntityList::Dict(map) => map
441 .into_iter()
442 .map(|(k, v)| entity_property_to_spec(Some(&k), v, &path.key(&k)))
443 .collect(),
444 }
445}
446
447fn structure_property_1_to_2(
448 v: IngestStructureProperty,
449 path: &Path,
450) -> Result<StructureProperty, SchemaNormalizeError> {
451 let dtype = match v.dtype {
452 Some(dt) => Some(dtype_1_to_2(dt, &path.field("dtype"))?),
453 None => None,
454 };
455
456 Ok(StructureProperty {
457 choices: match v.choices {
458 Some(choices) => entity_list_1_to_vec(choices, &path.field("choices"))?,
459 None => Vec::new(),
460 },
461 description: v.description,
462 value: v.value,
463 dtype,
464 validator: v.validator.map(validator_1_to_2),
465 threshold: v.threshold,
466 })
467}
468
469fn ensure_unique_keys<T>(
470 map: &BTreeMap<ExpandedName, T>,
471 path: &Path,
472) -> Result<(), SchemaNormalizeError> {
473 let mut seen = BTreeSet::new();
474 for k in map.keys() {
475 if !seen.insert(k.clone()) {
476 return Err(SchemaNormalizeError::DuplicateNormalizedName {
477 path: path.as_str().to_string(),
478 name: k.to_string(),
479 });
480 }
481 }
482 Ok(())
483}
484
485fn insert_unique<T>(
486 map: &mut BTreeMap<ExpandedName, T>,
487 key: ExpandedName,
488 value: T,
489 path: &Path,
490) -> Result<(), SchemaNormalizeError> {
491 if map.insert(key.clone(), value).is_some() {
492 return Err(SchemaNormalizeError::DuplicateNormalizedName {
493 path: path.as_str().to_string(),
494 name: key.to_string(),
495 });
496 }
497 Ok(())
498}
499
500fn structure_properties_1_to_map(
501 v: IngestStructureProperties,
502 path: &Path,
503) -> Result<BTreeMap<ExpandedName, StructureProperty>, SchemaNormalizeError> {
504 let out = match v {
505 IngestStructureProperties::EntityDict(map) => {
506 let mut out = BTreeMap::new();
507 for (k, v) in map {
508 let key = normalize_name(&k, &path.key(&k))?;
509 let spec = entity_property_to_spec(Some(&k), v, &path.key(&k))?;
510 insert_unique(
511 &mut out,
512 key,
513 StructureProperty {
514 choices: Vec::new(),
515 description: spec.description,
516 value: None,
517 dtype: spec.dtype,
518 validator: spec.validator,
519 threshold: spec.threshold,
520 },
521 path,
522 )?;
523 }
524 out
525 }
526 IngestStructureProperties::EntityList(list) => {
527 let specs = entity_list_1_to_vec(list, path)?;
528 let mut out = BTreeMap::new();
529 for spec in specs {
530 insert_unique(
531 &mut out,
532 spec.name.clone(),
533 StructureProperty {
534 choices: Vec::new(),
535 description: spec.description,
536 value: None,
537 dtype: spec.dtype,
538 validator: spec.validator,
539 threshold: spec.threshold,
540 },
541 path,
542 )?;
543 }
544 out
545 }
546 IngestStructureProperties::StructurePropertiesDict(map) => {
547 let mut out = BTreeMap::new();
548 for (k, v) in map {
549 let key = normalize_name(&k, &path.key(&k))?;
550 let value = structure_property_1_to_2(v, &path.key(&k))?;
551 insert_unique(&mut out, key, value, path)?;
552 }
553 out
554 }
555 };
556
557 ensure_unique_keys(&out, path)?;
558 Ok(out)
559}
560
561fn try_parse_structure_properties_value(
562 v: serde_json::Value,
563 path: &Path,
564) -> Result<BTreeMap<ExpandedName, StructureProperty>, SchemaNormalizeError> {
565 let parsed: IngestStructureProperties =
566 serde_json::from_value(v).map_err(|e| SchemaNormalizeError::NestedDecode {
567 path: path.as_str().to_string(),
568 message: e.to_string(),
569 })?;
570 structure_properties_1_to_map(parsed, path)
571}
572
573fn named_structure_1_to_2(
574 ns: IngestNamedStructure,
575 path: &Path,
576) -> Result<NamedStructure, SchemaNormalizeError> {
577 let name = normalize_name(&ns.name, &path.field("name"))?;
578 let mut props = BTreeMap::new();
579
580 for (k, v) in ns.props {
581 let key = normalize_name(&k, &path.key(&k))?;
582 let value = structure_property_1_to_2(v, &path.key(&k))?;
583 insert_unique(&mut props, key, value, &path.field("props"))?;
584 }
585
586 ensure_unique_keys(&props, &path.field("props"))?;
587 Ok(NamedStructure { name, props })
588}
589
590fn json_structure_1_to_2(
591 v: IngestJsonStructure,
592 path: &Path,
593) -> Result<Vec<JsonStructure>, SchemaNormalizeError> {
594 match v {
595 IngestJsonStructure::NamedStructure(ns) => Ok(vec![JsonStructure::NamedStructure(
596 named_structure_1_to_2(ns, path)?,
597 )]),
598 IngestJsonStructure::EntityList(list) => Ok(vec![JsonStructure::EntityList(
599 entity_list_1_to_vec(list, path)?,
600 )]),
601 IngestJsonStructure::JsonNameKeyedStructure(IngestJsonNameKeyedStructure(map)) => {
602 let mut out = Vec::new();
603 for (name_raw, value) in map {
604 let name = normalize_name(&name_raw, &path.key(&name_raw))?;
605 let props = try_parse_structure_properties_value(value, &path.key(&name_raw))?;
606 out.push(JsonStructure::NameKeyedStructure { name, props });
607 }
608 Ok(out)
609 }
610 }
611}
612
613fn classification_1_to_2(
614 v: IngestClassification,
615 path: &Path,
616) -> Result<Classification, SchemaNormalizeError> {
617 let threshold = match (v.threshold, v.cls_threshold) {
618 (Some(a), Some(b)) if (a - b).abs() > f64::EPSILON => {
619 return Err(SchemaNormalizeError::ConflictingThresholdAliases {
620 path: path.as_str().to_string(),
621 threshold: Some(a),
622 cls_threshold: Some(b),
623 });
624 }
625 (Some(a), _) => Some(a),
626 (_, Some(b)) => Some(b),
627 _ => None,
628 };
629
630 let task = entity_1_to_spec(v.task, &path.field("task"))?;
631 let labels = entity_list_1_to_vec(v.labels, &path.field("labels"))?;
632
633 let mut label_descriptions = BTreeMap::new();
634 if let Some(map) = v.label_descriptions {
635 for (k, ep) in map {
636 let key = normalize_name(&k, &path.field("label_descriptions").key(&k))?;
637 let spec =
638 entity_property_to_spec(Some(&k), ep, &path.field("label_descriptions").key(&k))?;
639 label_descriptions.insert(key, spec);
640 }
641 }
642
643 Ok(Classification {
644 task,
645 labels,
646 threshold,
647 multi_label: v.multi_label.unwrap_or(false),
648 label_descriptions,
649 })
650}
651
652fn entity_acquired_to_2(
653 v: IngestEntityAcquired,
654 path: &Path,
655) -> Result<RelationAcquired, SchemaNormalizeError> {
656 match (v.head, v.tail) {
657 (IngestEntity::Stringish(s1), IngestEntity::Stringish(s2))
658 if s1.is_empty() && s2.is_empty() =>
659 {
660 Ok(RelationAcquired::Empty)
661 }
662 (h, t) => Ok(RelationAcquired::Entity {
663 head: Box::new(entity_1_to_spec(h, &path.field("head"))?),
664 tail: Box::new(entity_1_to_spec(t, &path.field("tail"))?),
665 }),
666 }
667}
668
669fn relation_1_to_2(v: IngestRelation, path: &Path) -> Result<Relation, SchemaNormalizeError> {
670 match v {
671 IngestRelation::Name(name) => Ok(Relation {
672 name: normalize_name(&name, &path.field("name"))?,
673 description: None,
674 acquired: None,
675 }),
676 IngestRelation::NameDescription(map) => {
677 let (k, desc) = single_entry_map(map, path)?;
678 Ok(Relation {
679 name: normalize_name(&k, &path.key(&k))?,
680 description: Some(desc),
681 acquired: None,
682 })
683 }
684 IngestRelation::RelationEntityAcquired(map) => {
685 let (k, acq) = single_entry_map(map, path)?;
686 Ok(Relation {
687 name: normalize_name(&k, &path.key(&k))?,
688 description: None,
689 acquired: Some(entity_acquired_to_2(acq, &path.key(&k))?),
690 })
691 }
692 }
693}
694
695impl TryFrom<IngestSchema> for NormalizedSchema {
696 type Error = SchemaNormalizeError;
697
698 fn try_from(v: IngestSchema) -> Result<Self, Self::Error> {
699 let root = Path::root();
700
701 let entities = match v.entities {
702 Some(x) => entity_list_1_to_vec(x, &root.field("entities"))?,
703 None => Vec::new(),
704 };
705
706 let mut json_structures = Vec::new();
707 match v.json_structures {
708 None => {}
709 Some(IngestJsonStructureList::Single(one)) => {
710 json_structures.extend(json_structure_1_to_2(one, &root.field("json_structures"))?);
711 }
712 Some(IngestJsonStructureList::List(list)) => {
713 for (i, item) in list.into_iter().enumerate() {
714 json_structures.extend(json_structure_1_to_2(
715 item,
716 &root.field("json_structures").index(i),
717 )?);
718 }
719 }
720 }
721
722 let classifications = match v.classifications {
723 Some(list) => list
724 .into_iter()
725 .enumerate()
726 .map(|(i, cls)| classification_1_to_2(cls, &root.field("classifications").index(i)))
727 .collect::<Result<Vec<_>, _>>()?,
728 None => Vec::new(),
729 };
730
731 let relations = match v.relations {
732 Some(list) => list
733 .into_iter()
734 .enumerate()
735 .map(|(i, rel)| relation_1_to_2(rel, &root.field("relations").index(i)))
736 .collect::<Result<Vec<_>, _>>()?,
737 None => Vec::new(),
738 };
739
740 Ok(Self {
741 entities,
742 json_structures,
743 classifications,
744 relations,
745 })
746 }
747}
748
749fn ingest_error_is_unknown_field(err: &serde_json::Error) -> bool {
750 err.to_string().contains("unknown field")
751}
752
753fn unknown_field_name_in_ingest_error(err: &serde_json::Error) -> Option<String> {
755 let s = err.to_string();
756 let prefix = "unknown field `";
757 let start = s.find(prefix)? + prefix.len();
758 let rest = &s[start..];
759 let end = rest.find('`')?;
760 Some(rest[..end].to_string())
761}
762
763fn ingest_error_suggests_json_schema_root(err: &serde_json::Error) -> bool {
766 let Some(name) = unknown_field_name_in_ingest_error(err) else {
767 return false;
768 };
769 if name.starts_with('$') {
770 return true;
771 }
772 matches!(
773 name.as_str(),
774 "type"
775 | "properties"
776 | "required"
777 | "items"
778 | "title"
779 | "description"
780 | "definitions"
781 | "additionalProperties"
782 | "patternProperties"
783 | "allOf"
784 | "anyOf"
785 | "oneOf"
786 | "not"
787 )
788}
789
790impl NormalizedSchema {
791 pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, SchemaLoadError> {
796 match IngestSchema::from_json_slice(bytes) {
797 Ok(phase1) => Ok(Self::try_from(phase1)?),
798 Err(json_err) => {
799 if ingest_error_is_unknown_field(&json_err)
800 && !ingest_error_suggests_json_schema_root(&json_err)
801 {
802 return Err(SchemaLoadError::Json(json_err));
803 }
804 let schema = JSONSchemaIngestSchema::from_json_utf8(bytes)
805 .map_err(|_| SchemaLoadError::Json(json_err))?;
806 let ingest = IngestSchema::try_from(schema).map_err(|e| {
807 SchemaLoadError::Normalize(SchemaNormalizeError::NestedDecode {
808 path: "$".to_string(),
809 message: e.to_string(),
810 })
811 })?;
812 Ok(Self::try_from(ingest)?)
813 }
814 }
815 }
816
817 pub fn from_json_str(s: &str) -> Result<Self, SchemaLoadError> {
818 Self::from_json_bytes(s.as_bytes())
819 }
820}
821
/// Errors returned by the `NormalizedSchema::from_json_*` loaders.
#[derive(Debug, thiserror::Error)]
pub enum SchemaLoadError {
    /// The input could not be decoded as JSON in an accepted shape.
    #[error("json parse error: {0}")]
    Json(#[from] serde_json::Error),

    /// The input decoded but could not be normalized.
    #[error("schema normalization error: {0}")]
    Normalize(#[from] SchemaNormalizeError),
}
830
#[cfg(test)]
mod tests {
    use super::*;

    // A `::`-delimited entity string yields name, dtype, threshold, and description.
    #[test]
    fn normalized_parses_colon_delimited_entity() {
        let s = r#"{ "entities": ["Gene::str::0.95::gene symbol"] }"#;
        let schema = NormalizedSchema::from_json_str(s).unwrap();

        assert_eq!(schema.entities.len(), 1);
        let e = &schema.entities[0];
        assert_eq!(e.name.as_str(), "gene");
        assert_eq!(e.threshold, Some(0.95));
        assert_eq!(e.description.as_deref(), Some("gene symbol"));
        assert_eq!(e.dtype, Some(DType::String));
    }

    // Hyphens and underscores both end up as underscores; letters are lowercased.
    #[test]
    fn normalized_normalizes_hyphenated_and_camelish_names() {
        let s = r#"{ "entities": ["Gene-Name", "Other_Name"] }"#;
        let schema = NormalizedSchema::from_json_str(s).unwrap();

        assert_eq!(schema.entities[0].name.as_str(), "gene_name");
        assert_eq!(schema.entities[1].name.as_str(), "other_name");
    }

    // `threshold` and `cls_threshold` with different values must be rejected.
    #[test]
    fn normalized_rejects_conflicting_threshold_aliases() {
        let s = r#"
        {
            "classifications": [
                {
                    "task": "sentiment",
                    "labels": ["positive", "negative"],
                    "threshold": 0.4,
                    "cls_threshold": 0.7
                }
            ]
        }
        "#;

        let err = NormalizedSchema::from_json_str(s).unwrap_err();
        match err {
            SchemaLoadError::Normalize(SchemaNormalizeError::ConflictingThresholdAliases {
                ..
            }) => {}
            other => panic!("unexpected error: {other:?}"),
        }
    }

    // A JSON Schema document at the root is accepted through the fallback path.
    #[test]
    fn normalized_parses_json_schema_root_string() {
        let raw = r#"
        {
            "$id": "BusinessRecord",
            "type": "object",
            "required": ["business_name"],
            "properties": {
                "business_name": { "type": "string" },
                "status": { "type": "string" }
            }
        }
        "#;

        let schema = NormalizedSchema::from_json_str(raw).unwrap();
        assert_eq!(schema.json_structures.len(), 1);
        let JsonStructure::NameKeyedStructure { name, props } = &schema.json_structures[0] else {
            panic!("expected name-keyed structure");
        };
        assert_eq!(name.as_str(), "businessrecord");
        assert!(props.contains_key(&ExpandedName::new("business_name".into())));
        assert!(props.contains_key(&ExpandedName::new("status".into())));
    }

    // An unknown top-level key that is not a JSON Schema keyword is a JSON error.
    #[test]
    fn normalized_rejects_unknown_top_level_ie_key() {
        let s = r#"{ "entities": ["gene"], "extra_root": 1 }"#;
        let err = NormalizedSchema::from_json_str(s).expect_err("unknown IE key");
        match err {
            SchemaLoadError::Json(_) => {}
            other => panic!("unexpected error: {other:?}"),
        }
    }

    // A dict-form entity must have exactly one key.
    #[test]
    fn normalized_rejects_multi_key_single_entity_dict() {
        let s = r#"
        {
            "entities": [
                {
                    "gene": "desc",
                    "protein": "desc2"
                }
            ]
        }
        "#;

        let err = NormalizedSchema::from_json_str(s).unwrap_err();
        match err {
            SchemaLoadError::Normalize(SchemaNormalizeError::ExpectedSingleKey { .. }) => {}
            other => panic!("unexpected error: {other:?}"),
        }
    }

    // Named-structure names and property keys are normalized like entity names.
    #[test]
    fn normalized_named_structure_normalizes_property_keys() {
        let s = r#"
        {
            "json_structures": [
                {
                    "name": "Patient Record",
                    "Field-A": {
                        "description": "field a",
                        "dtype": "str"
                    },
                    "Field B": {
                        "choices": ["x", "y::str::0.5::label"]
                    }
                }
            ]
        }
        "#;

        let schema = NormalizedSchema::from_json_str(s).unwrap();
        let JsonStructure::NamedStructure(ns) = &schema.json_structures[0] else {
            panic!("expected named structure");
        };

        assert_eq!(ns.name.as_str(), "patient_record");
        assert!(ns.props.keys().any(|k| k.as_str() == "field_a"));
        assert!(ns.props.keys().any(|k| k.as_str() == "field_b"));
    }

    // Two property keys that normalize to the same name are rejected.
    #[test]
    fn normalized_rejects_duplicate_normalized_structure_property_names() {
        let s = r#"
        {
            "json_structures": [
                {
                    "name": "Patient Record",
                    "Field-A": { "dtype": "str" },
                    "Field A": { "dtype": "str" }
                }
            ]
        }
        "#;

        let err = NormalizedSchema::from_json_str(s).unwrap_err();
        match err {
            SchemaLoadError::Normalize(SchemaNormalizeError::DuplicateNormalizedName {
                name,
                ..
            }) => assert_eq!(name, "field_a"),
            other => panic!("unexpected error: {other:?}"),
        }
    }
}