schema_analysis/helpers.rs
1//! A module for any useful helper functions.
2
3pub mod xml {
4 //! A module for xml cleaning helper functions.
5 //! Check individual functions for details.
6
7 use crate::{Field, Schema};
8
9 /// A wrapper function that applies all XML cleaning transformations.
10 ///
11 /// [clean_solitary_nested_values]
12 /// + [turn_duplicates_into_sequence_field]
13 /// + [clean_empty_structs_in_field]
14 pub fn cleanup_xml_schema(schema: &mut Schema) {
15 clean_solitary_nested_values(schema);
16 turn_duplicates_into_sequence_field(schema);
17 clean_empty_structs_in_field(schema);
18 }
19
20 /// XML documents often result in uselessly nested values because the 'content' of a tag
21 /// that isn't itself a tag is put into the `$value` field.
22 ///
23 /// This function simply finds [Schema::Struct]s with a single field named `$value` and
24 /// replaces them with the schema inside the `$value` field.
25 pub fn clean_solitary_nested_values(schema: &mut Schema) {
26 use Schema::*;
27 match schema {
28 Null(_) | Boolean(_) | Integer(_) | Float(_) | String(_) | Bytes(_) => {}
29 Sequence { field, .. } => match &mut field.schema {
30 Some(schema) => clean_solitary_nested_values(schema),
31 None => {}
32 },
33 Struct { fields, .. } => {
34 // If the only field is $value, then we 'bring it up'.
35 if fields.len() == 1 && fields.contains_key("$value") {
36 if let Some(Field {
37 schema: Some(inner_schema),
38 ..
39 }) = fields.remove("$value")
40 {
41 *schema = inner_schema;
42 }
43 } else {
44 for (_, field) in fields.iter_mut() {
45 match &mut field.schema {
46 Some(schema) => clean_solitary_nested_values(schema),
47 None => {}
48 }
49 }
50 }
51 }
52 Union { variants } => {
53 for value in variants {
54 clean_solitary_nested_values(value);
55 }
56 }
57 }
58 }
59
60 /// XML documents do not have proper sequences, and an 'array' or 'list' is simply
61 /// represented as a tag appearing multiple times.
62 ///
63 /// To help with this the inference software annotates duplicate fields, and this function
64 /// takes the schema in that field and places it into a [Schema::Sequence].
65 pub fn turn_duplicates_into_sequence_field(schema: &mut Schema) {
66 clean_field_recursively(schema, _inner_field_cleaning);
67
68 fn _inner_field_cleaning(field: &mut Field) {
69 match &mut field.schema {
70 Some(schema) => clean_field_recursively(schema, _inner_field_cleaning),
71 None => {}
72 }
73 // In xml, sequences are simply registered as a field appearing more than once,
74 // the parser records this but now we need to move the duplicate field into its own sequence.
75 if field.status.may_be_duplicate {
76 *field = Field {
77 status: field.status.clone(),
78 schema: Some(Schema::Sequence {
79 field: Box::new(field.clone()),
80 context: Default::default(),
81 }),
82 };
83 field.status.may_be_duplicate = false;
84 }
85 }
86 }
87
88 /// When a tag is empty, the parser interprets it as as an empty [Schema::Struct].
89 ///
90 /// This function replaces those fields with empty [Schema::Struct] with fields of
91 /// unknown schema.
92 pub fn clean_empty_structs_in_field(schema: &mut Schema) {
93 clean_field_recursively(schema, _inner_field_cleaning);
94
95 fn _inner_field_cleaning(field: &mut Field) {
96 match &mut field.schema {
97 Some(Schema::Struct { fields, .. }) if fields.is_empty() => {
98 field.schema = None;
99 }
100 Some(schema) => clean_field_recursively(schema, _inner_field_cleaning),
101 None => {}
102 }
103 }
104 }
105
106 fn clean_field_recursively(schema: &mut Schema, clean_field: fn(&mut Field)) {
107 use Schema::*;
108 match schema {
109 Null(_) | Boolean(_) | Integer(_) | Float(_) | String(_) | Bytes(_) => {}
110 Schema::Sequence { field, .. } => clean_field(field),
111 Schema::Struct { fields, .. } => {
112 for (_, field) in fields.iter_mut() {
113 clean_field(field);
114 }
115 }
116 Schema::Union { variants } => {
117 for value in variants {
118 clean_field_recursively(value, clean_field);
119 }
120 }
121 }
122 }
123}