1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
//! A module for any useful helper functions.
pub mod xml {
//! A module for xml cleaning helper functions.
//! Check individual functions for details.
use std::mem;
use crate::{Field, Schema, context::Context};
/// A wrapper function that applies all XML cleaning transformations.
///
/// [clean_solitary_nested_values]
/// + [turn_duplicates_into_sequence_field]
/// + [clean_empty_structs_in_field]
pub fn cleanup_xml_schema<C: Context + Default>(schema: &mut Schema<C>) {
clean_solitary_nested_values(schema);
turn_duplicates_into_sequence_field(schema);
clean_empty_structs_in_field(schema);
}
/// XML documents often result in uselessly nested values because the 'content' of a tag
/// that isn't itself a tag is put into the `$value` field.
///
/// This function simply finds [Schema::Struct]s with a single field named `$value` and
/// replaces them with the schema inside the `$value` field.
pub fn clean_solitary_nested_values<C: Context>(schema: &mut Schema<C>) {
use Schema::*;
match schema {
Null(_) | Boolean(_) | Integer(_) | Float(_) | String(_) | Bytes(_) => {}
Sequence { field, .. } => {
if let Some(schema) = &mut field.schema {
clean_solitary_nested_values(schema)
}
}
Struct { fields, .. } => {
// If the only field is $value, then we 'bring it up'.
if fields.len() == 1 && fields.contains_key("$value") {
if let Some(Field {
schema: Some(inner_schema),
..
}) = fields.remove("$value")
{
*schema = inner_schema;
}
} else {
for (_, field) in fields.iter_mut() {
if let Some(schema) = &mut field.schema {
clean_solitary_nested_values(schema)
}
}
}
}
Union { variants } => {
for value in variants {
clean_solitary_nested_values(value);
}
}
}
}
/// XML documents do not have proper sequences, and an 'array' or 'list' is simply
/// represented as a tag appearing multiple times.
///
/// To help with this the inference software annotates duplicate fields, and this function
/// takes the schema in that field and places it into a [Schema::Sequence].
pub fn turn_duplicates_into_sequence_field<C: Context + Default>(schema: &mut Schema<C>) {
clean_field_recursively(schema, _inner_field_cleaning);
fn _inner_field_cleaning<C: Context + Default>(field: &mut Field<C>) {
if let Some(schema) = &mut field.schema {
clean_field_recursively(schema, _inner_field_cleaning)
}
// In xml, sequences are simply registered as a field appearing more than once,
// the parser records this but now we need to move the duplicate field into its own sequence.
if field.status.may_be_duplicate {
*field = Field {
status: field.status.clone(),
schema: Some(Schema::Sequence {
field: Box::new(mem::take(field)),
context: C::Sequence::default(),
}),
};
field.status.may_be_duplicate = false;
}
}
}
/// When a tag is empty, the parser interprets it as as an empty [Schema::Struct].
///
/// This function replaces those fields with empty [Schema::Struct] with fields of
/// unknown schema.
pub fn clean_empty_structs_in_field<C: Context>(schema: &mut Schema<C>) {
clean_field_recursively(schema, _inner_field_cleaning);
fn _inner_field_cleaning<C: Context>(field: &mut Field<C>) {
match &mut field.schema {
Some(Schema::Struct { fields, .. }) if fields.is_empty() => {
field.schema = None;
}
Some(schema) => clean_field_recursively(schema, _inner_field_cleaning),
None => {}
}
}
}
fn clean_field_recursively<C: Context>(schema: &mut Schema<C>, clean_field: fn(&mut Field<C>)) {
use Schema::*;
match schema {
Null(_) | Boolean(_) | Integer(_) | Float(_) | String(_) | Bytes(_) => {}
Schema::Sequence { field, .. } => clean_field(field),
Schema::Struct { fields, .. } => {
for (_, field) in fields.iter_mut() {
clean_field(field);
}
}
Schema::Union { variants } => {
for value in variants {
clean_field_recursively(value, clean_field);
}
}
}
}
}