schema_analysis/
schema.rs

1use std::collections::BTreeMap;
2
3use serde::{Deserialize, Serialize};
4
5use crate::{
6    context::{
7        BooleanContext, BytesContext, MapStructContext, NullContext, NumberContext,
8        SequenceContext, StringContext,
9    },
10    Coalesce, StructuralEq,
11};
12
/// This enum is the core output of the analysis, it describes the structure of a document.
///
/// Each variant also contains [context](crate::context) data that allows it to store information
/// about the values it has encountered.
///
/// When (de)serialized through serde the enum is internally tagged: the name of the variant is
/// stored under the `type` key, next to the content of the variant.
///
/// Equality is implemented by hand in this file (it is not derived) so that two
/// [Union](Schema::Union)s are compared after sorting their alternatives by variant.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
pub enum Schema {
    /// The Null variant is a special one that is only ever found when a document has a single
    /// null value at the root of the document.
    /// Null values in [Struct](Schema::Struct)s or [Sequence](Schema::Sequence)s are instead
    /// handled at the [Field] level, where it is more ergonomic.
    Null(NullContext),
    /// Represents a boolean value.
    Boolean(BooleanContext),
    /// Represents an integer value (aggregated as `i128`).
    Integer(NumberContext<i128>),
    /// Represents a floating point value (aggregated as `f64`).
    Float(NumberContext<f64>),
    /// Represents a textual value.
    String(StringContext),
    /// Represents a value of raw bytes.
    Bytes(BytesContext),
    /// Represents a sequence of values described by a [Field].
    /// It assumes all values share the same schema.
    Sequence {
        /// The field is the structure shared by all the elements of the sequence.
        /// It is boxed because [Field] itself contains a [Schema].
        field: Box<Field>,
        /// The context aggregates information about the sequence.
        /// It is passed the length of the sequence.
        context: SequenceContext,
    },
    /// Represents a [String]->[Field] mapping.
    ///
    /// Note: currently there is not a true map and only strings may be used as keys.
    Struct {
        /// Each [String] key gets assigned a [Field].
        /// Currently we are using a [BTreeMap], but that might change in the future.
        fields: BTreeMap<String, Field>,
        /// The context aggregates information about the struct.
        /// It is passed a vector of the key names.
        context: MapStructContext,
    },
    /// Simply a vector of [Schema]s. It should never contain a nested Union, nor multiple
    /// instances of the same variant.
    ///
    /// Note: the content needs to be a struct variant (rather than a tuple variant wrapping the
    /// vector) to work with `#[serde(tag = "type")]`.
    Union {
        /// A list of the possible schemas that were found.
        variants: Vec<Schema>,
    },
    // Placeholders, presumably for variants that may be added later:
    // Tuple(..),
    // Map(..),
}
66
/// A [Field] is a useful abstraction to record metadata that does not belong or would be unwieldy
/// to place into the [Schema] and to account for cases in which the existence of a [Field] might be
/// known, but nothing is known about its shape.
///
/// Both members are marked `#[serde(flatten)]`, so when (de)serialized their keys appear directly
/// on the field rather than under `status`/`schema`.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct Field {
    /// The status holds information on the field, like whether it might be null or
    /// missing altogether. Duplicate fields are also recorded.
    #[serde(flatten)]
    pub status: FieldStatus,
    /// The inner Schema is optional because we might have no information on the shape of the field
    /// (like for an empty array).
    #[serde(flatten)]
    pub schema: Option<Schema>,
}
81
/// The FieldStatus keeps track of what kind of values a [Field] has been found to have.
///
/// Every flag starts out as `false` (the derived [Default]) and is only ever switched on as
/// more values are observed or statuses are coalesced.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default, Serialize, Deserialize)]
pub struct FieldStatus {
    /// The [Field] has been found to be [None] or of the unit type `()`.
    pub may_be_null: bool,
    /// The [Field] has been found to be a normal value, where normal means
    /// any valid value for the [Schema] associated with the [Field].
    pub may_be_normal: bool,
    /// The [Field] was found only on some [Struct](Schema::Struct)s or the
    /// [Sequence](Schema::Sequence) to which it belongs might be empty (if only empty sequences
    /// are found, then the [Schema] in the [Field] will also be [None]).
    pub may_be_missing: bool,
    /// Sometimes a field might appear more than once in the same [Struct](Schema::Struct).
    /// In that case all instances are considered, but this flag is also enabled.
    /// This is useful to spot suspicious data, but also to detect sequences in xml files.
    /// See [here](crate::helpers::xml) for more info.
    pub may_be_duplicate: bool,
}
100
101//
102// Schema implementations
103//
104impl StructuralEq for Schema {
105    fn structural_eq(&self, other: &Self) -> bool {
106        use Schema::*;
107        match (self, other) {
108            (Null(_), Null(_)) => true,
109            (Boolean(_), Boolean(_)) => true,
110            (Integer(_), Integer(_)) => true,
111            (Float(_), Float(_)) => true,
112            (String(_), String(_)) => true,
113            (Bytes(_), Bytes(_)) => true,
114
115            (Sequence { field: field_1, .. }, Sequence { field: field_2, .. }) => {
116                field_1.structural_eq(field_2)
117            }
118
119            (
120                Struct {
121                    fields: fields_1, ..
122                },
123                Struct {
124                    fields: fields_2, ..
125                },
126            ) => fields_1.structural_eq(fields_2),
127
128            (Union { variants: s }, Union { variants: o }) => {
129                let mut s = s.clone();
130                let mut o = o.clone();
131                s.sort_by(schema_cmp);
132                o.sort_by(schema_cmp);
133                s.structural_eq(&o)
134            }
135
136            // Listing these out makes sure it fails if new variants are added.
137            (Null(_), _)
138            | (Boolean(_), _)
139            | (Integer(_), _)
140            | (Float(_), _)
141            | (String(_), _)
142            | (Bytes(_), _)
143            | (Sequence { .. }, _)
144            | (Struct { .. }, _)
145            | (Union { .. }, _) => false,
146        }
147    }
148}
149impl Coalesce for Schema {
150    fn coalesce(&mut self, other: Self) {
151        use Schema::*;
152        match (self, other) {
153            (Boolean(s), Boolean(o)) => s.coalesce(o),
154            (Integer(s), Integer(o)) => s.coalesce(o),
155            (Float(s), Float(o)) => s.coalesce(o),
156            (String(s), String(o)) => s.coalesce(o),
157            (Bytes(s), Bytes(o)) => s.coalesce(o),
158
159            (
160                Sequence {
161                    field: self_boxed,
162                    context: self_agg,
163                },
164                Sequence {
165                    field: other_boxed,
166                    context: other_agg,
167                },
168            ) => {
169                self_agg.coalesce(other_agg);
170                self_boxed.coalesce(*other_boxed);
171            }
172
173            (
174                Struct {
175                    fields: self_fields,
176                    context: self_agg,
177                },
178                Struct {
179                    fields: other_fields,
180                    context: other_agg,
181                },
182            ) => {
183                self_agg.coalesce(other_agg);
184                for (name, other_schema) in other_fields {
185                    self_fields
186                        .entry(name)
187                        .and_modify(|schema| schema.coalesce(other_schema.clone()))
188                        .or_insert_with(|| other_schema);
189                }
190            }
191            (
192                Union {
193                    variants: self_alternatives,
194                },
195                Union {
196                    variants: other_alternatives,
197                },
198            ) => coalesce_unions(self_alternatives, other_alternatives),
199            (
200                Union {
201                    variants: self_alternatives,
202                },
203                any_other,
204            ) => coalesce_to_alternatives(self_alternatives, any_other),
205            (
206                any_self,
207                Union {
208                    variants: mut other_alternatives,
209                },
210            ) => {
211                let self_original = std::mem::replace(any_self, Schema::Null(Default::default()));
212                coalesce_to_alternatives(&mut other_alternatives, self_original);
213                *any_self = Schema::Union {
214                    variants: other_alternatives,
215                };
216            }
217
218            (any_self, any_other) => {
219                let self_original = std::mem::replace(any_self, Schema::Null(Default::default()));
220                *any_self = Union {
221                    variants: vec![self_original, any_other],
222                };
223            }
224        };
225        return;
226
227        fn coalesce_unions(selfs: &mut Vec<Schema>, others: Vec<Schema>) {
228            for o in others {
229                coalesce_to_alternatives(selfs, o);
230            }
231        }
232
233        /// This function attempts to match the incomming schema against all the
234        /// alternatives already present, and if it fails it pushes it to the vector as a
235        /// new alternative.
236        fn coalesce_to_alternatives(alternatives: &mut Vec<Schema>, mut other: Schema) {
237            use Schema::*;
238            for s in alternatives.iter_mut() {
239                match (s, other) {
240                    // Nested unions should never happen.
241                    // It is the job of the root impl of Coalesce for Schema to guarantee this.
242                    (Union { .. }, _) | (_, Union { .. }) => {
243                        unreachable!("nested union")
244                    }
245
246                    // If they are the same, go ahead and coalesce!
247                    (Boolean(s), Boolean(o)) => {
248                        s.coalesce(o);
249                        return;
250                    }
251                    (Integer(s), Integer(o)) => {
252                        s.coalesce(o);
253                        return;
254                    }
255                    (Float(s), Float(o)) => {
256                        s.coalesce(o);
257                        return;
258                    }
259                    (String(s), String(o)) => {
260                        s.coalesce(o);
261                        return;
262                    }
263                    (Bytes(s), Bytes(o)) => {
264                        s.coalesce(o);
265                        return;
266                    }
267
268                    (
269                        Sequence {
270                            field: self_boxed,
271                            context: self_agg,
272                        },
273                        Sequence {
274                            field: other_boxed,
275                            context: other_agg,
276                        },
277                    ) => {
278                        self_agg.coalesce(other_agg);
279                        self_boxed.coalesce(*other_boxed);
280                        return;
281                    }
282
283                    (
284                        Struct {
285                            fields: self_fields,
286                            context: self_agg,
287                        },
288                        Struct {
289                            fields: other_fields,
290                            context: other_agg,
291                        },
292                    ) => {
293                        self_agg.coalesce(other_agg);
294                        for (name, other_schema) in other_fields {
295                            self_fields
296                                .entry(name)
297                                .and_modify(|schema| schema.coalesce(other_schema.clone()))
298                                .or_insert_with(|| other_schema);
299                        }
300                        return;
301                    }
302
303                    // If they don't match just continue ahead to the next one.
304                    (_, caught_other) => {
305                        other = caught_other;
306                    }
307                }
308            }
309
310            // If we were unable to find a match, push the schema to the alternatives:
311            alternatives.push(other);
312        }
313    }
314}
315impl PartialEq for Schema {
316    fn eq(&self, other: &Self) -> bool {
317        use Schema::*;
318        match (self, other) {
319            (Null(s), Null(o)) => s == o,
320            (Boolean(s), Boolean(o)) => s == o,
321            (Integer(s), Integer(o)) => s == o,
322            (Float(s), Float(o)) => s == o,
323            (String(s), String(o)) => s == o,
324            (Bytes(s), Bytes(o)) => s == o,
325
326            (
327                Sequence {
328                    field: field_1,
329                    context: context_1,
330                },
331                Sequence {
332                    field: field_2,
333                    context: context_2,
334                },
335            ) => field_1 == field_2 && context_1 == context_2,
336
337            (
338                Struct {
339                    fields: fields_1,
340                    context: context_1,
341                },
342                Struct {
343                    fields: fields_2,
344                    context: context_2,
345                },
346            ) => fields_1 == fields_2 && context_1 == context_2,
347
348            (Union { variants: s }, Union { variants: o }) => {
349                let mut s = s.clone();
350                let mut o = o.clone();
351                s.sort_by(schema_cmp);
352                o.sort_by(schema_cmp);
353                s == o
354            }
355
356            // Listing these out makes sure it fails if new variants are added.
357            (Null(_), _)
358            | (Boolean(_), _)
359            | (Integer(_), _)
360            | (Float(_), _)
361            | (String(_), _)
362            | (Bytes(_), _)
363            | (Sequence { .. }, _)
364            | (Struct { .. }, _)
365            | (Union { .. }, _) => false,
366        }
367    }
368}
369
370//
371// Field implementations
372//
373impl Field {
374    /// Returns a [Field] with the given [Schema] and default [FieldStatus].
375    pub fn with_schema(schema: Schema) -> Self {
376        Self {
377            status: FieldStatus::default(),
378            schema: Some(schema),
379        }
380    }
381}
382impl Coalesce for Field {
383    fn coalesce(&mut self, other: Self)
384    where
385        Self: Sized,
386    {
387        self.status.coalesce(other.status);
388        self.schema = match (self.schema.take(), other.schema) {
389            (Some(mut s), Some(o)) => {
390                s.coalesce(o);
391                Some(s)
392            }
393            (Some(s), None) => Some(s),
394            (None, Some(o)) => Some(o),
395            (None, None) => None,
396        }
397    }
398}
399impl StructuralEq for Field {
400    fn structural_eq(&self, other: &Self) -> bool {
401        self.status == other.status && self.schema.structural_eq(&other.schema)
402    }
403}
404
405//
406// FieldStatus implementations
407//
408impl FieldStatus {
409    /// If the value passed is true, then the status will allow duplicates.
410    /// Otherwise no changes are made.
411    pub fn allow_duplicates(&mut self, is_duplicate: bool) {
412        self.may_be_duplicate |= is_duplicate;
413    }
414    /// `true` if the status allows for null or missing values.
415    pub fn is_option(&self) -> bool {
416        self.may_be_null || self.may_be_missing
417    }
418}
419impl Coalesce for FieldStatus {
420    fn coalesce(&mut self, other: Self)
421    where
422        Self: Sized,
423    {
424        self.may_be_null |= other.may_be_null;
425        self.may_be_normal |= other.may_be_normal;
426        self.may_be_missing |= other.may_be_missing;
427        self.may_be_duplicate |= other.may_be_duplicate;
428    }
429}
430
431//
432// Helper functions
433//
434
435/// A helper function that returns the [Ordering](std::cmp::Ordering) of two [Schema]s
436/// to help in comparing two [Schema::Union].
437/// Since a [Schema::Union] should never hold two schemas of the same type, it is enough to
438/// just compare the top level without recursion.
439fn schema_cmp(first: &Schema, second: &Schema) -> std::cmp::Ordering {
440    use std::cmp::Ordering::*;
441    use Schema::*;
442    match first {
443        Null(_) => match second {
444            Null(_) => Equal,
445            _ => Less,
446        },
447        Boolean(_) => match second {
448            Null(_) | Boolean(_) => Equal,
449            _ => Less,
450        },
451        Integer(_) => match second {
452            Null(_) | Boolean(_) => Greater,
453            Integer(_) => Equal,
454            _ => Less,
455        },
456        Float(_) => match second {
457            Null(_) | Boolean(_) | Integer(_) => Greater,
458            Float(_) => Equal,
459            _ => Less,
460        },
461        String(_) => match second {
462            Null(_) | Boolean(_) | Integer(_) | Float(_) => Greater,
463            String(_) => Equal,
464            _ => Less,
465        },
466        Bytes(_) => match second {
467            Null(_) | Boolean(_) | Integer(_) | Float(_) | String(_) => Greater,
468            Bytes(_) => Equal,
469            _ => Less,
470        },
471        Sequence { .. } => match second {
472            Null(_) | Boolean(_) | Integer(_) | Float(_) | String(_) | Bytes(_) => Greater,
473            Sequence { .. } => Equal,
474            _ => Less,
475        },
476        Struct { .. } => match second {
477            Null(_)
478            | Boolean(_)
479            | Integer(_)
480            | Float(_)
481            | String(_)
482            | Bytes(_)
483            | Sequence { .. } => Greater,
484            Struct { .. } => Equal,
485            _ => Less,
486        },
487        Union { .. } => match second {
488            Null(_)
489            | Boolean(_)
490            | Integer(_)
491            | Float(_)
492            | String(_)
493            | Bytes(_)
494            | Sequence { .. }
495            | Struct { .. } => Greater,
496            Union { .. } => Equal,
497        },
498    }
499}