substrait_validator/output/
tree.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Module for the output tree structure.
4//!
5//! This module provides the types for the tree structure that constitutes
6//! the output of the validator. The nodes in the tree are intended to
7//! correspond exactly to the protobuf messages, primitives, and YAML values
8//! (the latter actually using the JSON object model) that constitute the
9//! incoming plan. Likewise, the structure of the tree is the same as the
10//! input. However, unlike the input:
11//!
12//!  - All nodes and the relations between them are encapsulated in generic
13//!    types, independent from the corresponding messages/values in the
14//!    original tree. This allows the tree to be traversed by generic code
15//!    with no understanding of Substrait.
16//!  - Additional information can be attached to the nodes, edges, and
17//!    between the edges, such as diagnostic messages and data type
18//!    information.
19//!
20//! The node type for the output trees is [`Node`]. This structure contains
21//! a single [`NodeType`] enum variant and zero or more [`NodeData`] enum
22//! variants in an ordered sequence to form the tree structure; [`NodeType`]
23//! includes information about the node itself, while the [`NodeData`]
24//! elements represent edges to other nodes ([`Child`]) or contextual
25//! information. A subtree might look something like this:
26//!
27//! ```text
28//!                 Node ---> ProtoMessage                   } Parent node
29//!                  |
30//!   .--------------'--------------.
31//!   |         |         |         |
32//!   v         v         v         v
33//! Child  Diagnostic  Comment    Child                      } Edges
34//!   |                             |
35//!   v                             v
36//! Node ---> ProtoPrimitive      Node ---> ProtoMessage     } Child nodes
37//!            |                    |
38//!            '-> PrimitiveData    :
39//! ```
40//!
41//! Note that the [`Child`] struct includes information about how the child
42//! node relates to its parent (which field, array element, etc) via
43//! [`PathElement`](path::PathElement), such that the original tree structure
44//! could in theory be completely reconstructed.
45//!
46//! Nevertheless, the conversion from protobuf/YAML to this tree structure is
47//! only intended to be a one-way street; indeed, the output tree is not
48//! intended to ever be treated as some executable query plan by a computer at
49//! all. It serves only as an intermediate format for documentation, debug,
50//! and/or validation output. The [export](mod@crate::export) module deals with
51//! breaking this internal representation down further, into (file) formats
52//! that are not specific to the Substrait validator.
53
54use crate::output::comment;
55use crate::output::diagnostic;
56use crate::output::path;
57use crate::output::primitive_data;
58use crate::output::type_system::data;
59use std::collections::VecDeque;
60use std::sync::Arc;
61
62/// Node for a semi-structured documentation-like tree representation of a
63/// parsed Substrait plan. The intention is for this to be serialized into
64/// some human-readable format.
65///
66/// Note: although it should be possible to reconstruct the entire plan from
67/// the information contained in the tree, the tree is only intended to be
68/// converted to structured human-readable documentation for the plan. It is
69/// expressly NOT intended to be read as a form of AST by a downstream
70/// process, and therefore isn't nearly as strictly-typed as you would
71/// otherwise want it to be. Protobuf itself is already a reasonable format
72/// for this!
73#[derive(Clone, Debug, PartialEq)]
74pub struct Node {
75    /// The type of a node in terms of plan semantics.
76    pub class: Class,
77
78    /// An optional brief description of the node. This can be regarded as
79    /// a comment placed at the start of the data vector, but it is usually
80    /// only set at the end of the parse function.
81    pub brief: Option<comment::Brief>,
82
83    /// An optional comment summarizing what this node does. This can be
84    /// regarded as a comment placed at the start of the data vector (just
85    /// after brief, if brief is also defined), but it is usually only set
86    /// at the end of the parse function.
87    pub summary: Option<comment::Comment>,
88
89    /// The type of node in terms of what it represents in the original
90    /// data structure.
91    pub node_type: NodeType,
92
93    /// The type of data returned by this node, if any. Depending on the
94    /// message and context, this may represent a table schema or scalar
95    /// data.
96    pub data_type: Option<data::Type>,
97
98    /// The information gathered about the message.
99    ///
100    /// This normally includes all child nodes for this message, possibly
101    /// interspersed with diagnostics, type information, and unstructured
102    /// comment nodes to provide context, all ordered in a reasonable way.
103    /// Note however that this information is intended to be understood by
104    /// a human, not by the validator itself (aside from serialization to a
105    /// human-readable notation).
106    pub data: Vec<NodeData>,
107}
108
109impl From<NodeType> for Node {
110    fn from(node_type: NodeType) -> Self {
111        Node {
112            class: Class::Misc,
113            brief: None,
114            summary: None,
115            node_type,
116            data_type: None,
117            data: vec![],
118        }
119    }
120}
121
122impl Node {
123    /// Returns an iterator that iterates over all nodes depth-first.
124    pub fn iter_flattened_nodes(&self) -> FlattenedNodeIter {
125        FlattenedNodeIter {
126            remaining: VecDeque::from(vec![self]),
127        }
128    }
129
130    /// Returns an iterator that iterates over all NodeData objects in the
131    /// order in which they were defined.
132    pub fn iter_flattened_node_data(&self) -> FlattenedNodeDataIter {
133        FlattenedNodeDataIter {
134            remaining: self.data.iter().rev().collect(),
135        }
136    }
137
138    /// Iterates over all diagnostics in the tree.
139    pub fn iter_diagnostics(&self) -> impl Iterator<Item = &diagnostic::Diagnostic> + '_ {
140        self.iter_flattened_node_data().filter_map(|x| match x {
141            NodeData::Diagnostic(d) => Some(d),
142            _ => None,
143        })
144    }
145
146    /// Returns the first diagnostic of the highest severity level in the tree.
147    pub fn get_diagnostic(&self) -> Option<&diagnostic::Diagnostic> {
148        let mut result: Option<&diagnostic::Diagnostic> = None;
149        for diag in self.iter_diagnostics() {
150            // We can return immediately for error diagnostics, since this is the
151            // highest level.
152            if diag.adjusted_level == diagnostic::Level::Error {
153                return Some(diag);
154            }
155
156            // For other levels, update only if the incoming diagnostic is of a
157            // higher level/severity than the current one.
158            if let Some(cur) = result.as_mut() {
159                if diag.adjusted_level > cur.adjusted_level {
160                    *cur = diag;
161                }
162            } else {
163                result = Some(diag);
164            }
165        }
166        result
167    }
168
169    /// Returns a reference to the data type that this node returns at runtime
170    /// or (for type nodes) represents. If no type information is attached, a
171    /// reference to a default-generated unresolved type is returned.
172    pub fn data_type(&self) -> data::Type {
173        self.data_type.clone().unwrap_or_default()
174    }
175}
176
177/// The original data type that the node represents, to (in theory) allow the
178/// original structure of the plan to be recovered from the documentation tree.
179#[derive(Clone, Debug, PartialEq)]
180pub enum NodeType {
181    /// The associated node represents a protobuf message of the given type
182    /// (full protobuf path). The contents of the message are described using
183    /// Field, RepeatedField, and OneOfField.
184    ProtoMessage(&'static str),
185
186    /// The associated node represents a protobuf primitive value of the given
187    /// type and with the given data.
188    ProtoPrimitive(&'static str, primitive_data::PrimitiveData),
189
190    /// The associated node represents an unpopulated oneof field. This is used
191    /// for an error recovery node when a required oneof field is not
192    /// populated.
193    ProtoMissingOneOf,
194
195    /// Used for anchor/reference-based references to other nodes.
196    NodeReference(u64, NodeReference),
197
198    /// The associated node represents a YAML map. The contents of the map are
199    /// described using Field and UnknownField.
200    YamlMap,
201
202    /// The associated node represents a YAML array. The contents of the array
203    /// are described using ArrayElement datums.
204    YamlArray,
205
206    /// The associated node represents a YAML primitive.
207    YamlPrimitive(primitive_data::PrimitiveData),
208
209    /// Used for string primitives that were resolved as URIs. The node will
210    /// have a child named "data" with the validation tree of the resolved
211    /// data.
212    ResolvedUri(String),
213
214    /// The associated node represents a node in an abstract syntax tree parsed
215    /// from a string.
216    AstNode,
217}
218
219/// Semantical information about a node.
220#[derive(Clone, Copy, Debug, PartialEq, Eq)]
221pub enum Class {
222    /// Used for nodes for which no better classification exists.
223    Misc,
224
225    /// Used for nodes that define a type. The data_type field signifies this
226    /// data type.
227    Type,
228
229    /// Used for nodes that represent scalar expressions or literals. The
230    /// data_type field signifies the type of the value returned by the
231    /// expression.
232    Expression,
233
234    /// Used for nodes that represent relations. The data_type field signifies
235    /// the schema for the data returned by the relation.
236    Relation,
237}
238
239/// Information nodes for a parsed protobuf message.
240#[derive(Clone, Debug, PartialEq)]
241pub enum NodeData {
242    /// A reference to a child node in the tree.
243    Child(Child),
244
245    /// Indicates that parsing/validating this message resulted in some
246    /// diagnostic message being emitted. The secondary error level is the
247    /// modified level via
248    Diagnostic(diagnostic::Diagnostic),
249
250    /// Provides (intermediate) type information for this node. Depending on
251    /// the message, this may be a struct or named struct representing a
252    /// schema, or it may represent the type of some scalar expression.
253    /// Multiple TypeInfo nodes may be present, in particular for relations
254    /// that perform multiple operations in one go (for example read, project,
255    /// emit). The TypeInfo and operation description *Field nodes are then
256    /// ordered by data flow. In particular, the last TypeInfo node always
257    /// represents the type of the final result of a node.
258    DataType(data::Type),
259
260    /// Used for adding unstructured additional information to a message,
261    /// wherever this may aid human understanding of a message.
262    Comment(comment::Comment),
263}
264
265/// Reference to a child node in the tree.
266#[derive(Clone, Debug, PartialEq)]
267pub struct Child {
268    /// Path element identifying the relation of this child node to its parent.
269    pub path_element: path::PathElement,
270
271    /// The child node.
272    pub node: Arc<Node>,
273
274    /// Whether the validator recognized/expected the field or element that
275    /// this child represents. Fields/elements may be unrecognized simply
276    /// because validation is not implemented for them yet. In any case, this
277    /// flag indicates that the subtree represented by this node could not be
278    /// validated.
279    pub recognized: bool,
280}
281
282/// A reference to a node elsewhere in the tree.
283#[derive(Clone, Debug, PartialEq)]
284pub struct NodeReference {
285    /// Absolute path to the node.
286    pub path: path::PathBuf,
287
288    /// Link to the node.
289    pub node: Arc<Node>,
290}
291
292pub struct FlattenedNodeIter<'a> {
293    remaining: VecDeque<&'a Node>,
294}
295impl<'a> Iterator for FlattenedNodeIter<'a> {
296    type Item = &'a Node;
297
298    fn next(&mut self) -> Option<Self::Item> {
299        let maybe_node = self.remaining.pop_back();
300        if let Some(node) = maybe_node {
301            self.remaining
302                .extend(node.data.iter().rev().filter_map(|x| -> Option<&Node> {
303                    if let NodeData::Child(child) = x {
304                        Some(&child.node)
305                    } else {
306                        None
307                    }
308                }));
309        }
310        maybe_node
311    }
312}
313
314pub struct FlattenedNodeDataIter<'a> {
315    remaining: VecDeque<&'a NodeData>,
316}
317
318impl<'a> Iterator for FlattenedNodeDataIter<'a> {
319    type Item = &'a NodeData;
320
321    fn next(&mut self) -> Option<Self::Item> {
322        let maybe_node_data = self.remaining.pop_back();
323        if let Some(NodeData::Child(child)) = maybe_node_data {
324            self.remaining.extend(child.node.data.iter().rev())
325        }
326        maybe_node_data
327    }
328}