substrait_validator/output/tree.rs
1// SPDX-License-Identifier: Apache-2.0
2
3//! Module for the output tree structure.
4//!
5//! This module provides the types for the tree structure that constitutes
6//! the output of the validator. The nodes in the tree are intended to
7//! correspond exactly to the protobuf messages, primitives, and YAML values
8//! (the latter actually using the JSON object model) that constitute the
9//! incoming plan. Likewise, the structure of the tree is the same as the
10//! input. However, unlike the input:
11//!
12//! - All nodes and the relations between them are encapsulated in generic
13//! types, independent from the corresponding messages/values in the
14//! original tree. This allows the tree to be traversed by generic code
15//! with no understanding of Substrait.
16//! - Additional information can be attached to the nodes, edges, and
17//! between the edges, such as diagnostic messages and data type
18//! information.
19//!
20//! The node type for the output trees is [`Node`]. This structure contains
21//! a single [`NodeType`] enum variant and zero or more [`NodeData`] enum
22//! variants in an ordered sequence to form the tree structure; [`NodeType`]
23//! includes information about the node itself, while the [`NodeData`]
24//! elements represent edges to other nodes ([`Child`]) or contextual
25//! information. A subtree might look something like this:
26//!
27//! ```text
28//! Node ---> ProtoMessage } Parent node
29//! |
30//! .--------------'--------------.
31//! | | | |
32//! v v v v
33//! Child Diagnostic Comment Child } Edges
34//! | |
35//! v v
36//! Node ---> ProtoPrimitive Node ---> ProtoMessage } Child nodes
37//! | |
38//! '-> PrimitiveData :
39//! ```
40//!
41//! Note that the [`Child`] struct includes information about how the child
42//! node relates to its parent (which field, array element, etc) via
43//! [`PathElement`](path::PathElement), such that the original tree structure
44//! could in theory be completely reconstructed.
45//!
46//! Nevertheless, the conversion from protobuf/YAML to this tree structure is
47//! only intended to be a one-way street; indeed, the output tree is not
48//! intended to ever be treated as some executable query plan by a computer at
49//! all. It serves only as an intermediate format for documentation, debug,
50//! and/or validation output. The [export](mod@crate::export) module deals with
51//! breaking this internal representation down further, into (file) formats
52//! that are not specific to the Substrait validator.
53
54use crate::output::comment;
55use crate::output::diagnostic;
56use crate::output::path;
57use crate::output::primitive_data;
58use crate::output::type_system::data;
59use std::collections::VecDeque;
60use std::sync::Arc;
61
62/// Node for a semi-structured documentation-like tree representation of a
63/// parsed Substrait plan. The intention is for this to be serialized into
64/// some human-readable format.
65///
66/// Note: although it should be possible to reconstruct the entire plan from
67/// the information contained in the tree, the tree is only intended to be
68/// converted to structured human-readable documentation for the plan. It is
69/// expressly NOT intended to be read as a form of AST by a downstream
70/// process, and therefore isn't nearly as strictly-typed as you would
71/// otherwise want it to be. Protobuf itself is already a reasonable format
72/// for this!
73#[derive(Clone, Debug, PartialEq)]
74pub struct Node {
75 /// The type of a node in terms of plan semantics.
76 pub class: Class,
77
78 /// An optional brief description of the node. This can be regarded as
79 /// a comment placed at the start of the data vector, but it is usually
80 /// only set at the end of the parse function.
81 pub brief: Option<comment::Brief>,
82
83 /// An optional comment summarizing what this node does. This can be
84 /// regarded as a comment placed at the start of the data vector (just
85 /// after brief, if brief is also defined), but it is usually only set
86 /// at the end of the parse function.
87 pub summary: Option<comment::Comment>,
88
89 /// The type of node in terms of what it represents in the original
90 /// data structure.
91 pub node_type: NodeType,
92
93 /// The type of data returned by this node, if any. Depending on the
94 /// message and context, this may represent a table schema or scalar
95 /// data.
96 pub data_type: Option<data::Type>,
97
98 /// The information gathered about the message.
99 ///
100 /// This normally includes all child nodes for this message, possibly
101 /// interspersed with diagnostics, type information, and unstructured
102 /// comment nodes to provide context, all ordered in a reasonable way.
103 /// Note however that this information is intended to be understood by
104 /// a human, not by the validator itself (aside from serialization to a
105 /// human-readable notation).
106 pub data: Vec<NodeData>,
107}
108
109impl From<NodeType> for Node {
110 fn from(node_type: NodeType) -> Self {
111 Node {
112 class: Class::Misc,
113 brief: None,
114 summary: None,
115 node_type,
116 data_type: None,
117 data: vec![],
118 }
119 }
120}
121
122impl Node {
123 /// Returns an iterator that iterates over all nodes depth-first.
124 pub fn iter_flattened_nodes(&self) -> FlattenedNodeIter {
125 FlattenedNodeIter {
126 remaining: VecDeque::from(vec![self]),
127 }
128 }
129
130 /// Returns an iterator that iterates over all NodeData objects in the
131 /// order in which they were defined.
132 pub fn iter_flattened_node_data(&self) -> FlattenedNodeDataIter {
133 FlattenedNodeDataIter {
134 remaining: self.data.iter().rev().collect(),
135 }
136 }
137
138 /// Iterates over all diagnostics in the tree.
139 pub fn iter_diagnostics(&self) -> impl Iterator<Item = &diagnostic::Diagnostic> + '_ {
140 self.iter_flattened_node_data().filter_map(|x| match x {
141 NodeData::Diagnostic(d) => Some(d),
142 _ => None,
143 })
144 }
145
146 /// Returns the first diagnostic of the highest severity level in the tree.
147 pub fn get_diagnostic(&self) -> Option<&diagnostic::Diagnostic> {
148 let mut result: Option<&diagnostic::Diagnostic> = None;
149 for diag in self.iter_diagnostics() {
150 // We can return immediately for error diagnostics, since this is the
151 // highest level.
152 if diag.adjusted_level == diagnostic::Level::Error {
153 return Some(diag);
154 }
155
156 // For other levels, update only if the incoming diagnostic is of a
157 // higher level/severity than the current one.
158 if let Some(cur) = result.as_mut() {
159 if diag.adjusted_level > cur.adjusted_level {
160 *cur = diag;
161 }
162 } else {
163 result = Some(diag);
164 }
165 }
166 result
167 }
168
169 /// Returns a reference to the data type that this node returns at runtime
170 /// or (for type nodes) represents. If no type information is attached, a
171 /// reference to a default-generated unresolved type is returned.
172 pub fn data_type(&self) -> data::Type {
173 self.data_type.clone().unwrap_or_default()
174 }
175}
176
177/// The original data type that the node represents, to (in theory) allow the
178/// original structure of the plan to be recovered from the documentation tree.
179#[derive(Clone, Debug, PartialEq)]
180pub enum NodeType {
181 /// The associated node represents a protobuf message of the given type
182 /// (full protobuf path). The contents of the message are described using
183 /// Field, RepeatedField, and OneOfField.
184 ProtoMessage(&'static str),
185
186 /// The associated node represents a protobuf primitive value of the given
187 /// type and with the given data.
188 ProtoPrimitive(&'static str, primitive_data::PrimitiveData),
189
190 /// The associated node represents an unpopulated oneof field. This is used
191 /// for an error recovery node when a required oneof field is not
192 /// populated.
193 ProtoMissingOneOf,
194
195 /// Used for anchor/reference-based references to other nodes.
196 NodeReference(u64, NodeReference),
197
198 /// The associated node represents a YAML map. The contents of the map are
199 /// described using Field and UnknownField.
200 YamlMap,
201
202 /// The associated node represents a YAML array. The contents of the array
203 /// are described using ArrayElement datums.
204 YamlArray,
205
206 /// The associated node represents a YAML primitive.
207 YamlPrimitive(primitive_data::PrimitiveData),
208
209 /// Used for string primitives that were resolved as URIs. The node will
210 /// have a child named "data" with the validation tree of the resolved
211 /// data.
212 ResolvedUri(String),
213
214 /// The associated node represents a node in an abstract syntax tree parsed
215 /// from a string.
216 AstNode,
217}
218
219/// Semantical information about a node.
220#[derive(Clone, Copy, Debug, PartialEq, Eq)]
221pub enum Class {
222 /// Used for nodes for which no better classification exists.
223 Misc,
224
225 /// Used for nodes that define a type. The data_type field signifies this
226 /// data type.
227 Type,
228
229 /// Used for nodes that represent scalar expressions or literals. The
230 /// data_type field signifies the type of the value returned by the
231 /// expression.
232 Expression,
233
234 /// Used for nodes that represent relations. The data_type field signifies
235 /// the schema for the data returned by the relation.
236 Relation,
237}
238
239/// Information nodes for a parsed protobuf message.
240#[derive(Clone, Debug, PartialEq)]
241pub enum NodeData {
242 /// A reference to a child node in the tree.
243 Child(Child),
244
245 /// Indicates that parsing/validating this message resulted in some
246 /// diagnostic message being emitted. The secondary error level is the
247 /// modified level via
248 Diagnostic(diagnostic::Diagnostic),
249
250 /// Provides (intermediate) type information for this node. Depending on
251 /// the message, this may be a struct or named struct representing a
252 /// schema, or it may represent the type of some scalar expression.
253 /// Multiple TypeInfo nodes may be present, in particular for relations
254 /// that perform multiple operations in one go (for example read, project,
255 /// emit). The TypeInfo and operation description *Field nodes are then
256 /// ordered by data flow. In particular, the last TypeInfo node always
257 /// represents the type of the final result of a node.
258 DataType(data::Type),
259
260 /// Used for adding unstructured additional information to a message,
261 /// wherever this may aid human understanding of a message.
262 Comment(comment::Comment),
263}
264
265/// Reference to a child node in the tree.
266#[derive(Clone, Debug, PartialEq)]
267pub struct Child {
268 /// Path element identifying the relation of this child node to its parent.
269 pub path_element: path::PathElement,
270
271 /// The child node.
272 pub node: Arc<Node>,
273
274 /// Whether the validator recognized/expected the field or element that
275 /// this child represents. Fields/elements may be unrecognized simply
276 /// because validation is not implemented for them yet. In any case, this
277 /// flag indicates that the subtree represented by this node could not be
278 /// validated.
279 pub recognized: bool,
280}
281
282/// A reference to a node elsewhere in the tree.
283#[derive(Clone, Debug, PartialEq)]
284pub struct NodeReference {
285 /// Absolute path to the node.
286 pub path: path::PathBuf,
287
288 /// Link to the node.
289 pub node: Arc<Node>,
290}
291
292pub struct FlattenedNodeIter<'a> {
293 remaining: VecDeque<&'a Node>,
294}
295impl<'a> Iterator for FlattenedNodeIter<'a> {
296 type Item = &'a Node;
297
298 fn next(&mut self) -> Option<Self::Item> {
299 let maybe_node = self.remaining.pop_back();
300 if let Some(node) = maybe_node {
301 self.remaining
302 .extend(node.data.iter().rev().filter_map(|x| -> Option<&Node> {
303 if let NodeData::Child(child) = x {
304 Some(&child.node)
305 } else {
306 None
307 }
308 }));
309 }
310 maybe_node
311 }
312}
313
314pub struct FlattenedNodeDataIter<'a> {
315 remaining: VecDeque<&'a NodeData>,
316}
317
318impl<'a> Iterator for FlattenedNodeDataIter<'a> {
319 type Item = &'a NodeData;
320
321 fn next(&mut self) -> Option<Self::Item> {
322 let maybe_node_data = self.remaining.pop_back();
323 if let Some(NodeData::Child(child)) = maybe_node_data {
324 self.remaining.extend(child.node.data.iter().rev())
325 }
326 maybe_node_data
327 }
328}