1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
// SPDX-License-Identifier: Apache-2.0

//! Module for the output tree structure.
//!
//! This module provides the types for the tree structure that constitutes
//! the output of the validator. The nodes in the tree are intended to
//! correspond exactly to the protobuf messages, primitives, and YAML values
//! (the latter actually using the JSON object model) that constitute the
//! incoming plan. Likewise, the structure of the tree is the same as the
//! input. However, unlike the input:
//!
//!  - All nodes and the relations between them are encapsulated in generic
//!    types, independent from the corresponding messages/values in the
//!    original tree. This allows the tree to be traversed by generic code
//!    with no understanding of Substrait.
//!  - Additional information can be attached to the nodes, edges, and
//!    between the edges, such as diagnostic messages and data type
//!    information.
//!
//! The node type for the output trees is [`Node`]. This structure contains
//! a single [`NodeType`] enum variant and zero or more [`NodeData`] enum
//! variants in an ordered sequence to form the tree structure; [`NodeType`]
//! includes information about the node itself, while the [`NodeData`]
//! elements represent edges to other nodes ([`Child`]) or contextual
//! information. A subtree might look something like this:
//!
//! ```text
//!                 Node ---> ProtoMessage                   } Parent node
//!                  |
//!   .--------------'--------------.
//!   |         |         |         |
//!   v         v         v         v
//! Child  Diagnostic  Comment    Child                      } Edges
//!   |                             |
//!   v                             v
//! Node ---> ProtoPrimitive      Node ---> ProtoMessage     } Child nodes
//!            |                    |
//!            '-> PrimitiveData    :
//! ```
//!
//! Note that the [`Child`] struct includes information about how the child
//! node relates to its parent (which field, array element, etc) via
//! [`PathElement`](path::PathElement), such that the original tree structure
//! could in theory be completely reconstructed.
//!
//! Nevertheless, the conversion from protobuf/YAML to this tree structure is
//! only intended to be a one-way street; indeed, the output tree is not
//! intended to ever be treated as some executable query plan by a computer at
//! all. It serves only as an intermediate format for documentation, debug,
//! and/or validation output. The [export](mod@crate::export) module deals with
//! breaking this internal representation down further, into (file) formats
//! that are not specific to the Substrait validator.

use crate::output::comment;
use crate::output::data_type;
use crate::output::diagnostic;
use crate::output::extension;
use crate::output::path;
use crate::output::primitive_data;
use std::collections::VecDeque;
use std::sync::Arc;

/// Node for a semi-structured documentation-like tree representation of a
/// parsed Substrait plan. The intention is for this to be serialized into
/// some human-readable format.
///
/// Note: although it should be possible to reconstruct the entire plan from
/// the information contained in the tree, the tree is only intended to be
/// converted to structured human-readable documentation for the plan. It is
/// expressly NOT intended to be read as a form of AST by a downstream
/// process, and therefore isn't nearly as strictly-typed as you would
/// otherwise want it to be. Protobuf itself is already a reasonable format
/// for this!
#[derive(Clone, Debug, PartialEq)]
pub struct Node {
    /// The type of a node in terms of plan semantics.
    pub class: Class,

    /// An optional brief description of the node. This can be regarded as
    /// a comment placed at the start of the data vector, but it is usually
    /// only set at the end of the parse function.
    pub brief: Option<comment::Brief>,

    /// An optional comment summarizing what this node does. This can be
    /// regarded as a comment placed at the start of the data vector (just
    /// after brief, if brief is also defined), but it is usually only set
    /// at the end of the parse function.
    pub summary: Option<comment::Comment>,

    /// The type of node in terms of what it represents in the original
    /// data structure.
    pub node_type: NodeType,

    /// The type of data returned by this node, if any. Depending on the
    /// message and context, this may represent a table schema or scalar
    /// data.
    pub data_type: Option<Arc<data_type::DataType>>,

    /// The information gathered about the message.
    ///
    /// This normally includes all child nodes for this message, possibly
    /// interspersed with diagnostics, type information, and unstructured
    /// comment nodes to provide context, all ordered in a reasonable way.
    /// Note however that this information is intended to be understood by
    /// a human, not by the validator itself (aside from serialization to a
    /// human-readable notation).
    pub data: Vec<NodeData>,
}

impl From<NodeType> for Node {
    fn from(node_type: NodeType) -> Self {
        Node {
            class: Class::Misc,
            brief: None,
            summary: None,
            node_type,
            data_type: None,
            data: vec![],
        }
    }
}

impl Node {
    /// Returns an iterator that iterates over all nodes depth-first.
    pub fn iter_flattened_nodes(&self) -> FlattenedNodeIter {
        FlattenedNodeIter {
            remaining: VecDeque::from(vec![self]),
        }
    }

    /// Returns an iterator that iterates over all NodeData objects in the
    /// order in which they were defined.
    pub fn iter_flattened_node_data(&self) -> FlattenedNodeDataIter {
        FlattenedNodeDataIter {
            remaining: self.data.iter().rev().collect(),
        }
    }

    /// Iterates over all diagnostics in the tree.
    pub fn iter_diagnostics(&self) -> impl Iterator<Item = &diagnostic::Diagnostic> + '_ {
        self.iter_flattened_node_data().filter_map(|x| match x {
            NodeData::Diagnostic(d) => Some(d),
            _ => None,
        })
    }

    /// Returns the first diagnostic of the highest severity level in the tree.
    pub fn get_diagnostic(&self) -> Option<&diagnostic::Diagnostic> {
        let mut result: Option<&diagnostic::Diagnostic> = None;
        for diag in self.iter_diagnostics() {
            // We can return immediately for error diagnostics, since this is the
            // highest level.
            if diag.adjusted_level == diagnostic::Level::Error {
                return Some(diag);
            }

            // For other levels, update only if the incoming diagnostic is of a
            // higher level/severity than the current one.
            if let Some(cur) = result.as_mut() {
                if diag.adjusted_level > (*cur).adjusted_level {
                    *cur = diag;
                }
            } else {
                result = Some(diag);
            }
        }
        result
    }

    /// Returns a reference to the data type that this node returns at runtime
    /// or (for type nodes) represents. If no type information is attached, a
    /// reference to a default-generated unresolved type is returned.
    pub fn data_type(&self) -> Arc<data_type::DataType> {
        self.data_type.clone().unwrap_or_default()
    }
}

/// The original data type that the node represents, to (in theory) allow the
/// original structure of the plan to be recovered from the documentation tree.
#[derive(Clone, Debug, PartialEq)]
pub enum NodeType {
    /// The associated node represents a protobuf message of the given type
    /// (full protobuf path). The contents of the message are described using
    /// Field, RepeatedField, and OneOfField.
    ProtoMessage(&'static str),

    /// The associated node represents a protobuf primitive value of the given
    /// type and with the given data.
    ProtoPrimitive(&'static str, primitive_data::PrimitiveData),

    /// The associated node represents an unpopulated oneof field. This is used
    /// for an error recovery node when a required oneof field is not
    /// populated.
    ProtoMissingOneOf,

    /// Used for anchor/reference-based references to other nodes.
    NodeReference(u64, NodeReference),

    /// Used for resolved YAML URIs, in order to include the parse result and
    /// documentation for the referenced YAML (if available), in addition to
    /// the URI itself.
    YamlReference(Arc<extension::YamlData>),

    /// The associated node represents a YAML map. The contents of the map are
    /// described using Field and UnknownField.
    YamlMap,

    /// The associated node represents a YAML array. The contents of the array
    /// are described using ArrayElement datums.
    YamlArray,

    /// The associated node represents a YAML primitive.
    YamlPrimitive(primitive_data::PrimitiveData),
}

/// Semantical information about a node.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum Class {
    /// Used for nodes for which no better classification exists.
    Misc,

    /// Used for nodes that define a type. The data_type field signifies this
    /// data type.
    Type,

    /// Used for nodes that represent scalar expressions or literals. The
    /// data_type field signifies the type of the value returned by the
    /// expression.
    Expression,

    /// Used for nodes that represent relations. The data_type field signifies
    /// the schema for the data returned by the relation.
    Relation,
}

/// Information nodes for a parsed protobuf message.
#[derive(Clone, Debug, PartialEq)]
pub enum NodeData {
    /// A reference to a child node in the tree.
    Child(Child),

    /// Indicates that parsing/validating this message resulted in some
    /// diagnostic message being emitted. The secondary error level is the
    /// modified level via
    Diagnostic(diagnostic::Diagnostic),

    /// Provides (intermediate) type information for this node. Depending on
    /// the message, this may be a struct or named struct representing a
    /// schema, or it may represent the type of some scalar expression.
    /// Multiple TypeInfo nodes may be present, in particular for relations
    /// that perform multiple operations in one go (for example read, project,
    /// emit). The TypeInfo and operation description *Field nodes are then
    /// ordered by data flow. In particular, the last TypeInfo node always
    /// represents the type of the final result of a node.
    DataType(Arc<data_type::DataType>),

    /// Used for adding unstructured additional information to a message,
    /// wherever this may aid human understanding of a message.
    Comment(comment::Comment),
}

/// Reference to a child node in the tree.
#[derive(Clone, Debug, PartialEq)]
pub struct Child {
    /// Path element identifying the relation of this child node to its parent.
    pub path_element: path::PathElement,

    /// The child node.
    pub node: Arc<Node>,

    /// Whether the validator recognized/expected the field or element that
    /// this child represents. Fields/elements may be unrecognized simply
    /// because validation is not implemented for them yet. In any case, this
    /// flag indicates that the subtree represented by this node could not be
    /// validated.
    pub recognized: bool,
}

/// A reference to a node elsewhere in the tree.
#[derive(Clone, Debug, PartialEq)]
pub struct NodeReference {
    /// Absolute path to the node.
    pub path: path::PathBuf,

    /// Link to the node.
    pub node: Arc<Node>,
}

pub struct FlattenedNodeIter<'a> {
    remaining: VecDeque<&'a Node>,
}
impl<'a> Iterator for FlattenedNodeIter<'a> {
    type Item = &'a Node;

    fn next(&mut self) -> Option<Self::Item> {
        let maybe_node = self.remaining.pop_back();
        if let Some(node) = maybe_node {
            self.remaining
                .extend(node.data.iter().rev().filter_map(|x| -> Option<&Node> {
                    if let NodeData::Child(child) = x {
                        Some(&child.node)
                    } else {
                        None
                    }
                }));
        }
        maybe_node
    }
}

pub struct FlattenedNodeDataIter<'a> {
    remaining: VecDeque<&'a NodeData>,
}

impl<'a> Iterator for FlattenedNodeDataIter<'a> {
    type Item = &'a NodeData;

    fn next(&mut self) -> Option<Self::Item> {
        let maybe_node_data = self.remaining.pop_back();
        if let Some(NodeData::Child(child)) = maybe_node_data {
            self.remaining.extend(child.node.data.iter().rev())
        }
        maybe_node_data
    }
}