substrait-validator 0.1.4

Substrait validator
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
// SPDX-License-Identifier: Apache-2.0

//! Module for diagnostic message types.
//!
//! Since diagnostic messages are rather important for a validator (after all,
//! getting a diagnostic message is hardly an exceptional case), they have
//! quite a bit of metadata attached to them. Ultimately, the diagnostic
//! messages attached to the tree ([`Diagnostic`]) have the following
//! parameters:
//!
//!  - cause.message: an enumeration of various types of error messages, in
//!    the usual Rust way. Messages generated by this crate are usually
//!    untyped (they just use String), but error information from other
//!    crates is retained as much as possible.
//!  - cause.classification: an enumeration of various bits of the validation
//!    process where diagnostics might occur. Each [`Classification`] enum
//!    variant can be converted to a unique number, known as the diagnostic
//!    code, which the user of the crate may use to easily programmatically
//!    determine what caused a diagnostic in a language-agnostic way. The user
//!    may also configure the validator in advance to promote or reduce the
//!    severity of diagnostics, indexed by their code. The codes are
//!    furthermore organized into groups, with up to 999 classes per group: the
//!    thousands digit and up is the group identifier, and the less-significant
//!    digits form the sub-code. Sub-code 0 is reserved to refer to the group
//!    as a whole.
//!  - original_level: the error [`Level`] that the validation code assigned to
//!    the message. This can be `Error`, `Warning`, or `Info`, which correspond
//!    directly to "this is definitely wrong," "this may or may not be wrong,"
//!    and "this conforms to the Substrait specification, but it's worth noting
//!    anyway" respectively.
//!  - adjusted_level: the error [`Level`] after configuration-based adjustment.
//!    This level is what's used by the high-level APIs to determine the
//!    validity of a plan. Thus, a user can choose to ignore a particular error
//!    if their consumer implementation can deal with it anyway, or they can
//!    assert whether a particular type of warning is actually an error or not.
//!  - path: a path into the substrait.Plan message. This is *usually* just a
//!    copy of the path to the node that was being validated when the
//!    diagnostic was created, but in some cases diagnostics may be placed in a
//!    parent node (for instance to refer to a node that should exist but
//!    doesn't), or refer to a different location altogether (for instance to
//!    point the user to the previous definition in a note following a
//!    duplicate definition error).

use crate::output::path;
use num_traits::cast::FromPrimitive;
use std::sync::Arc;
use strum::EnumProperty;

/// Owned variant of jsonschema::error::ValidationError<'a>. Instead of a
/// reference to the YAML tree node that caused the error, this just contains
/// the formatted error message. The validation error kind and paths are
/// however retained.
///
/// `Display` for this type is implemented manually and prints `message`
/// only.
#[derive(Debug, thiserror::Error)]
pub struct JsonSchemaValidationError {
    /// The formatted error message, i.e. the `Display` rendering of the
    /// original validation error at conversion time.
    pub message: String,

    /// The kind of validation failure, taken over from the original error.
    pub kind: jsonschema::error::ValidationErrorKind,

    /// The `instance_path` of the original error (presumably the pointer to
    /// the offending location in the validated document; see the jsonschema
    /// crate documentation).
    pub instance_path: jsonschema::paths::JSONPointer,

    /// The `schema_path` of the original error (presumably the pointer to
    /// the schema rule that was violated; see the jsonschema crate
    /// documentation).
    pub schema_path: jsonschema::paths::JSONPointer,
}

impl std::fmt::Display for JsonSchemaValidationError {
    /// Prints the stored, pre-formatted message. The caller's formatter is
    /// forwarded as-is to the string's own `Display` implementation, so
    /// width/padding flags keep working.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Display::fmt(&self.message, f)
    }
}

impl From<jsonschema::error::ValidationError<'_>> for JsonSchemaValidationError {
    fn from(v: jsonschema::error::ValidationError) -> Self {
        JsonSchemaValidationError {
            message: v.to_string(),
            kind: v.kind,
            instance_path: v.instance_path,
            schema_path: v.schema_path,
        }
    }
}

/// Enumeration for error message data we might encounter.
///
/// Every variant is displayed as just its payload (`"{0}"`). All variants
/// except `Untyped` can be created from their payload type via `From`
/// (generated by the `#[from]` attributes).
#[derive(Debug, thiserror::Error)]
pub enum Message {
    /// Free-form message text without further type information.
    #[error("{0}")]
    Untyped(String),

    /// Wraps a [`prost::DecodeError`].
    #[error("{0}")]
    ProstDecodeError(#[from] prost::DecodeError),

    /// Wraps a [`std::io::Error`].
    #[error("{0}")]
    IoError(#[from] std::io::Error),

    /// Wraps a [`std::str::Utf8Error`].
    #[error("{0}")]
    UtfError(#[from] std::str::Utf8Error),

    /// Wraps a [`serde_yaml::Error`].
    #[error("{0}")]
    YamlError(#[from] serde_yaml::Error),

    /// Wraps the owned JSON schema validation error type of this module.
    #[error("{0}")]
    JsonSchemaValidationError(#[from] JsonSchemaValidationError),

    /// Wraps a [`uriparse::URIReferenceError`].
    #[error("{0}")]
    UriError(#[from] uriparse::URIReferenceError),

    /// Wraps a [`glob::PatternError`].
    #[error("{0}")]
    GlobError(#[from] glob::PatternError),
}

impl From<&str> for Message {
    fn from(s: &str) -> Self {
        Message::Untyped(s.to_string())
    }
}

impl From<String> for Message {
    fn from(s: String) -> Self {
        Message::Untyped(s)
    }
}

impl From<jsonschema::error::ValidationError<'_>> for Message {
    fn from(v: jsonschema::error::ValidationError<'_>) -> Self {
        JsonSchemaValidationError::from(v).into()
    }
}

/// Enumeration for the particular types of diagnostics we might encounter.
///
/// Numbers must be assigned as follows:
///  - the group identifier is represented by the thousands digit and up;
///  - the first classification for each group (i.e. divisible by 1000) is
///    reserved for diagnostics that have no more specific information
///    attached to them: their description must be hidden and related to
///    the group name;
///  - group 0 is a sort of null group, where no group information is known;
///  - all enum variant names for classifications belonging to a group (except
///    the null group) must be prefixed by the group name;
///  - for backward/forward-compatibility, numbers should not be reassigned.
///
/// The Description and HiddenDescription enum properties define a description
/// of the class. When Description is used, the description is prefixed before
/// the error message; when HiddenDescription is used, the message is not
/// prefixed, and should thus be sufficiently specific to not need it. The
/// latter is useful to reduce the amount of redundant information in a
/// message.
///
/// The numbering below has gaps (for instance 1003, 1005, 2006 and
/// 3002 through 3004). These are presumably codes that were retired at some
/// point (TODO confirm); in line with the last rule above they should not be
/// handed out again.
#[derive(
    Clone,
    Copy,
    Debug,
    PartialEq,
    Eq,
    Hash,
    strum_macros::EnumIter,
    strum_macros::EnumProperty,
    num_derive::FromPrimitive,
    Default,
)]
pub enum Classification {
    // Unclassified diagnostics (group 0).
    #[strum(props(HiddenDescription = "unclassified diagnostic"))]
    #[default]
    Unclassified = 0,

    #[strum(props(Description = "not yet implemented"))]
    NotYetImplemented = 1,

    #[strum(props(Description = "illegal value"))]
    IllegalValue = 2,

    #[strum(props(Description = "illegal value in hint"))]
    IllegalValueInHint = 3,

    #[strum(props(Description = "illegal URI"))]
    IllegalUri = 4,

    #[strum(props(Description = "illegal glob"))]
    IllegalGlob = 5,

    #[strum(props(Description = "deprecation"))]
    Deprecation = 6,

    #[strum(props(HiddenDescription = "versioning"))]
    Versioning = 7,

    #[strum(props(HiddenDescription = "experimental"))]
    Experimental = 999,

    // Protobuf-related diagnostics (group 1).
    #[strum(props(HiddenDescription = "protobuf-related diagnostic"))]
    Proto = 1000,

    #[strum(props(HiddenDescription = "protobuf parsing failed"))]
    ProtoParseFailed = 1001,

    #[strum(props(Description = "missing required protobuf field"))]
    ProtoMissingField = 1002,

    #[strum(props(Description = "encountered a protobuf \"any\""))]
    ProtoAny = 1004,

    #[strum(props(Description = "missing protobuf \"any\" declaration"))]
    ProtoMissingAnyDeclaration = 1006,

    // YAML-related diagnostics (group 2).
    #[strum(props(HiddenDescription = "YAML-related diagnostic"))]
    Yaml = 2000,

    #[strum(props(Description = "did not attempt to resolve YAML"))]
    YamlResolutionDisabled = 2001,

    #[strum(props(Description = "failed to resolve YAML"))]
    YamlResolutionFailed = 2002,

    #[strum(props(Description = "failed to parse YAML"))]
    YamlParseFailed = 2003,

    #[strum(props(Description = "YAML does not conform to schema"))]
    YamlSchemaValidationFailed = 2004,

    #[strum(props(Description = "missing required YAML key"))]
    YamlMissingKey = 2005,

    #[strum(props(Description = "missing required YAML array element"))]
    YamlMissingElement = 2007,

    #[strum(props(Description = "invalid YAML value type"))]
    YamlInvalidType = 2008,

    #[strum(props(Description = "cyclic dependency"))]
    YamlCyclicDependency = 2009,

    // Link resolution diagnostics (group 3).
    #[strum(props(HiddenDescription = "link resolution diagnostic"))]
    Link = 3000,

    #[strum(props(Description = "failed to resolve anchor"))]
    LinkMissingAnchor = 3001,

    #[strum(props(HiddenDescription = "use of anchor zero"))]
    LinkAnchorZero = 3005,

    #[strum(props(Description = "failed to resolve type variation name & class pair"))]
    LinkMissingTypeVariationNameAndClass = 3006,

    #[strum(props(Description = "unresolved name lookup"))]
    LinkUnresolvedName = 3007,

    #[strum(props(Description = "ambiguous name lookup"))]
    LinkAmbiguousName = 3008,

    #[strum(props(Description = "duplicate definition"))]
    LinkDuplicateDefinition = 3009,

    #[strum(props(HiddenDescription = "invalid compound vs. simple function name usage"))]
    LinkCompoundVsSimpleFunctionName = 3010,

    // Type-related diagnostics (group 4).
    #[strum(props(HiddenDescription = "type-related diagnostics"))]
    Type = 4000,

    #[strum(props(Description = "unknown type"))]
    TypeUnknown = 4001,

    #[strum(props(Description = "mismatched type parameters"))]
    TypeMismatchedParameters = 4002,

    #[strum(props(Description = "mismatched field name associations"))]
    TypeMismatchedFieldNameAssociations = 4003,

    #[strum(props(Description = "invalid swizzle operation"))]
    TypeInvalidSwizzle = 4004,

    #[strum(props(Description = "mismatched types"))]
    TypeMismatch = 4005,

    #[strum(props(Description = "struct type is required"))]
    TypeStructRequired = 4006,

    #[strum(props(Description = "mismatched type variation"))]
    TypeMismatchedVariation = 4007,

    #[strum(props(Description = "mismatched nullability"))]
    TypeMismatchedNullability = 4008,

    #[strum(props(Description = "invalid type pattern or derivation expression"))]
    TypeDerivationInvalid = 4009,

    // Note the difference between above and below! Above should be used when
    // the derivation itself is invalid due to syntax or metatype errors, or in
    // other words, when it could *never* match or evaluate, regardless of
    // context. Below is used when the derivation itself appears to be sane,
    // but it does not apply to the given context. From a user perspective,
    // above means that the YAML is wrong, while below means that a function
    // is used incorrectly in a plan. Note that we cannot detect all problems
    // with type derivation expressions without evaluating them because they
    // are dynamically typed.
    #[strum(props(
        Description = "type pattern or derivation expression failed to match or evaluate"
    ))]
    TypeDerivationFailed = 4010,

    #[strum(props(Description = "parse error in type pattern or derivation expression"))]
    TypeParseError = 4011,

    #[strum(props(
        Description = "name resolution error in type pattern or derivation expression"
    ))]
    TypeResolutionError = 4012,

    #[strum(props(Description = "invalid field name"))]
    TypeInvalidFieldName = 4013,

    #[strum(props(Description = "unsupported type pattern or derivation construct"))]
    TypeDerivationNotSupported = 4014,

    // Relation-related diagnostics (group 5).
    #[strum(props(HiddenDescription = "relation-related diagnostics"))]
    Relation = 5000,

    #[strum(props(Description = "missing root relation"))]
    RelationRootMissing = 5001,

    #[strum(props(Description = "missing relation"))]
    RelationMissing = 5002,

    #[strum(props(Description = "invalid relation"))]
    RelationInvalid = 5003,

    // Expression-related diagnostics (group 6).
    #[strum(props(HiddenDescription = "expression-related diagnostics"))]
    Expression = 6000,

    #[strum(props(Description = "field reference into non-existent stream"))]
    ExpressionFieldRefMissingStream = 6001,

    #[strum(props(Description = "illegal literal value"))]
    ExpressionIllegalLiteralValue = 6002,

    #[strum(props(Description = "function definition unavailable"))]
    ExpressionFunctionDefinitionUnavailable = 6003,

    #[strum(props(Description = "illegal subquery"))]
    ExpressionIllegalSubquery = 6004,

    // Redundant declarations (group 7).
    #[strum(props(
        HiddenDescription = "diagnostics for pointing out parts of the plan that can be removed without changing its semantics"
    ))]
    Redundant = 7000,

    #[strum(props(Description = "redundant protobuf \"any\" declaration"))]
    RedundantProtoAnyDeclaration = 7001,

    // NOTE(review): "Defition" looks like a misspelling of "Definition".
    // The variant name is public (and is what the Debug rendering prints),
    // so it is left untouched here; renaming it would be an API change.
    #[strum(props(Description = "redundant extension URI definition"))]
    RedundantExtensionDefition = 7002,

    #[strum(props(Description = "redundant function declaration"))]
    RedundantFunctionDeclaration = 7003,

    #[strum(props(Description = "redundant type declaration"))]
    RedundantTypeDeclaration = 7004,

    #[strum(props(Description = "redundant type variation declaration"))]
    RedundantTypeVariationDeclaration = 7005,

    #[strum(props(Description = "redundant list slice"))]
    RedundantListSlice = 7006,

    #[strum(props(Description = "redundant field"))]
    RedundantField = 7007,

    #[strum(props(Description = "redundant enum variant"))]
    RedundantEnumVariant = 7008,
}

impl Classification {
    /// Returns the complete code for this classification.
    ///
    /// This is the enum discriminant: the group identifier times 1000, plus
    /// the sub-code within the group.
    pub fn code(&self) -> u32 {
        *self as u32
    }

    /// Returns the name of the classification, i.e. the identifier of the
    /// enum variant as printed by its `Debug` implementation.
    pub fn name(&self) -> String {
        format!("{:?}", self)
    }

    /// Returns the group code for this classification (the thousands digit
    /// and up of the complete code).
    pub fn group_code(&self) -> u32 {
        self.code() / 1000
    }

    /// Returns the group variant for this classification.
    ///
    /// # Panics
    ///
    /// Panics if the enum has no variant for the group (code divisible by
    /// 1000) that this classification belongs to. That would be a mistake in
    /// the enum definition rather than a runtime condition.
    pub fn group(&self) -> Classification {
        Self::from_group(self.group_code())
            .unwrap_or_else(|| panic!("missing group for {:?}", self))
    }

    /// Returns the code for this classification within its group (the three
    /// least-significant decimal digits of the complete code).
    pub fn sub_code(&self) -> u32 {
        self.code() % 1000
    }

    /// Returns the description of this classification, regardless of whether
    /// it is a regular (`Description`) or hidden (`HiddenDescription`) one.
    ///
    /// # Panics
    ///
    /// Panics if the variant defines neither property, which would be a
    /// mistake in the enum definition.
    pub fn description(&self) -> &str {
        self.get_str("Description")
            .or_else(|| self.get_str("HiddenDescription"))
            .unwrap_or_else(|| {
                panic!(
                    "missing Description or HiddenDescription property for {:?}",
                    self
                )
            })
    }

    /// Returns the classification associated with the given code, if any.
    pub fn from_code(code: u32) -> Option<Self> {
        Self::from_u32(code)
    }

    /// Returns the group classification associated with the given code, if
    /// any.
    pub fn group_from_code(code: u32) -> Option<Self> {
        Self::from_group(code / 1000)
    }

    /// Returns the group classification associated with the given group, or
    /// `None` if no such group exists.
    pub fn from_group(group: u32) -> Option<Self> {
        // A plain `group * 1000` overflows u32 for large group numbers. That
        // panics in debug builds and wraps around in release builds, where
        // the wrapped value can even alias an existing group code (e.g. 0).
        // A group whose code does not fit in u32 cannot exist, so report it
        // as unknown instead.
        group.checked_mul(1000).and_then(Self::from_u32)
    }

    /// Returns the "parent" code for the given code. For non-group codes, this
    /// is the code of their group (code rounded down to thousands). For group
    /// codes, this is 0.
    ///
    /// This works on raw codes and does not check whether `code` or the
    /// returned code correspond to an existing classification.
    pub fn parent(code: u32) -> u32 {
        if code % 1000 != 0 {
            (code / 1000) * 1000
        } else {
            0
        }
    }

    /// Formats a Message with this classification.
    ///
    /// The output is `<description>: <message> (code NNNN)`. The description
    /// prefix is only written for classifications that define the
    /// `Description` property; a `HiddenDescription` is not printed. The code
    /// is zero-padded to at least four digits.
    pub fn format_message(
        &self,
        message: &Message,
        f: &mut std::fmt::Formatter,
    ) -> std::fmt::Result {
        if let Some(description) = self.get_str("Description") {
            write!(f, "{description}: ")?;
        }
        write!(f, "{message} (code {:04})", self.code())
    }
}

impl From<Classification> for u32 {
    /// Yields the numeric diagnostic code (the enum discriminant) of the
    /// given classification.
    fn from(classification: Classification) -> Self {
        classification as u32
    }
}

/// Description of the cause of a diagnostic.
///
/// Equality and `Display` are implemented manually for this type: two causes
/// are equal when their classifications match and their messages render to
/// the same text, and displaying a cause formats the message through its
/// classification.
#[derive(Clone, Debug, thiserror::Error)]
pub struct Cause {
    /// The error message. Within this crate we don't bother typing these
    /// beyond the Classification enum, but we do retain typing information for
    /// messages from other crates.
    ///
    /// The message is reference-counted; `Message` itself does not derive
    /// `Clone`, so this is what allows `Cause` to be cloned.
    pub message: Arc<Message>,

    /// Classification of this cause. This attaches an error code and generic
    /// message for said code to the diagnostic message. The user can use these
    /// codes to for instance always promote a particular type of diagnostic to
    /// an error (like gcc -Werror).
    pub classification: Classification,
}

impl PartialEq for Cause {
    fn eq(&self, other: &Self) -> bool {
        self.message.to_string() == other.message.to_string()
            && self.classification == other.classification
    }
}

impl std::fmt::Display for Cause {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.classification.format_message(&self.message, f)
    }
}

impl Cause {
    /// Returns a cause with the same classification, whose message is the
    /// original message text preceded by `prefix` and a colon. The resulting
    /// message is always untyped, as only the rendered text is retained.
    pub fn prefix<S: AsRef<str>>(self, prefix: S) -> Cause {
        let Cause {
            message,
            classification,
        } = self;
        let prefixed = format!("{}: {}", prefix.as_ref(), message);
        Cause {
            message: Arc::new(Message::Untyped(prefixed)),
            classification,
        }
    }
}

/// Convenience/shorthand macro for creating error diagnostics. Use this
/// variant when you have something that can be cast into a Message via into(),
/// like a pre-formatted string or a compatible Error type from a dependency.
///
/// Arguments:
///  - `$class`: the name of a `Classification` variant (just the variant
///    identifier, without the enum path);
///  - `$message`: an expression that is converted to a `Message` by means of
///    `.into()`.
///
/// Expands to a `Cause` struct expression.
macro_rules! ecause {
    ($class:ident, $message:expr) => {
        crate::output::diagnostic::Cause {
            message: std::sync::Arc::new($message.into()),
            classification: crate::output::diagnostic::Classification::$class,
        }
    };
}

/// Convenience/shorthand macro for creating error diagnostics. Use this
/// variant when you want to format a string. The argument list beyond the
/// diagnostic class identifier is passed straight to [`format!`].
///
/// The format arguments are matched as a comma-separated list of
/// expressions, so a trailing comma is not accepted. The expansion invokes
/// `ecause!` by its unqualified name, so that macro must be in scope wherever
/// this one is used.
macro_rules! cause {
    ($class:ident, $($args:expr),*) => {
        ecause!($class, format!($($args),*))
    };
}

/// Result type for diagnostic causes, i.e. a [`std::result::Result`] with
/// [`Cause`] as its error type. Note that this shadows the prelude's `Result`
/// within this module.
pub type Result<T> = std::result::Result<T, Cause>;

/// Error level for a diagnostic message.
///
/// The variants are declared in order of increasing severity, so the derived
/// ordering yields `Info < Warning < Error`. The `Display` implementation of
/// `Diagnostic` relies on this ordering to tell upgrades from downgrades, so
/// the variants must not be reordered.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Level {
    /// Level used for diagnostics that don't point out anything wrong with
    /// the plan, and merely provide additional information.
    Info,

    /// Level used for diagnostics that may or may not indicate that there
    /// is something wrong with the plan, i.e. the plan *could* be valid,
    /// but the validator isn't sure.
    Warning,

    /// Level used for diagnostics that indicate that there is definitely
    /// something wrong with the plan.
    Error,
}

/// A diagnostic message, without configuration-based level override.
///
/// Displayed as `<level> at <path>: <cause>`; with the alternate flag
/// (`{:#}`) the ` at <path>` part is left out.
#[derive(Clone, Debug, PartialEq, thiserror::Error)]
pub struct RawDiagnostic {
    /// The cause of the diagnostic.
    pub cause: Cause,

    /// The severity of the diagnostic.
    pub level: Level,

    /// The path within the protobuf message where the diagnostic occurred.
    pub path: path::PathBuf,
}

impl std::fmt::Display for RawDiagnostic {
    /// Writes `<level> at <path>: <cause>`. When the alternate flag (`{:#}`)
    /// is set, the ` at <path>` part is omitted.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { cause, level, path } = self;
        let with_path = !f.alternate();

        write!(f, "{:?}", level)?;
        if with_path {
            write!(f, " at {}", path)?;
        }
        write!(f, ": {}", cause)
    }
}

/// A diagnostic message, including configuration-based level override.
///
/// Displayed as `<adjusted level> at <path>: <cause>`, with a note about the
/// original level inserted after the adjusted level when the two differ.
/// With the alternate flag (`{:#}`) the ` at <path>` part is left out.
#[derive(Clone, Debug, PartialEq, thiserror::Error)]
pub struct Diagnostic {
    /// The cause of the diagnostic.
    pub cause: Cause,

    /// The original severity of the diagnostic.
    pub original_level: Level,

    /// The severity of the diagnostic after application of configuration.
    pub adjusted_level: Level,

    /// The path within the protobuf message where the diagnostic occurred.
    pub path: path::PathBuf,
}

impl std::fmt::Display for Diagnostic {
    /// Writes `<adjusted level>[ (upgraded|downgraded from <original
    /// level>)] at <path>: <cause>`. The parenthesized note only appears when
    /// the level was changed by configuration, and the ` at <path>` part is
    /// omitted when the alternate flag (`{:#}`) is set.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            cause,
            original_level,
            adjusted_level,
            path,
        } = self;

        write!(f, "{:?}", adjusted_level)?;

        // Levels are ordered by increasing severity, so a lower original
        // level means the diagnostic was promoted.
        if original_level < adjusted_level {
            write!(f, " (upgraded from {:?})", original_level)?;
        } else if original_level > adjusted_level {
            write!(f, " (downgraded from {:?})", original_level)?;
        }

        if !f.alternate() {
            write!(f, " at {}", path)?;
        }
        write!(f, ": {}", cause)
    }
}

impl RawDiagnostic {
    /// Converts to an AdjustedDiagnostic by adding an adjusted level.
    pub fn adjust_level(self, adjusted_level: Level) -> Diagnostic {
        Diagnostic {
            cause: self.cause,
            original_level: self.level,
            adjusted_level,
            path: self.path,
        }
    }
}

/// Convenience/shorthand macro for creating error diagnostics.
///
/// Expands to a `RawDiagnostic` struct expression. There are two forms:
///  - `diag!(path, Level, Class, format-args...)`: builds the cause from a
///    `Classification` variant name and [`format!`] arguments by means of
///    `cause!`, which must therefore be in scope where this form is used;
///  - `diag!(path, Level, cause)`: uses an existing `Cause` expression.
///
/// `Level` is the name of a `Level` variant (just the variant identifier).
macro_rules! diag {
    // Classification + format arguments: build the cause, then defer to the
    // arm below.
    ($path:expr, $level:ident, $class:ident, $($args:expr),*) => {
        diag!($path, $level, cause!($class, $($args),*))
    };
    // Pre-built cause.
    ($path:expr, $level:ident, $cause:expr) => {
        crate::output::diagnostic::RawDiagnostic {
            cause: $cause,
            level: crate::output::diagnostic::Level::$level,
            path: $path
        }
    };
}
/*macro_rules! ediag {
    ($path:expr, $level:ident, $class:ident, $err:expr) => {
        diag!($path, $level, ecause!($class, $err))
    };
}*/

/// Result type for complete diagnostics, including path: a
/// [`std::result::Result`] with [`RawDiagnostic`] as its error type.
pub type DiagResult<T> = std::result::Result<T, RawDiagnostic>;

#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;
    use strum::IntoEnumIterator;

    /// Checks the rules that the classification definitions must adhere to:
    /// every classification must have a group variant, classifications
    /// outside of the null group must carry their group's name as prefix, and
    /// no two classifications may share a description.
    #[test]
    fn test_diagnostic_classifications() {
        let mut seen_descriptions = HashSet::new();
        for classification in Classification::iter() {
            // Panics by itself when the group variant is missing.
            let owner = classification.group();

            let needs_prefix = owner != Classification::Unclassified;
            if needs_prefix {
                let prefix = owner.name();
                assert!(
                    classification.name().starts_with(&prefix),
                    "incorrect group prefix for {:?}, should start with {:?}",
                    classification,
                    owner
                );
            }

            // `insert()` yields false when the description was seen before.
            let is_new = seen_descriptions.insert(classification.description().to_string());
            assert!(is_new, "duplicate description for {:?}", classification);
        }
    }
}