// substrait-validator 0.1.4
//
// Substrait validator
// Documentation
// SPDX-License-Identifier: Apache-2.0
syntax = "proto3";

// This proto file defines the tree structure that corresponds to the internal
// representation of the validator's parse result. The toplevel message type
// for this is ParseResult.

package substrait.validator;

import "google/protobuf/any.proto";
import "google/protobuf/empty.proto";
import "substrait/validator/simple_extensions.proto";
import "substrait/validator/type_system.proto";

option csharp_namespace = "Substrait.Validator.Protobuf";
option java_multiple_files = true;
option java_package = "io.substrait.validator.proto";

// Root message type returned by the validator as a result of parsing a
// substrait.Plan. It wraps the single root Node of the parse result tree;
// everything else (children, diagnostics, comments, data types) hangs off
// that node via Node.data.
message ParseResult {
  // Root node of the parse result tree. Absolute Path messages select nodes
  // starting from the children of this node.
  Node root = 1;
}

// Nodes of the validator parse result tree.
//
// Note that, unlike substrait.Plan and its children, the nodes in this tree
// are intentionally devoid of typing information: all nodes are of type Node.
// The purpose of this is to allow a consumer of these trees to walk over the
// entire tree without needing in-depth knowledge of how Substrait works (and,
// with that, to decouple them from changes to the Substrait specification):
// they are intended as an intermediate format for converting Substrait plans
// into more human-friendly representations after all, not for programmatically
// dealing with the semantics of Substrait itself. That's what the validator is
// for, in this case.
//
// In particular, gathering all diagnostics emitted by the validator only
// requires the consumer to use the Node, Node.Data, Node.Child, and of course
// the Diagnostic message types.
//
// In case the consumer does need additional information from the original
// substrait.Plan, every node can be related back to its corresponding message
// via the path information associated with the nodes.
message Node {
  // The type of node. Being a oneof, at most one of the variants below is
  // set for any given node.
  oneof node_type {
    // This node represents a protobuf message. Its fields appear as Child
    // entries in data, identified by Path.Field, Path.RepeatedField, and
    // Path.OneOfField path elements.
    ProtoMessage proto_message = 1;

    // This node represents a protobuf primitive or enum.
    ProtoPrimitive proto_primitive = 2;

    // This node is inserted as a placeholder when a required oneof field was
    // not populated in the input.
    google.protobuf.Empty proto_missing_oneof = 3;

    // Special case of proto_primitive for references to anchors defined
    // elsewhere in the plan.
    NodeReference node_reference = 4;

    // This node represents a YAML map/object. Its keys appear as Child
    // entries in data, identified by Path.Field path elements.
    google.protobuf.Empty yaml_map = 6;

    // This node represents a YAML array. Its elements appear as Child
    // entries in data, identified by Path.ArrayElement path elements.
    google.protobuf.Empty yaml_array = 7;

    // This node represents a YAML primitive.
    PrimitiveData yaml_primitive = 8;

    // Special case for string primitives that were interpreted and resolved as
    // a URI. These nodes will have a single child node with path `data` that
    // represents the parse result of the referred file.
    string resolved_uri = 9;

    // This node represents an abstract syntax tree node, used for representing
    // complex YAML string parse results.
    google.protobuf.Empty ast_node = 10;

    // No longer used. The more generic resolved_uri variant (field 9) is used
    // instead. The field number is kept so that it is not reused.
    YamlReference yaml_reference = 5 [deprecated = true];
  }

  // Semantic classification of this node.
  Class class = 13;
  enum Class {
    // Default value: the node has not been assigned any of the classes
    // below.
    CLASS_UNSPECIFIED = 0;

    // This node represents a data type.
    CLASS_TYPE = 1;

    // This node represents an expression.
    CLASS_EXPRESSION = 2;

    // This node represents a relation.
    CLASS_RELATION = 3;
  }

  // Optional brief description of the node. Should not contain newlines or
  // other non-span formatting information.
  Comment brief = 14;

  // Optional summary of the node. Unlike brief, this may contain
  // paragraph-level formatting information.
  Comment summary = 15;

  // For the following types of nodes, the validator will try to do type
  // resolution:
  //  - type-like nodes resolve to said type;
  //  - expression-like nodes resolve to the type returned by the expression;
  //  - relation-like nodes resolve to the schema (as a named struct) returned
  //    by the relation.
  // This field will be populated for such nodes even if resolution fails, to
  // indicate that there is supposed to be a type. In that case, the type kind
  // will be set to "unresolved." The field will not be populated for nodes
  // that don't have a logical Substrait type.
  DataType data_type = 16;

  // Data associated with the node. Note that some variants are illegal based
  // on the node type (for example, a primitive does not have fields, so it
  // makes no sense for Field data to appear).
  repeated Data data = 31;

  // A single piece of data attached to a node. Child nodes, diagnostics, and
  // auxiliary information all share this one repeated list.
  message Data {
    oneof kind {
      // Represents a child node in the tree.
      Child child = 1;

      // Represents a diagnostic message.
      Diagnostic diagnostic = 2;

      // Represents an (intermediate) data type.
      DataType data_type = 3;

      // Unstructured additional information about the node or something in it.
      Comment comment = 4;

      // Pointer to a function extension.
      FunctionUsage function_usage = 5;

      // Simple extension definition.
      ExtensionDefinition extension_definition = 6;
    }
  }

  // Representation of a child node in the tree.
  message Child {
    // Path element identifying the relation of this child node to its
    // parent.
    Path.Element path = 1;

    // The child node.
    Node node = 2;

    // Whether the validator recognized/expected the field or element that
    // this child represents. Fields/elements may be unrecognized simply
    // because validation is not implemented for them yet. In any case, when
    // this flag is false, the subtree represented by this node could not be
    // validated.
    bool recognized = 3;
  }

  // Information about a protobuf message.
  message ProtoMessage {
    // The full protobuf path for the type, for example "substrait.Plan".
    string path = 1;
  }

  // Information about a protobuf primitive.
  message ProtoPrimitive {
    // Logically compatible protobuf name of the primitive type, for example
    // uint32 for any 32-bit unsigned data storage type.
    string path = 1;

    // Value of the primitive.
    PrimitiveData data = 2;
  }

  // Information about the reference part of a reference/anchor pair.
  message NodeReference {
    // Integer value of the reference and anchor.
    uint64 value = 1;

    // Absolute path to the referenced node, i.e. the node containing the
    // anchor field.
    Path path = 2;
  }

  // Information about a reference to a YAML file. Deprecated together with
  // the Node.yaml_reference variant that uses it.
  message YamlReference {
    option deprecated = true;

    // URI to the YAML file.
    string uri = 1;
  }

  // Value for a primitive data element.
  message PrimitiveData {
    // Note: to represent a YAML null, this field is simply not populated.
    oneof data {
      // Boolean value.
      bool boolean = 1;

      // Unsigned integer value, stored as 64 bits.
      uint64 unsigned = 2;

      // Signed integer value, stored as 64 bits.
      int64 signed = 3;

      // Floating-point value, stored as a double.
      double real = 4;

      // Text value.
      string unicode = 5;

      // Raw byte-string value.
      bytes binary = 6;

      // NOTE(review): presumably the name of an enum variant, since
      // proto_primitive nodes also cover enums; confirm against the
      // validator implementation.
      string variant = 7;

      // Value of a protobuf Any field, carried over as an Any message.
      google.protobuf.Any any = 8;
    }
  }
}

// An absolute path to a node in the tree, consisting of the name of the root
// followed by zero or more elements that each descend one level.
message Path {
  // Name of the root node. Currently always set to `plan`.
  string root = 1;

  // Elements of the path. The first element selects a child node of the root
  // node, the second selects one of its children, etc.
  repeated Element elements = 2;

  // A single step in a path. Also used on its own (see Node.Child.path) to
  // describe how a child node relates to its parent.
  message Element {
    oneof kind {
      // Selects a protobuf field or YAML map entry by name.
      Field field = 1;

      // Selects one element of a protobuf repeated field.
      RepeatedField repeated_field = 2;

      // Selects a protobuf oneof field along with its populated variant.
      OneOfField oneof_field = 3;

      // Selects one element of a YAML array.
      ArrayElement array_element = 4;
    }
  }

  // Path element used for protobuf fields and YAML maps.
  // Canonically represented as `.<field>` if field matches
  // [a-zA-Z_][a-zA-Z0-9_]*, or as `."<field>"` using \\ and \" escape
  // sequences if not (note that this can only happen for YAML map keys).
  message Field {
    // Name of the protobuf field or YAML map key.
    string field = 1;
  }

  // Path element used for protobuf repeated field elements.
  // Canonically represented as `.<field>[<index>]`.
  message RepeatedField {
    // Name of the repeated field.
    string field = 1;

    // Index of the selected element within the repeated field.
    uint64 index = 2;
  }

  // Path element used for protobuf oneof fields.
  // Canonically represented as `.<field>{<variant>}`.
  message OneOfField {
    // Name of the oneof field.
    string field = 1;

    // Name of the variant of the oneof that is selected.
    string variant = 2;
  }

  // Path element used for YAML arrays.
  // Canonically represented as `[<index>]`.
  message ArrayElement {
    // Index of the selected element within the array. Note that this uses
    // field number 2; number 1 is unused in this message.
    // NOTE(review): presumably chosen to mirror RepeatedField.index; the
    // number is part of the wire format and must not be changed.
    uint64 index = 2;
  }
}

// Representation of a diagnostic message emitted by the validator. These
// appear in the tree as Node.Data entries.
message Diagnostic {
  // The original error level/severity for this diagnostic, i.e. before any
  // adjustment was applied.
  Level original_level = 1;

  // The error level/severity for this diagnostic after adjustment according
  // to the validator configuration.
  Level adjusted_level = 2;

  // Machine-readable numeric code identifying the cause of this diagnostic.
  // NOTE(review): the mapping from code to meaning is not defined in this
  // file; confirm where the codes are enumerated.
  uint32 cause = 3;

  // The human-readable message for this diagnostic.
  string msg = 4;

  // A path associated with this diagnostic. This is usually the path for
  // the node it is associated with, but not necessarily: for example, a
  // diagnostic message relating to a duplicate definition may refer back
  // to the previous or first definition.
  Path path = 5;

  // Error level.
  enum Level {
    // Default value: no level was specified.
    LEVEL_UNSPECIFIED = 0;

    // Information diagnostic. Has no bearing on the validity of the plan.
    LEVEL_INFO = 1;

    // Warning diagnostic. The presence of warning diagnostics indicates
    // that the plan may or may not be valid, for example because the
    // validator was unable to access a referenced YAML file, or because
    // enhancements using protobuf's Any type were used.
    LEVEL_WARNING = 2;

    // Error diagnostic. The presence of error diagnostics indicates that
    // the plan is invalid.
    LEVEL_ERROR = 3;
  }
}

// Representation of a comment made by the validator that is only intended
// to be interpreted by people. A comment is a flat sequence of elements;
// formatting such as line breaks and lists is expressed by marker elements
// interleaved with the text spans.
message Comment {
  // Comments consist of one or more "elements," defining formatting
  // information.
  repeated Element elements = 1;

  // A single element of a comment: either a span of text or a formatting
  // marker.
  message Element {
    oneof kind {
      // A span of text.
      Span span = 1;

      // A newline, i.e. the next span should start on the next line.
      google.protobuf.Empty new_line = 2;

      // Opens a new unordered list. The next span is the start of the text for
      // the next item. list_next elements are used to advance to the next list
      // item; newlines can be used to add paragraphs without bullet points.
      // Each list_open should be matched with a list_close. Lists may be
      // nested.
      google.protobuf.Empty list_open = 3;

      // Advances to the next list item of the innermost open list.
      google.protobuf.Empty list_next = 4;

      // Closes the current (innermost) list.
      google.protobuf.Empty list_close = 5;
    }
  }

  // A span of text, optionally acting as a link.
  message Span {
    // Text for this span. Should not include newlines.
    string text = 1;

    // Specified if this span of text should link to something. If neither
    // variant is set, the span is not a link.
    oneof link {
      // Link to a path in the tree.
      Path path = 2;

      // Link to a web page.
      string url = 3;
    }
  }
}

// Information about the usage of a function. These appear in the tree as
// Node.Data entries.
message FunctionUsage {
  // URI of the YAML file that the function is (supposed to be) defined in,
  // if known.
  // NOTE(review): presumably left empty when the URI is not known; confirm.
  string uri = 1;

  // Compound name of the function, uniquely identifying a function
  // implementation within the scope of the URI.
  string compound_name = 2;

  // Simple name of the function, uniquely identifying the function behavior
  // description within the scope of the URI. If the function only has a
  // single implementation, this name may also be used to refer to it.
  string simple_name = 3;

  // If nonzero, points to a function extension definition elsewhere in the
  // tree; zero means that no definition is pointed to. All extension
  // definitions can be gathered by traversing the tree and looking for
  // ExtensionDefinition messages in the data associated with each node. Note
  // that extension IDs are only unique within a single tree.
  uint64 extension_id = 4;
}