substrait-validator 0.0.11

Substrait validator
Documentation
// SPDX-License-Identifier: Apache-2.0

//! Module for representing namespaces used within extension definitions.

use crate::output::diagnostic;
use crate::output::extension;
use crate::parse::context;
use crate::parse::traversal;
use std::collections::HashMap;
use std::sync::Arc;

/// Reference to a nested namespace structure.
pub type Reference<T> = Arc<Definition<T>>;

/// Nested namespace structure.
///
/// There are a lot of things "weird" about this structure, because Substrait
/// is defined horribly; everything is a valid identifier (including
/// identifiers with periods in them, so a period is ambiguous between a
/// namespace separator and just part of an identifier), matching is
/// case-insensitive (but for the validator we do want to store what the user's
/// case convention was for a definition), and for type variations names
/// need not even be unique (arguably there's a different variation namespace
/// for each type class, but the type class isn't yet known when a type
/// variation anchor is defined, and that's where we want to resolve the name;
/// see <https://github.com/substrait-io/substrait/issues/268>).
///
/// So here we have an abomination of a namespace system that allows for name
/// conflicts and period ambiguity and just tries to deal with it as cleanly as
/// it can.
#[derive(Debug)]
pub struct Definition<T> {
    /// Members of the namespace. The key is stored in lowercase.
    members: HashMap<String, Vec<InternalReference<T>>>,
}

impl<T> Clone for Definition<T> {
    fn clone(&self) -> Self {
        Self {
            members: self.members.clone(),
        }
    }
}

impl<T> Default for Definition<T> {
    fn default() -> Self {
        Self {
            members: Default::default(),
        }
    }
}

impl<T> Definition<T> {
    /// Creates a new, empty namespace.
    pub fn new() -> Self {
        Default::default()
    }

    /// Defines an item with the given (case-insensitive) name. If an item
    /// going by a conflicting name already existed, the new item is inserted
    /// anyway; if the caller is concerned about that, it must first resolve
    /// the name and throw a warning or error based on its result.
    pub fn define_item<S: AsRef<str>>(&mut self, name: S, item: Arc<T>, public: bool) {
        let name = name.as_ref();
        self.members
            .entry(name.to_ascii_lowercase())
            .or_default()
            .push(InternalReference {
                member: Arc::new(Member::Item(item)),
                original_name: name.to_string(),
                public,
            })
    }

    /// Registers a nested namespace with the given (case-insensitive) name.
    /// Same rules for duplicates as define_item(). If None is specified for
    /// item, the namespace reference is registered, but its contents are
    /// left unspecified.
    pub fn define_nested<S: AsRef<str>>(
        &mut self,
        name: S,
        item: Option<Reference<T>>,
        public: bool,
    ) {
        let name = name.as_ref();
        self.members
            .entry(name.to_ascii_lowercase())
            .or_default()
            .push(InternalReference {
                member: Arc::new(if let Some(item) = item {
                    Member::Nested(item)
                } else {
                    Member::UnresolvedNested
                }),
                original_name: name.to_string(),
                public,
            })
    }

    /// Helper function that determines whether a reference is visible, based
    /// on:
    ///  - whether this is a local lookup (private members visible) or an
    ///    external lookup (private members not visible);
    ///  - whether we had to traverse into namespaces in order to get to the
    ///    reference we're looking at;
    ///  - whether all of the above namespaces were public;
    ///  - whether the reference we're looking at is public.
    fn visibility(
        local: bool,
        with_prefix: bool,
        prefix_public: bool,
        reference_public: bool,
    ) -> bool {
        // Clippy doesn't like this, but the logic is hard enough to understand
        // without being buried in a big boolean expression.
        #[allow(clippy::if_same_then_else)]
        #[allow(clippy::needless_bool)]
        if local && !with_prefix {
            // Everything defined locally is visible.
            true
        } else if (prefix_public || !with_prefix) && reference_public {
            // If all references in the path are public, the item is visible.
            true
        } else {
            // Otherwise, the item is not visible.
            false
        }
    }

    /// Helper function for resolve. Resolves all references defined in the
    /// current namespace and its children recursively.
    ///
    ///  - target: the result object that matching elements get pushed into.
    ///  - local: whether this is a local lookup (i.e. private members of
    ///    the root namespace are visible).
    ///  - prefix: if specified, the concatenated original names of the
    ///    parent namespaces that were already recursed into.
    ///  - suffix: the name that we need to resolve locally.
    ///  - prefix_public: whether the prefix is public or private. Should be
    ///    true if there is no prefix.
    fn resolve_internal(
        &self,
        target: &mut ResolutionResult<T>,
        local: bool,
        prefix: Option<&str>,
        suffix: &str,
        prefix_public: bool,
    ) {
        // Resolve locally-defined elements.
        if let Some(references) = self.members.get(&suffix.to_ascii_lowercase()) {
            for reference in references {
                let original_name = if let Some(prefix) = prefix {
                    format!("{prefix}.{}", reference.original_name)
                } else {
                    reference.original_name.clone()
                };
                let visible =
                    Self::visibility(local, prefix.is_some(), prefix_public, reference.public);
                let target_vec = if visible {
                    &mut target.visible
                } else {
                    &mut target.invisible
                };
                target_vec.push((original_name, reference.member.clone()))
            }
        }

        // If the suffix has periods in it, try splitting at every period it
        // has and seeing if that prefix resolves to any nested namespace.
        for (split_point, _) in suffix.char_indices().filter(|(_, c)| *c == '.') {
            let namespace_name = &suffix[..split_point];
            let new_suffix = &suffix[split_point + 1..];
            if let Some(references) = self.members.get(&namespace_name.to_ascii_lowercase()) {
                for reference in references {
                    match reference.member.as_ref() {
                        Member::Item(_) => (),
                        Member::Nested(nested) => {
                            let new_prefix = if let Some(prefix) = prefix {
                                format!("{prefix}.{}", reference.original_name)
                            } else {
                                reference.original_name.clone()
                            };
                            nested.resolve_internal(
                                target,
                                local,
                                Some(&new_prefix),
                                new_suffix,
                                prefix_public && reference.public,
                            );
                        }
                        Member::UnresolvedNested => {
                            let visible = Self::visibility(
                                local,
                                prefix.is_some(),
                                prefix_public,
                                reference.public,
                            );
                            if visible {
                                target.visible_incomplete = true;
                            } else {
                                target.invisible_incomplete = true;
                            };
                        }
                    }
                }
            }
        }
    }

    /// Resolves a name to all items with the same name visible from within this
    /// namespace (so, including private items).
    pub fn resolve_local<R>(&self, reference: R) -> ResolutionResult<T>
    where
        R: Into<extension::reference::Data<T>>,
    {
        let reference = reference.into();
        let name = reference.name.name().unwrap_or("!").to_string();
        let mut result = ResolutionResult::new(reference);
        self.resolve_internal(&mut result, true, None, &name, true);
        result
    }

    /// Resolves a name to all items with the same name visible from outside
    /// this namespace (so, excluding private items).
    pub fn resolve_public<R>(&self, reference: R) -> ResolutionResult<T>
    where
        R: Into<extension::reference::Data<T>>,
    {
        let reference = reference.into();
        let name = reference.name.name().unwrap_or("!").to_string();
        let mut result = ResolutionResult::new(reference);
        self.resolve_internal(&mut result, false, None, &name, true);
        result
    }
}

/// Reference to a namespace member.
#[derive(Debug)]
struct InternalReference<T> {
    /// The member that is being referred to.
    pub member: Arc<Member<T>>,

    /// The name of the member using its original case convention.
    pub original_name: String,

    /// Whether this member has public visibility.
    pub public: bool,
}

impl<T> Clone for InternalReference<T> {
    fn clone(&self) -> Self {
        Self {
            member: self.member.clone(),
            original_name: self.original_name.clone(),
            public: self.public,
        }
    }
}

/// A namespace member.
#[derive(Debug)]
enum Member<T> {
    /// A leaf item.
    Item(Arc<T>),

    /// A nested namespace.
    Nested(Reference<T>),

    /// A nested namespace of which the contents were not resolved.
    UnresolvedNested,
}

impl<T> Clone for Member<T> {
    fn clone(&self) -> Self {
        match self {
            Self::Item(x) => Self::Item(x.clone()),
            Self::Nested(x) => Self::Nested(x.clone()),
            Self::UnresolvedNested => Self::UnresolvedNested,
        }
    }
}

impl<T> Member<T> {
    /// Returns the embedded item if this is an item.
    pub fn as_item(&self) -> Option<Arc<T>> {
        match self {
            Member::Item(x) => Some(x.clone()),
            _ => None,
        }
    }

    /// Returns the embedded potentially-unresolved namespace if this is a
    /// namespace.
    pub fn as_namespace(&self) -> Option<Option<Reference<T>>> {
        match self {
            Member::Nested(x) => Some(Some(x.clone())),
            Member::UnresolvedNested => Some(None),
            _ => None,
        }
    }
}

/// Result structure for a namespace resolution.
#[derive(Clone, Debug)]
pub struct ResolutionResult<T> {
    /// The reference that was being resolved.
    unresolved_reference: extension::reference::Data<T>,

    /// Members that are visible from the point where the resolution was
    /// performed.
    visible: Vec<(String, Arc<Member<T>>)>,

    /// Whether there might be more visible members, defined within
    /// unresolved namespaces.
    visible_incomplete: bool,

    /// Members that are not visible from the point where the resolution was
    /// performed. These are enumerated nonetheless to improve the quality of
    /// error messages.
    invisible: Vec<(String, Arc<Member<T>>)>,

    /// Whether there might be more invisible members, defined within
    /// unresolved namespaces.
    invisible_incomplete: bool,

    /// Whether a filter has been applied and visible items were removed by
    /// it.
    filtered: bool,
}

impl<T> std::fmt::Display for ResolutionResult<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.unresolved_reference)?;
        if self.visible_incomplete {
            match self.visible.len() {
                0 => write!(
                    f,
                    " (no known matching definitions, namespace not resolved)"
                ),
                1 => write!(f, " (one or more matching definitions)"),
                n => write!(f, " ({n} or more matching definitions)"),
            }
        } else {
            match self.visible.len() {
                0 => write!(f, " (no matching definitions)"),
                1 => write!(f, " (one matching definition)"),
                n => write!(f, " ({n} matching definitions)"),
            }
        }
    }
}

impl<T> ResolutionResult<T> {
    /// Creates a new, empty resolution result for the given unresolved
    /// reference, to be used as a placeholder when no namespace is actually
    /// available. It will behave as if resolution failed because the namespace
    /// being looked in was itself not resolved. If an item is passed via the
    /// reference, it will be returned by as_item(). as_namespace() will return
    /// None.
    pub fn new(unresolved_reference: extension::reference::Data<T>) -> Self {
        Self {
            unresolved_reference,
            visible: vec![],
            visible_incomplete: true,
            invisible: vec![],
            invisible_incomplete: false,
            filtered: false,
        }
    }

    /// Internal helper for the various filter functions.
    fn filter<F: Fn(&(String, Arc<Member<T>>)) -> bool>(mut self, filter: F) -> Self {
        let size = self.visible.len();
        self.visible.retain(&filter);
        self.invisible.retain(&filter);
        if self.visible.len() < size {
            self.filtered = true;
        }
        self
    }

    /// Filter the resulting items by some predicate. This also filters out
    /// namespaces.
    pub fn filter_items<F: Fn(&T) -> bool>(self, filter: F) -> Self {
        self.filter(|x| {
            x.1.as_item()
                .map(|x| filter(x.as_ref()))
                .unwrap_or_default()
        })
    }

    /// Filter the resulting items by some predicate. This also filters out
    /// namespaces.
    pub fn filter_all_items(self) -> Self {
        self.filter(|x| x.1.as_item().is_some())
    }

    /// Filter out anything that isn't a namespace.
    pub fn filter_namespaces(self) -> Self {
        self.filter(|x| x.1.as_namespace().is_some())
    }

    /// Helper function for expect_one() and expect_multiple().
    fn expect<F1, F2>(
        self,
        parse_context: &mut context::Context,
        if_not_applicable: F1,
        if_ambiguous: F2,
        allow_ambiguity: bool,
        optional: bool,
    ) -> Self
    where
        F1: FnOnce(String, &mut context::Context) -> bool,
        F2: FnOnce(String, &mut context::Context) -> bool,
    {
        let mut visible = self.visible.iter();
        if visible.next().is_some() {
            if visible.next().is_some()
                && !allow_ambiguity
                && !if_ambiguous(self.unresolved_reference.to_string(), parse_context)
            {
                traversal::push_diagnostic(
                    parse_context,
                    diagnostic::Level::Error,
                    cause!(
                        LinkAmbiguousName,
                        "multiple definitions are in scope for {}",
                        self.unresolved_reference
                    ),
                )
            }
        } else if self.visible_incomplete || optional {
            // A visible namespace was not resolved (in which case we
            // optimistically assume the item exists) or the item doesn't need
            // to exist.
        } else if !self.invisible.is_empty() {
            traversal::push_diagnostic(
                parse_context,
                diagnostic::Level::Error,
                cause!(
                    LinkUnresolvedName,
                    "a definition for {} exists, but is not visible from here",
                    self.unresolved_reference
                ),
            );
        } else if self.invisible_incomplete {
            traversal::push_diagnostic(
                parse_context,
                diagnostic::Level::Error,
                cause!(
                    LinkUnresolvedName,
                    "a definition for {} may exist, but would not be visible from here",
                    self.unresolved_reference
                ),
            );
        } else if self.filtered {
            if !if_not_applicable(self.unresolved_reference.to_string(), parse_context) {
                traversal::push_diagnostic(
                    parse_context,
                    diagnostic::Level::Error,
                    cause!(
                        LinkUnresolvedName,
                        "a definition for {} exists, but it cannot be used in this context",
                        self.unresolved_reference
                    ),
                )
            }
        } else {
            traversal::push_diagnostic(
                parse_context,
                diagnostic::Level::Error,
                cause!(
                    LinkUnresolvedName,
                    "no definition found for {}",
                    self.unresolved_reference
                ),
            );
        }
        self
    }

    /// Expects a single item, yielding diagnostics if this isn't the case. If
    /// ambiguous or if all matching elements were not applicable (filtered
    /// out), the specified functions are called. They receive the reference
    /// name and the parse context as arguments to form a suitable diagnostic
    /// message. If they return false, a default diagnostic message will be
    /// emitted instead.
    pub fn expect_one<F1, F2>(
        self,
        parse_context: &mut context::Context,
        if_not_applicable: F1,
        if_ambiguous: F2,
    ) -> Self
    where
        F1: FnOnce(String, &mut context::Context) -> bool,
        F2: FnOnce(String, &mut context::Context) -> bool,
    {
        self.expect(parse_context, if_not_applicable, if_ambiguous, false, false)
    }

    /// Expects zero or one item(s), yielding diagnostics if this isn't the
    /// case. If ambiguous, the specified function is called. It receives the
    /// reference name and the parse context as arguments to form a suitable
    /// diagnostic message. If it returns false, a default diagnostic message
    /// will be emitted instead.
    pub fn expect_not_ambiguous<F>(
        self,
        parse_context: &mut context::Context,
        if_ambiguous: F,
    ) -> Self
    where
        F: FnOnce(String, &mut context::Context) -> bool,
    {
        self.expect(parse_context, |_, _| true, if_ambiguous, false, true)
    }

    /// Expects a one or more items, yielding diagnostics if this isn't the
    /// case. If all matching elements were not applicable (filtered out), the
    /// specified functions are called. They receive the reference name and the
    /// parse context as arguments to form a suitable diagnostic message. If
    /// they return false, a default diagnostic message will be emitted
    /// instead.
    pub fn expect_multiple<F>(
        self,
        parse_context: &mut context::Context,
        if_not_applicable: F,
    ) -> Self
    where
        F: FnOnce(String, &mut context::Context) -> bool,
    {
        self.expect(parse_context, if_not_applicable, |_, _| true, true, false)
    }

    /// Silently returns the first matching item, if any. If there are none,
    /// this just returns an unresolved reference. Use
    /// filter_items().expect_one() to formulate error messages if there are
    /// multiple or no items available.
    pub fn as_item(&self) -> extension::reference::Reference<T> {
        let mut data = self.unresolved_reference.clone();
        if let Some(item) = self.visible.iter().filter_map(|x| x.1.as_item()).next() {
            data.definition.replace(item);
        }
        Arc::new(data)
    }

    /// Silently returns the first matching item, if any. Unlike as_item(),
    /// this returns None if there are no matches.
    pub fn as_opt_item(&self) -> Option<extension::reference::Reference<T>> {
        self.visible
            .iter()
            .filter_map(|x| x.1.as_item())
            .next()
            .map(|item| {
                let mut data = self.unresolved_reference.clone();
                data.definition.replace(item);
                Arc::new(data)
            })
    }

    /// Silently returns the first matching namespace. Use
    /// filter_namespaces().expect_one() to formulate error messages if there
    /// are multiple or no namespaces available.
    pub fn as_namespace(&self) -> Option<Reference<T>> {
        self.visible
            .iter()
            .filter_map(|x| x.1.as_namespace())
            .next()
            .flatten()
    }

    /// Return an error if one or more definitions were found for this name
    /// resolution, to be used just before defining a new item.
    pub fn expect_not_yet_defined(&self) -> diagnostic::Result<()> {
        if self.visible.is_empty() {
            Ok(())
        } else {
            Err(cause!(
                LinkDuplicateDefinition,
                "{} is already defined",
                self.unresolved_reference
            ))
        }
    }
}