// orql 0.1.0 - Docs.rs

//! Provides SQL parsing functions for the supported Oracle dialect.
//!

//! All exposed parsing functions accept a string slice and parse it into an
//! Abstract Syntax Tree (AST) with possibly associated metadata.  Published
//! AST nodes as well as metadata will reference the originally parsed input
//! string, allowing almost zero-copy parsing.
//!
//! The following examples demonstrate the current possibilities:
//!
//! # Parsing SQL into a compact AST without location data and comments
//! ```rust
//! # use orql::parser;
//! let sql = "select * from dual";
//! let stmts = parser::parse(sql).expect("bad sql!");
//! for stmt in stmts {
//!    println!("{:#?}", stmt);
//! }
//! ```
//! # Parsing SQL into an AST with token locations
//! ```rust
//!    use orql::{ast::*, parser::{Location, parse_with_locations}};
//!
//!    let sql = "select * from dual";
//!    let stmts = parse_with_locations(sql).expect("bad sql");
//!    // locate the "from" token ...
//!    let select = if let StatementType::Select(stmt) = &stmts[0].statement
//!        && let Select {
//!            query: Query { body: QueryBody { block: QueryBlock::Select(select), .. }, .. }, ..
//!        } = &**stmt
//!    {
//!        select
//!    } else {
//!        panic!("unexpected statement");
//!    };
//!    // ... and access its location
//!    assert_eq!(select.from.from_token.1, Location { line: 1, col: 10 });
//! ```
//!
//! # Parsing SQL into an AST with token locations and comments
//! ```rust
//!    use orql::{ast::*, parser::{parse_with_metadata, Location, Metadata}};
//!
//!    let sql = "  select /* foo */ 42 /* bar */ from dual;";
//!    let (stmts, meta) = parse_with_metadata(sql).expect("bad sql");
//!    // ~ get a reference to "select", "from", and "dual"
//!    let select = if let StatementType::Select(stmt) = &stmts[0].statement
//!        && let Select {
//!            query: Query { body: QueryBody { block: QueryBlock::Select(select), .. }, .. }, ..
//!        } = &**stmt
//!    {
//!        select
//!    } else {
//!        panic!("unexpected statement");
//!    };
//!
//!    // ~ see the comments of the SELECT token
//!    let select_token = &select.select_token;
//!    // ~ access the token's location
//!    assert_eq!(meta.location(select_token.1), Location {line: 1, col: 3});
//!    let (before, after) = meta.comments(select_token.1);
//!    // ~ no comments immediately before it
//!    assert!(before.is_empty());
//!    // ~ only one comment immediately after it
//!    assert_eq!(
//!        vec![" foo "],
//!        after.iter().map(|c| c.text).collect::<Vec<_>>()
//!    );
//!    // ~ similarly " bar " is associated as an _after_ comment with `42`
//!    // and, at the same time, as a _before_ comment with the `from` token
//! ```
//!
//! # Lazy SQL parsing with token locations and comments
//!
//! The parser's `parse*` functions process the whole input first in order to
//! return a vec of statements, failing the whole operation when encountering
//! an error.  Iteration by means of the `iter*` functions, on the other hand,
//! parses the input string lazily one statement at a time, which makes it
//! possible to process inputs containing invalid or unsupported statements.
//!
//! ```rust
//!    use orql::parser::{Iter, IterError, Metadata, iter_with_metadata};
//!    
//!    let sql = r"
//!    select 1 from dual; -- first statement
//!    invalid tokens ;
//!    select 2 from dual; -- second statement
//!    ";
//!    // ~ parses `sql` lazily one statement at a time
//!    let mut iter = iter_with_metadata(sql);
//!    while let Some(next) = iter.next() {
//!        match next {
//!            Ok(stmt) => {
//!                // ~ process the statement's structure
//!                println!("{:?}", stmt);
//!    
//!                // ~ demo: print the trailing comments
//!                if let Some(terminator) = stmt.terminator {
//!                    // ~ get hold of access to metadata
//!                    let meta = iter.metadata();
//!                    let (_, after_comments) = meta.comments(terminator.1);
//!                    for c in after_comments {
//!                        println!("/*{}*/", c.text);
//!                    }
//!                }
//!            }
//!            Err(IterError { skipped, error }) => {
//!                // ~ print the error and the whole section that failed to
//!                // parse and was skipped
//!                println!();
//!                println!("-- {error}");
//!                println!("{}", skipped.text);
//!                println!();
//!            }
//!        }
//!    }
//! ```

pub use crate::scanner::{CommentStyle, Location};
use crate::{
    ast::{Hint, Ident, Identifier, Node, Statement, StatementType},
    scanner::{Keyword, PeekableScanner, Scanner, Token, TokenType},
};

mod error;
pub use error::*;
mod meta;
pub use meta::{Comment, Id, Metadata};
use meta::{DefaultTracker, LocationsTracker, Tracker as MetaTracker, VoidTracker};
mod precedence;
use precedence::Prec;
mod iter;
pub use iter::*;

#[cfg(test)]
mod tests;

/// Expands to an `Err(Error::Unexpected { .. })` value reporting an
/// unexpected "end-of-file".
///
/// * `$parser` - an expression with a `next_loc` field; its value becomes the
///   error's location
/// * `$expected` - passed through unchanged as the error's `expected` field
///
/// The macro only builds the `Err(..)` value; it is up to the caller to
/// `return` it.
macro_rules! unexpected_eof_err {
    ($parser:expr, $expected:expr) => {
        Err($crate::parser::Error::Unexpected {
            unexpected: "end-of-file".into(),
            expected: $expected,
            loc: $parser.next_loc,
        })
    };
}

/// Consumes the next token of `$parser` (via `next_token()`) and binds it to
/// the pattern `$bind` in the enclosing scope.
///
/// Errors returned by `next_token()` are propagated with `?`.  If there is no
/// next token, or the token does not match `$bind`, the expansion panics.
///
/// Safe to use only after ensuring the token immediately beforehand via
/// [ParserInner::peek_token].
macro_rules! let_next_token {
    ($parser:expr, $bind:pat) => {
        let Some($bind) = $parser.next_token()? else {
            panic!("invalid token assumed");
        };
    };
}

/// Matches the type (`ttype`) of a token against the given arms and makes the
/// _enclosing function_ return an "unexpected token" error if no arm matches.
///
/// Three invocation forms are supported:
///
/// 1. `|tok| "expected" match { pat => handler, .. }` - `tok` is an already
///    available token; `tok.ttype` is matched against the arms.
/// 2. `|tok = parser.method()| "expected" match { .. }` - `parser` is a plain
///    identifier; forwards to the third form.
/// 3. `|tok = (parser_expr).method()| "expected" match { .. }` - fetches the
///    token by calling `parser_expr.method()?` first.  If that yields `None`
///    the enclosing function returns an "unexpected end-of-file" error (see
///    `unexpected_eof_err!`); otherwise continues as in the first form.
///
/// `"expected"` is a literal describing what was expected; it ends up in the
/// produced error.  Arms may carry `if` guards.
macro_rules! expect_token {
    // ~ form 1: the token is already bound to `$token_var`
    (
        |$token_var:ident| $expected:literal match {
            $($ttype:pat $(if $cond:expr)? => $handler:expr $(,)?)*
        }
    ) => {{
        match $token_var.ttype {
            $( $ttype $(if $cond)? => $handler, )*
            #[allow(unreachable_patterns)]
            _ => return Err($crate::parser::Error::unexpected_token($token_var, $expected)),
        }
    }};
    // ~ form 2: the parser is a plain identifier; wrap it into parens and
    // forward to form 3
    (
        |$token_var:ident = $parser:ident.$next_token:ident()| $expected:literal match {
            $($ttype:pat $(if $cond:expr)? => $handler:expr $(,)?)*
        }
    ) => {{
        expect_token!(|$token_var = ($parser).$next_token()| $expected match {
            $($ttype $(if $cond)? => $handler),*
        })
    }};
    // ~ form 3: fetch the token first (failing on end-of-file), then forward
    // to form 1
    (
        |$token_var:ident = ($parser:expr).$next_token:ident()| $expected:literal match {
            $($ttype:pat $(if $cond:expr)? => $handler:expr $(,)?)*
        }
    ) => {{
        let $token_var = {
            match $parser.$next_token()? {
                None => return unexpected_eof_err!($parser, $expected),
                Some(t) => t,
            }
        };
        expect_token! {
            |$token_var| $expected match {
                $( $ttype $(if $cond)? => $handler )*
            }
        }
    }};
}

/// A variant of `expect_token!` for tokens of the shape
/// `TokenType::Identifier(_, Some(reserved))`: the given arms are matched
/// against the `reserved` value instead of the token type itself.
///
/// The enclosing function returns an "unexpected token" error if the token is
/// not such an identifier or if none of the arms matches the `reserved`
/// value.
///
/// The token is either already available (`|tok| ..`) or fetched through
/// `|tok = parser.method()| ..` / `|tok = (parser_expr).method()| ..`; in the
/// latter case a `None` result makes the enclosing function return an
/// "unexpected end-of-file" error (see `unexpected_eof_err!`).
macro_rules! expect_reserved {
    // ~ the parser (if any) is a plain identifier; wrap it into parens and
    // forward to the arm below
    (
        |$token_var:ident $(= $parser:ident.$next_token:ident())?| $expected:literal match {
            $($ttype:pat $(if $cond:expr)? => $handler:expr $(,)?)*
        }
    ) => {{
        expect_reserved!(|$token_var $(= ($parser).$next_token())? | $expected match {
            $($ttype $(if $cond)? => $handler),*
        })
    }};
    (
        |$token_var:ident $(= ($parser:expr).$next_token:ident())?| $expected:literal match {
            $($ttype:pat $(if $cond:expr)? => $handler:expr $(,)?)*
        }
    ) => {{
        // ~ fetch the token only if a parser expression was supplied
        $(
             let $token_var = {
                match $parser.$next_token()? {
                  None => return unexpected_eof_err!($parser, $expected),
                  Some(t) => t,
                }
            };
        )?
        expect_token! {
            |$token_var| $expected match {
                $crate::scanner::TokenType::Identifier(_, Some(_reserved)) => {
                    // ~ dispatch on the reserved word itself
                    match _reserved {
                        $( $ttype $(if $cond)? => $handler, )*
                        #[allow(unreachable_patterns)]
                        _ => return Err($crate::parser::Error::unexpected_token($token_var, $expected)),
                    }
                }
            }
        }
    }};
}

mod condition;
mod expression;
mod order;
mod select;
mod window;

/// Parses the whole of `s` up front and returns the statements found.
///
/// The produced AST carries neither location nor comments metadata.  The
/// first error encountered fails the whole operation.
pub fn parse<'s>(s: &'s str) -> Result<Vec<Statement<'s, ()>>> {
    let (statements, _metadata) = parse_::<VoidTracker>(s)?;
    Ok(statements)
}

/// Lazily parses `s` into individual statements without location or
/// comments metadata.  Unlike [`parse`], iteration allows skipping
/// unparsable / invalid source sections.
///
/// The returned iterator yields one [`iter::IterItem`] per step.
pub fn iter<'s>(s: &'s str) -> impl Iterator<Item = iter::IterItem<'s, ()>> {
    iter_::<VoidTracker>(s)
}

/// Parses the whole of `s` up front and returns the statements found, with
/// AST nodes carrying their [`Location`] in the input.
///
/// Comments are not extracted.  The first error encountered fails the whole
/// operation.
pub fn parse_with_locations<'s>(s: &'s str) -> Result<Vec<Statement<'s, Location>>> {
    let (statements, _metadata) = parse_::<LocationsTracker>(s)?;
    Ok(statements)
}

/// Lazily parses `s` into individual statements with location data but no
/// comments.  Unlike [`parse_with_locations`], iteration allows
/// skipping unparsable / invalid source sections.
///
/// The returned iterator yields one [`iter::IterItem`] per step.
pub fn iter_with_locations<'s>(s: &'s str) -> impl Iterator<Item = iter::IterItem<'s, Location>> {
    iter_::<LocationsTracker>(s)
}

/// Eagerly parses `s` into individual statements along with location metadata
/// and comments for the resulting AST nodes.
///
/// Returns the statements together with a [`Metadata`] value; AST nodes carry
/// an [`Id`] which serves as the key for looking up their metadata.
pub fn parse_with_metadata<'s>(
    s: &'s str,
) -> Result<(Vec<Statement<'s, Id>>, impl Metadata<'s, NodeId = Id>)> {
    parse_::<DefaultTracker>(s)
}

/// Lazily parses `s` into individual statements with location and
/// comments metadata.  Unlike [`parse_with_metadata`], iteration
/// allows skipping unparsable / invalid source sections.
///
/// Access to metadata is provided through [`iter::Iter::metadata`].
pub fn iter_with_metadata<'s>(s: &'s str) -> impl iter::Iter<'s, Id> {
    iter_::<DefaultTracker>(s)
}

/// Shared implementation of the eager `parse*` functions; generic over the
/// metadata tracker `M`.
///
/// Parses all statements of `s`, then verifies that nothing but trailing
/// comments remains and hands out the metadata collected by the tracker.
#[allow(clippy::type_complexity)]
fn parse_<'s, M: MetaTracker<'s> + Default>(
    s: &'s str,
) -> Result<(Vec<Statement<'s, M::NodeId>>, M::Metadata)> {
    let mut parser = ParserInner::new(Scanner::new(s), M::default());
    let statements = parser.parse_statements()?;
    let metadata = parser.finish()?;
    Ok((statements, metadata))
}

/// Shared implementation of the lazy `iter*` functions; generic over the
/// metadata tracker `M`.
fn iter_<'s, M: MetaTracker<'s> + Default + 's>(
    s: &'s str,
) -> impl iter::Iter<'s, M::NodeId, Item = iter::IterItem<'s, M::NodeId>> {
    let parser = ParserInner::new(Scanner::new(s), M::default());
    parser.into_iter()
}

// ----------------------------------------------------------------------------

/// The parser's state: a peekable stream of tokens over the input string
/// plus the bookkeeping required while parsing.
///
/// `M` is the metadata tracker; all parsing methods are implemented for
/// `M: MetaTracker<'s>`.
struct ParserInner<'s, M> {
    /// the stream of tokens being consumed
    tokens: PeekableScanner<'s>,
    /// the next location, ie. the end location of the last _consumed_ token;
    /// see `Self::next_token`
    next_loc: Location,
    /// the current level of nesting via parentheses; incremented when we
    /// encounter an '(', decremented when encountering a ')'
    nest_level: usize,
    /// comments (and possibly other metadata) collector
    meta_tracker: M,
}

/// Allows passing a `ParserInner` directly to the generic parsing helpers
/// which are bounded by `P: AsMut<ParserInner<'s, M>>` (e.g.
/// `parse_parens` or `parse_comma_separated`).
impl<'s, M> AsMut<Self> for ParserInner<'s, M> {
    fn as_mut(&mut self) -> &mut Self {
        self
    }
}

impl<'s, M> ParserInner<'s, M>
where
    M: MetaTracker<'s>,
{
    /// Creates a parser consuming the tokens of `scanner` and reporting
    /// comments / nodes to `tracker`.  The parser starts out at nesting
    /// level zero with `next_loc` set to the scanner's initial location.
    fn new(scanner: Scanner<'s>, tracker: M) -> Self {
        let tokens = scanner.peekable();
        let next_loc = tokens.inner().location();
        Self {
            tokens,
            next_loc,
            nest_level: 0,
            meta_tracker: tracker,
        }
    }

    /// Retrieves and consumes the next token _without_ skipping comments.
    fn advance(&mut self) -> Result<Option<Token<'s>>> {
        let t = self.tokens.next();
        self.next_loc = self.tokens.location();
        t.map_or(Ok(None), |t| t.map(Some).map_err(Error::from))
    }

    /// Consumes tokens until one is found which the meta tracker does not
    /// accept as a comment and returns that token; `Ok(None)` if the input
    /// is exhausted first.
    fn next_token(&mut self) -> Result<Option<Token<'s>>> {
        loop {
            match self.advance()? {
                None => return Ok(None),
                // ~ a comment; it's been handed over to the tracker, move on
                Some(token) if self.meta_tracker.accept_comment(&token) => {}
                Some(token) => return Ok(Some(token)),
            }
        }
    }

    /// Consumes all comments directly ahead, handing each of them to the
    /// meta tracker, and stops in front of the first non-comment token (or
    /// at the end of the input).
    ///
    /// Returns the token stream's position and location as observed for the
    /// last skipped comment (read while that comment was the peeked token),
    /// or the position and location at the time of the call if no comment
    /// was skipped.
    fn skip_comments(&mut self) -> Result<(usize, Location)> {
        let mut end = (self.tokens.position(), self.tokens.location());
        while let Some(peeked) = self.tokens.peek() {
            let token = match peeked {
                Ok(token) => token,
                Err(e) => return Err(e.into()),
            };
            if !self.meta_tracker.accept_comment(token) {
                break;
            }
            end = (self.tokens.position(), self.tokens.location());
            self.advance()?;
        }
        Ok(end)
    }

    /// Peeks at the very next token and returns it (together with its
    /// location) if and only if it is a comment; returns `Ok(None)` for any
    /// other token and at the end of the input.
    ///
    /// The comment is neither consumed nor offered to the meta tracker by
    /// this method.  Unless it gets consumed through [Self::advance] first,
    /// it will be registered by a later call to [Self::peek_token],
    /// [Self::next_token], or [Self::skip_comments].
    fn peek_comment(&mut self) -> Result<Option<(&crate::scanner::Comment<'s>, &Location)>> {
        match self.tokens.peek() {
            None => Ok(None),
            Some(Err(e)) => Err(e.into()),
            Some(Ok(token)) => Ok(match &token.ttype {
                TokenType::Comment(comment) => Some((comment, &token.loc)),
                _ => None,
            }),
        }
    }

    /// Looks ahead at the next comment skipping comments.
    fn peek_token(&mut self) -> Result<Option<&Token<'s>>> {
        // ~ consume until the peek token is not a comment
        loop {
            match self.tokens.peek() {
                None => return Ok(None),
                Some(Err(e)) => return Err(e.into()),
                Some(Ok(t)) => {
                    if self.meta_tracker.accept_comment(t) {
                        self.advance()?;
                    } else {
                        break;
                    }
                }
            }
        }
        // ~ now take the peek (assuming it's not a comment)
        self.tokens
            .peek()
            .map_or(Ok(None), |t| t.as_ref().map(Some).map_err(Error::from))
    }

    /// Discards the next non-comment token, i.e. the one last seen through
    /// [Self::peek_token].
    fn consume_token(&mut self) -> Result<()> {
        self.next_token()?;
        Ok(())
    }

    /// Consumes the rest of the input (until encountering end-of-file)
    /// accepting only trailing whitespace and comments and returns collected
    /// metadata.
    fn finish(mut self) -> Result<M::Metadata> {
        // ~ next_token already skip whitespace and comments
        if let Some(t) = self.next_token()? {
            return Err(Error::unexpected_token(t, "end-of-file"));
        }
        Ok(self.meta_tracker.finish())
    }

    // ------------------------------------------------------------------------

    /// Parses statement after statement until the input is exhausted and
    /// returns them in source order; stops at and returns the first error.
    fn parse_statements(&mut self) -> Result<Vec<Statement<'s, M::NodeId>>> {
        // ~ `Ok(None)` ends the iteration, an `Err` short-circuits `collect`
        std::iter::from_fn(|| self.parse_statement_().transpose()).collect()
    }

    /// Parses a single statement returning `None` if there is no more input.
    fn parse_statement_(&mut self) -> Result<Option<Statement<'s, M::NodeId>>> {
        let statement = match self.peek_token()? {
            None => return Ok(None), // ~ done
            Some(t) => match t.ttype {
                TokenType::Semicolon => StatementType::Empty,
                TokenType::Keyword(Keyword::SELECT)
                | TokenType::Keyword(Keyword::WITH)
                | TokenType::LeftParen => self
                    .parse_select()
                    .map(|s| StatementType::Select(s.into()))?,
                _ => return Err(Error::unexpected_token(t, "a statement")), // ~ error
            },
        };
        Ok(Some(Statement {
            statement,
            terminator: match self.next_token()? {
                None => None,
                Some(Token {
                    ttype: TokenType::Semicolon,
                    loc,
                }) => Some(Node((), self.meta_tracker.on_node_start(loc))),
                Some(t) => {
                    return Err(Error::unexpected_token(t, "a semicolon or end-of-file"));
                }
            },
        }))
    }

    /// Parses an optional query optimizer hint, i.e. a comment whose text
    /// starts with a `+`.  To be called having parsed the `SELECT`,
    /// `UPDATE`, `INSERT`, `MERGE`, or `DELETE` keyword right _after_
    /// having allocated a node_id for it.
    ///
    /// Returns `Ok(None)`, leaving the token stream untouched, if the very
    /// next token is not such a comment.
    fn parse_hint(&mut self) -> Result<Option<Node<Hint<'s>, M::NodeId>>> {
        let Some((crate::scanner::Comment(text, style), loc)) = self.peek_comment()? else {
            return Ok(None);
        };
        // ~ only comments starting with '+' are hints; the '+' itself is
        // not part of the hint's text
        let Some(hint_text) = text.strip_prefix('+') else {
            return Ok(None);
        };
        let hint = Hint {
            text: hint_text,
            comment_style: *style,
        };
        let loc = *loc;
        // ~ consume the (comment) token without registering it with
        // the meta tracker. we treat this "hint" comment as a regular
        // non-comment node.
        self.advance()?;
        Ok(Some(Node(hint, self.meta_tracker.on_node_start(loc))))
    }

    /// Parses an expected, possibly qualified identifier: a simple
    /// identifier first, then any `.`-separated parts following it.
    fn parse_identifier(&mut self) -> Result<Identifier<'s, M::NodeId>> {
        self.parse_ident()
            .and_then(|first| self.parse_identifier_(first))
    }

    /// Parses the remainder of a possibly qualified identifier, e.g.
    /// `schema.table.column`, with the initial identifier part already
    /// parsed and given as `init_ident`.
    ///
    /// Returns [`Identifier::Simple`] if `init_ident` is not followed by a
    /// dot.  Otherwise all the `.`-separated parts (any number of them) are
    /// collected, in source order, into [`Identifier::Qualified`].
    ///
    /// Fails if a dot is not followed by an identifier token.
    fn parse_identifier_(
        &mut self,
        init_ident: Node<Ident<'s>, M::NodeId>,
    ) -> Result<Identifier<'s, M::NodeId>> {
        if !matches!(
            self.peek_token()?,
            Some(Token {
                ttype: TokenType::Dot,
                ..
            })
        ) {
            return Ok(Identifier::Simple(init_ident));
        }

        let mut parts = Vec::with_capacity(3);
        parts.push(init_ident);
        loop {
            // ~ consume the peeked '.'; this must happen in _every_ round,
            // otherwise the second (and any further) dot would be taken for
            // the expected identifier and rejected
            self.consume_token()?;

            // ~ don't associate comments (before the DOT) as "leading"
            // for the next identifier; only as "trailing" comments of the
            // preceding identifier (the one before the DOT)
            self.meta_tracker.on_node_end();

            expect_token! {
                |t = self.next_token()| "an identifier" match {
                    TokenType::Identifier(ident, _) => {
                        parts.push(Node(ident, self.meta_tracker.on_node_start(t.loc)))
                    }
                }
            }
            // ~ carry on only if yet another '.' follows
            if !matches!(
                self.peek_token()?,
                Some(Token {
                    ttype: TokenType::Dot,
                    ..
                })
            ) {
                break Ok(Identifier::Qualified(parts));
            }
        }
    }

    /// Parses a simple, single word identifier.
    ///
    /// Consumes the next non-comment token and, if it is an identifier,
    /// returns it as a node registered with the meta tracker at the token's
    /// location.  Any other token, as well as the end of the input, makes
    /// this method return an error (expecting "a simple identifier").
    fn parse_ident(&mut self) -> Result<Node<Ident<'s>, M::NodeId>> {
        expect_token!(|t = self.next_token()| "a simple identifier" match {
            TokenType::Identifier(ident, _) => Ok(Node(ident, self.meta_tracker.on_node_start(t.loc)))
        })
    }
}

// ----------------------------------------------------------------------------

/// Parses an expected parenthesized expression(s) using `f` as a consumer of
/// the token stream contained within the parens.
///
/// This is equivalent to consuming the left paren token and calling
/// [parse_opened_parens].
///
/// Fails if the next non-comment token is not an opening parenthesis or if
/// the input has ended; otherwise returns whatever [parse_opened_parens]
/// returns.
fn parse_parens<'s, M, P, T, F>(parser: &mut P, f: F) -> Result<T>
where
    P: AsMut<ParserInner<'s, M>>,
    M: MetaTracker<'s>,
    F: FnOnce(&mut P, M::NodeId) -> Result<T>,
{
    // ~ `inner` is used only to fetch the token; the handler below goes
    // through `parser` again
    let inner = parser.as_mut();
    expect_token!(|t = inner.next_token()| "an opening parenthesis" match {
        TokenType::LeftParen => parse_opened_parens(parser, t.into(), f)
    })
}

/// Input parameter to [parse_opened_parens] denoting either a
/// prefetched opening parenthesis or that parenthesis represented already as
/// an AST node identifier.
enum OpenedParen<'s, ID> {
    /// the consumed opening parenthesis token; a node id is yet to be
    /// allocated for it
    Token(Token<'s>),
    /// the node id already allocated for the opening parenthesis
    NodeId(ID),
}

/// Wraps a token into [OpenedParen::Token]; no check is made here that the
/// token actually is an opening parenthesis.
impl<'s, ID> From<Token<'s>> for OpenedParen<'s, ID> {
    fn from(value: Token<'s>) -> Self {
        Self::Token(value)
    }
}

/// Assuming an opening paren has been encountered, executes `f` with `parser`
/// as argument and with the `node_id` of the opened paren (newly allocated
/// if `opened_paren` is a token, taken as is otherwise).
/// Keeping track of proper nesting level expects a closing parenthesis
/// afterwards. Returns the result of `f`.
///
/// Fails if `f` fails, or if the token following whatever `f` consumed is
/// not a closing parenthesis.
///
/// NOTE(review): `nest_level` is incremented before calling `f` but
/// decremented only on the success path; when `f` or the check for the
/// closing parenthesis fails the level stays raised.  Presumably error
/// recovery elsewhere accounts for this - TODO confirm.
fn parse_opened_parens<'s, M, P, T, F>(
    parser: &mut P,
    opened_paren: OpenedParen<'s, M::NodeId>,
    f: F,
) -> Result<T>
where
    P: AsMut<ParserInner<'s, M>>,
    M: MetaTracker<'s>,
    F: FnOnce(&mut P, M::NodeId) -> Result<T>,
{
    let inner = parser.as_mut();
    let node_id = match opened_paren {
        OpenedParen::Token(token) => inner.meta_tracker.on_node_start(token.loc),
        OpenedParen::NodeId(id) => id,
    };
    // ~ don't associate comments after '(' with the
    // parsed expression as its "leading comments"
    inner.meta_tracker.on_node_end();
    inner.nest_level += 1;
    let result = f(parser, node_id)?;
    // ~ `parser` was handed to `f`; obtain the inner parser anew
    let inner = parser.as_mut();
    expect_token! {
        |t = inner.next_token()| "a closing parenthesis" match {
            TokenType::RightParen => {}
        }
    }
    inner.meta_tracker.on_node_end_restore(node_id);
    inner.nest_level -= 1;
    Ok(result)
}

/// Parses a comma separated list of items, each of them produced by one
/// call to `f`.  At least one item is expected.  Parsing stops successfully
/// at the first non-comma token following an item.
fn parse_comma_separated<'s, M, P, T, F>(parser: &mut P, mut f: F) -> Result<Vec<T>>
where
    P: AsMut<ParserInner<'s, M>>,
    M: MetaTracker<'s>,
    F: FnMut(&mut P) -> Result<T>,
{
    f(parser).and_then(|first| parse_comma_separated_rest(parser, first, f))
}

/// Continues a comma separated list whose first element, `initial`, has
/// been parsed already: as long as a comma follows, it is consumed and the
/// next element is parsed by `f`.
///
/// Possibly no further element is found at all; parsing simply stops
/// successfully at the first non-comma token.  The result starts with
/// `initial` and lists the other elements in source order.
fn parse_comma_separated_rest<'s, M, P, T, F>(
    parser: &mut P,
    initial: T,
    mut f: F,
) -> Result<Vec<T>>
where
    P: AsMut<ParserInner<'s, M>>,
    M: MetaTracker<'s>,
    F: FnMut(&mut P) -> Result<T>,
{
    let mut items = vec![initial];
    loop {
        let inner = parser.as_mut();
        let comma_ahead = matches!(
            inner.peek_token()?,
            Some(Token {
                ttype: TokenType::Comma,
                ..
            })
        );
        if !comma_ahead {
            return Ok(items);
        }
        inner.meta_tracker.on_node_end();
        inner.consume_token()?;
        items.push(f(parser)?);
    }
}