postgresql-cst-parser 0.2.0

An unofficial PostgreSQL CST parser written in Pure Rust.
Documentation
#[macro_use]
mod generated;
pub mod lexer_ported;
pub mod parser_error;
mod util;

use std::collections::HashMap;

use parser_error::{ParserError, ScanReport};

use self::generated::State;

pub const NAMEDATALEN: usize = 64;

#[allow(dead_code)]
#[derive(Debug, Clone)]
pub enum Yylval {
    Str(String),
    I(i32),
    Keyword(String),
    Uninitialized,
}

pub struct Lexer {
    pub input: String,
    pub index_bytes: usize,
    pub state: State,
    pub yyleng: usize,

    pub literal: String,

    // user state
    pub xcdepth: usize,
    pub yyllocend_bytes: usize,
    pub state_before_str_stop: State,
    pub yylloc_stack: Vec<usize>,
    pub dolqstart: String,
    pub warn_on_first_escape: bool,
    pub saw_non_ascii: bool,
    pub utf16_first_part: u32,

    // states
    pub yylval: Yylval,
    pub yylloc_bytes: usize,

    #[cfg(feature = "regex-match")]
    pub rules: Vec<Rule>,
    pub keyword_map: HashMap<&'static str, &'static str>,
    pub reports: Vec<ScanReport>,
}

#[cfg(feature = "regex-match")]
pub struct Rule {
    pub state: State,
    pub pattern: regex::bytes::Regex,
    pub eof: bool,
    pub kind: self::generated::RuleKind,
}

#[allow(clippy::all)]
// #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum TokenKind {
    RAW(String),
    IDENT,
    KEYWORD(String),
    C_COMMENT,
    SQL_COMMENT,
    EOF,
    BCONST,
    XCONST,
    SCONST,
    USCONST,
    UIDENT,
    TYPECAST,
    DOT_DOT,
    COLON_EQUALS,
    EQUALS_GREATER,
    LESS_EQUALS,
    GREATER_EQUALS,
    NOT_EQUALS,
    Op,
    PARAM,
    FCONST,
    ICONST,
}

impl TokenKind {
    #[allow(dead_code)]
    pub fn to_id(&self) -> String {
        match self {
            TokenKind::RAW(s) => format!("'{}'", s),
            TokenKind::IDENT => "IDENT".to_string(),
            TokenKind::KEYWORD(s) => s.to_string(),
            TokenKind::C_COMMENT => "C_COMMENT".to_string(),
            TokenKind::SQL_COMMENT => "SQL_COMMENT".to_string(),
            TokenKind::EOF => "EOF".to_string(),
            TokenKind::BCONST => "BCONST".to_string(),
            TokenKind::XCONST => "XCONST".to_string(),
            TokenKind::SCONST => "SCONST".to_string(),
            TokenKind::USCONST => "USCONST".to_string(),
            TokenKind::UIDENT => "UIDENT".to_string(),
            TokenKind::TYPECAST => "TYPECAST".to_string(),
            TokenKind::DOT_DOT => "DOT_DOT".to_string(),
            TokenKind::COLON_EQUALS => "COLON_EQUALS".to_string(),
            TokenKind::EQUALS_GREATER => "EQUALS_GREATER".to_string(),
            TokenKind::LESS_EQUALS => "LESS_EQUALS".to_string(),
            TokenKind::GREATER_EQUALS => "GREATER_EQUALS".to_string(),
            TokenKind::NOT_EQUALS => "NOT_EQUALS".to_string(),
            TokenKind::Op => "Op".to_string(),
            TokenKind::PARAM => "PARAM".to_string(),
            TokenKind::FCONST => "FCONST".to_string(),
            TokenKind::ICONST => "ICONST".to_string(),
        }
    }
}

impl<T> From<T> for TokenKind
where
    T: AsRef<str>,
{
    fn from(s: T) -> Self {
        match s.as_ref() {
            "IDENT" => TokenKind::IDENT,
            "C_COMMENT" => TokenKind::C_COMMENT,
            "SQL_COMMENT" => TokenKind::SQL_COMMENT,
            "EOF" => TokenKind::EOF,
            "BCONST" => TokenKind::BCONST,
            "XCONST" => TokenKind::XCONST,
            "SCONST" => TokenKind::SCONST,
            "USCONST" => TokenKind::USCONST,
            "UIDENT" => TokenKind::UIDENT,
            "TYPECAST" => TokenKind::TYPECAST,
            "DOT_DOT" => TokenKind::DOT_DOT,
            "COLON_EQUALS" => TokenKind::COLON_EQUALS,
            "EQUALS_GREATER" => TokenKind::EQUALS_GREATER,
            "LESS_EQUALS" => TokenKind::LESS_EQUALS,
            "GREATER_EQUALS" => TokenKind::GREATER_EQUALS,
            "NOT_EQUALS" => TokenKind::NOT_EQUALS,
            "Op" => TokenKind::Op,
            "PARAM" => TokenKind::PARAM,
            "FCONST" => TokenKind::FCONST,
            "ICONST" => TokenKind::ICONST,
            s if s.starts_with('\'') => TokenKind::RAW(s.to_string()), // TODO is it necessary?
            s => TokenKind::KEYWORD(s.to_string()),                    // TODO check if keyword
        }
    }
}

// #[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone)]
pub struct Token {
    pub start_byte_pos: usize,
    pub end_byte_pos: usize,
    pub kind: TokenKind,
    pub value: String,
}

pub fn lex(input: &str) -> Result<Vec<Token>, ParserError> {
    let mut lexer = Lexer::new(input);

    let mut tokens = vec![];
    while let Some(kind) = lexer.parse_token()? {
        // dbg!(&kind);
        if kind == TokenKind::EOF {
            break;
        }

        let start_byte_pos = lexer.yylloc_bytes;
        let end_byte_pos = if matches!(
            kind,
            TokenKind::SCONST
                | TokenKind::BCONST
                | TokenKind::XCONST
                | TokenKind::IDENT
                | TokenKind::C_COMMENT
        ) {
            lexer.yyllocend_bytes
        } else {
            lexer.yylloc_bytes + lexer.yyleng
        };

        tokens.push(Token {
            start_byte_pos,
            end_byte_pos,
            kind,
            value: input[start_byte_pos..end_byte_pos].to_string(),
        });
        lexer.advance();
    }

    // dbg!(&tokens);
    Ok(tokens)
}