//! SQL Tokenizer
//!
//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
//! The tokens then serve as the input for the parser, which builds an
//! Abstract Syntax Tree (AST).

use std::fmt;
use std::iter::Peekable;
use std::str::Chars;

use super::dialect::keywords::ALL_KEYWORDS;
use super::dialect::Dialect;
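/// A SQL token, the unit of output produced by the tokenizer.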
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
SQLWord(SQLWord),
Number(String),
Char(char),
SingleQuotedString(String),
NationalStringLiteral(String),
Comma,
Whitespace(Whitespace),
Eq,
Neq,
Lt,
Gt,
LtEq,
GtEq,
Plus,
Minus,
Mult,
Div,
Mod,
LParen,
RParen,
Period,
Colon,
DoubleColon,
SemiColon,
Backslash,
LBracket,
RBracket,
Ampersand,
LBrace,
RBrace,
}
impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Token::SQLWord(ref w) => write!(f, "{}", w),
            Token::Number(ref n) => f.write_str(n),
            Token::Char(ref c) => write!(f, "{}", c),
            Token::SingleQuotedString(ref s) => write!(f, "'{}'", s),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{}", ws),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("!="),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mult => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
        }
    }
}
impl Token {
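    /// Build a token for a known (unquoted) keyword.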
pub fn make_keyword(keyword: &str) -> Self {
Token::make_word(keyword, None)
}
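    /// Build a word token, detecting whether an unquoted word matches a
    /// known keyword.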
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        // Only unquoted words can be recognized as keywords.
        let is_keyword = quote_style.is_none() && ALL_KEYWORDS.contains(&word_uppercase.as_str());
        Token::SQLWord(SQLWord {
            value: word.to_string(),
            quote_style,
            keyword: if is_keyword {
                word_uppercase
            } else {
                "".to_string()
            },
        })
    }
}
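/// A keyword (like SELECT) or an optionally quoted SQL identifier.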
#[derive(Debug, Clone, PartialEq)]
pub struct SQLWord {
    /// The value of the token, without the enclosing quotes.
    pub value: String,
    /// The starting quote character, if the identifier is quoted
    /// (`None` for a regular identifier or keyword).
    pub quote_style: Option<char>,
    /// The uppercase form of the word if it is a recognized keyword;
    /// an empty string otherwise.
    pub keyword: String,
}
impl fmt::Display for SQLWord {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, SQLWord::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}
impl SQLWord {
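    /// Map a starting quote character to its matching closing character.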
fn matching_end_quote(ch: char) -> char {
match ch {
'"' => '"',
'[' => ']',
'`' => '`',
_ => panic!("unexpected quoting style!"),
}
}
}
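/// Whitespace (spaces, newlines, tabs) and SQL comments.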
#[derive(Debug, Clone, PartialEq)]
pub enum Whitespace {
Space,
Newline,
Tab,
SingleLineComment(String),
MultiLineComment(String),
}
impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment(s) => write!(f, "--{}", s),
            Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
        }
    }
}
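/// An error encountered while tokenizing, carrying a human-readable message.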
#[derive(Debug, PartialEq)]
pub struct TokenizerError(String);
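/// SQL Tokenizer, which produces a vector of tokens from a query string
/// according to the rules of the given [`Dialect`].
///
/// A minimal usage sketch (using the `GenericSqlDialect` from the tests
/// below; any [`Dialect`] implementation works the same way):
///
/// ```ignore
/// let dialect = GenericSqlDialect {};
/// let mut tokenizer = Tokenizer::new(&dialect, "SELECT 1");
/// let tokens = tokenizer.tokenize().unwrap();
/// assert_eq!(tokens.len(), 3); // SELECT, a space, and the number 1
/// ```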
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
pub query: String,
pub line: u64,
pub col: u64,
}
impl<'a> Tokenizer<'a> {
    pub fn new(dialect: &'a dyn Dialect, query: &str) -> Self {
Self {
dialect,
query: query.to_string(),
line: 1,
col: 1,
}
}
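    /// Tokenize the statement and produce a vector of tokens, tracking the
    /// current line and column for error reporting.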
pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
let mut peekable = self.query.chars().peekable();
let mut tokens: Vec<Token> = vec![];
while let Some(token) = self.next_token(&mut peekable)? {
match &token {
Token::Whitespace(Whitespace::Newline) => {
self.line += 1;
self.col = 1;
}
                // Assume a tab advances the column count by four.
                Token::Whitespace(Whitespace::Tab) => self.col += 4,
                Token::SQLWord(w) if w.quote_style.is_none() => self.col += w.value.len() as u64,
                // A quoted word also accounts for its two quote characters.
                Token::SQLWord(w) if w.quote_style.is_some() => {
                    self.col += w.value.len() as u64 + 2
                }
Token::Number(s) => self.col += s.len() as u64,
Token::SingleQuotedString(s) => self.col += s.len() as u64,
_ => self.col += 1,
}
tokens.push(token);
}
Ok(tokens)
}
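    /// Get the next token, or `None` at end of input.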
fn next_token(&self, chars: &mut Peekable<Chars>) -> Result<Option<Token>, TokenizerError> {
match chars.peek() {
Some(&ch) => match ch {
' ' => {
chars.next();
Ok(Some(Token::Whitespace(Whitespace::Space)))
}
'\t' => {
chars.next();
Ok(Some(Token::Whitespace(Whitespace::Tab)))
}
'\n' => {
chars.next();
Ok(Some(Token::Whitespace(Whitespace::Newline)))
}
                // `N` needs a special case: `N'...'` is a national string
                // literal, while any other `N...` is an ordinary word.
                'N' => {
chars.next();
match chars.peek() {
Some('\'') => {
let s = self.tokenize_single_quoted_string(chars);
Ok(Some(Token::NationalStringLiteral(s)))
}
_ => {
let s = self.tokenize_word('N', chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
ch if self.dialect.is_identifier_start(ch) => {
chars.next();
let s = self.tokenize_word(ch, chars);
Ok(Some(Token::make_word(&s, None)))
}
'\'' => {
let s = self.tokenize_single_quoted_string(chars);
Ok(Some(Token::SingleQuotedString(s)))
}
quote_start if self.dialect.is_delimited_identifier_start(quote_start) => {
let mut s = String::new();
chars.next();
let quote_end = SQLWord::matching_end_quote(quote_start);
while let Some(ch) = chars.next() {
match ch {
c if c == quote_end => break,
_ => s.push(ch),
}
}
Ok(Some(Token::make_word(&s, Some(quote_start))))
}
                '0'..='9' => {
let mut s = String::new();
while let Some(&ch) = chars.peek() {
match ch {
                            '0'..='9' | '.' => {
chars.next();
s.push(ch);
}
_ => break,
}
}
Ok(Some(Token::Number(s)))
}
'(' => self.consume_and_return(chars, Token::LParen),
')' => self.consume_and_return(chars, Token::RParen),
',' => self.consume_and_return(chars, Token::Comma),
                // `-` may start a single-line comment (`--`) or stand alone
                // as the minus operator.
                '-' => {
chars.next();
match chars.peek() {
Some('-') => {
chars.next();
let mut s = String::new();
loop {
match chars.next() {
Some(ch) if ch != '\n' => {
s.push(ch);
}
other => {
if other.is_some() {
s.push('\n');
}
break Ok(Some(Token::Whitespace(
Whitespace::SingleLineComment(s),
)));
}
}
}
}
_ => Ok(Some(Token::Minus)),
}
}
                // `/` may start a multi-line comment (`/*`) or stand alone
                // as the division operator.
                '/' => {
chars.next();
match chars.peek() {
Some('*') => {
chars.next();
self.tokenize_multiline_comment(chars)
}
_ => Ok(Some(Token::Div)),
}
}
'+' => self.consume_and_return(chars, Token::Plus),
'*' => self.consume_and_return(chars, Token::Mult),
'%' => self.consume_and_return(chars, Token::Mod),
'=' => self.consume_and_return(chars, Token::Eq),
'.' => self.consume_and_return(chars, Token::Period),
                '!' => {
                    chars.next();
                    match chars.peek() {
                        Some(&ch) => match ch {
                            '=' => self.consume_and_return(chars, Token::Neq),
                            _ => Err(TokenizerError(format!(
                                "Tokenizer Error at Line: {}, Col: {}: expected '=' after '!'",
                                self.line, self.col
                            ))),
                        },
                        None => Err(TokenizerError(format!(
                            "Tokenizer Error at Line: {}, Col: {}: unexpected EOF after '!'",
                            self.line, self.col
                        ))),
                    }
                }
'<' => {
chars.next();
match chars.peek() {
Some(&ch) => match ch {
'=' => self.consume_and_return(chars, Token::LtEq),
'>' => self.consume_and_return(chars, Token::Neq),
_ => Ok(Some(Token::Lt)),
},
None => Ok(Some(Token::Lt)),
}
}
'>' => {
chars.next();
match chars.peek() {
Some(&ch) => match ch {
'=' => self.consume_and_return(chars, Token::GtEq),
_ => Ok(Some(Token::Gt)),
},
None => Ok(Some(Token::Gt)),
}
}
':' => {
chars.next();
match chars.peek() {
Some(&ch) => match ch {
':' => self.consume_and_return(chars, Token::DoubleColon),
_ => Ok(Some(Token::Colon)),
},
None => Ok(Some(Token::Colon)),
}
}
';' => self.consume_and_return(chars, Token::SemiColon),
'\\' => self.consume_and_return(chars, Token::Backslash),
'[' => self.consume_and_return(chars, Token::LBracket),
']' => self.consume_and_return(chars, Token::RBracket),
'&' => self.consume_and_return(chars, Token::Ampersand),
'{' => self.consume_and_return(chars, Token::LBrace),
'}' => self.consume_and_return(chars, Token::RBrace),
other => self.consume_and_return(chars, Token::Char(other)),
},
None => Ok(None),
}
}
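    /// Tokenize an identifier or keyword, whose first character has already
    /// been consumed from `chars`.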
fn tokenize_word(&self, first_char: char, chars: &mut Peekable<Chars>) -> String {
let mut s = String::new();
s.push(first_char);
while let Some(&ch) = chars.peek() {
if self.dialect.is_identifier_part(ch) {
chars.next();
s.push(ch);
} else {
break;
}
}
s
}
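    /// Read a single-quoted string, consuming both enclosing quotes. Note
    /// that escaped quotes (`''`) are not handled here, and an unterminated
    /// string is silently accepted at end of input.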
fn tokenize_single_quoted_string(&self, chars: &mut Peekable<Chars>) -> String {
let mut s = String::new();
chars.next();
while let Some(&ch) = chars.peek() {
match ch {
'\'' => {
chars.next();
break;
}
_ => {
chars.next();
s.push(ch);
}
}
}
s
}
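    /// Consume the body of a multi-line comment, assuming the opening `/*`
    /// has already been consumed; returns an error if EOF is reached before
    /// the closing `*/`.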
fn tokenize_multiline_comment(
&self,
chars: &mut Peekable<Chars>,
) -> Result<Option<Token>, TokenizerError> {
let mut s = String::new();
        // True when the previous character was a `*`, which closes the
        // comment if followed by a `/`.
        let mut maybe_closing_comment = false;
loop {
match chars.next() {
Some(ch) => {
if maybe_closing_comment {
if ch == '/' {
break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
} else {
s.push('*');
}
}
maybe_closing_comment = ch == '*';
if !maybe_closing_comment {
s.push(ch);
}
}
None => {
break Err(TokenizerError(
"Unexpected EOF while in a multi-line comment".to_string(),
));
}
}
}
}
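    /// Consume the character at the head of `chars` and return the given
    /// token.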
fn consume_and_return(
&self,
chars: &mut Peekable<Chars>,
t: Token,
) -> Result<Option<Token>, TokenizerError> {
chars.next();
Ok(Some(t))
}
}
#[cfg(test)]
mod tests {
use super::super::dialect::GenericSqlDialect;
use super::*;
#[test]
fn tokenize_select_1() {
let sql = String::from("SELECT 1");
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_scalar_function() {
let sql = String::from("SELECT sqrt(1)");
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_word("sqrt", None),
Token::LParen,
Token::Number(String::from("1")),
Token::RParen,
];
compare(expected, tokens);
}
#[test]
fn tokenize_simple_select() {
let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mult,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::make_word("id", None),
Token::Whitespace(Whitespace::Space),
Token::Eq,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1")),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("LIMIT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("5")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_string_predicate() {
let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mult,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::make_word("salary", None),
Token::Whitespace(Whitespace::Space),
Token::Neq,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString(String::from("Not Provided")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_invalid_string() {
let sql = String::from("\nمصطفىh");
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
println!("tokens: {:#?}", tokens);
let expected = vec![
Token::Whitespace(Whitespace::Newline),
Token::Char('م'),
Token::Char('ص'),
Token::Char('ط'),
Token::Char('ف'),
Token::Char('ى'),
Token::make_word("h", None),
];
compare(expected, tokens);
}
#[test]
fn tokenize_invalid_string_cols() {
let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
println!("tokens: {:#?}", tokens);
let expected = vec![
Token::Whitespace(Whitespace::Newline),
Token::Whitespace(Whitespace::Newline),
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mult,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("table"),
Token::Whitespace(Whitespace::Tab),
Token::Char('م'),
Token::Char('ص'),
Token::Char('ط'),
Token::Char('ف'),
Token::Char('ى'),
Token::make_word("h", None),
];
compare(expected, tokens);
}
#[test]
fn tokenize_is_null() {
let sql = String::from("a IS NULL");
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::make_word("a", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("IS"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("NULL"),
];
compare(expected, tokens);
}
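    #[test]
    fn tokenize_national_string_literal() {
        // `N'...'` is tokenized as a national string literal, exercising the
        // special case for `N` in `next_token`.
        let sql = String::from("SELECT N'foo'");
        let dialect = GenericSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::NationalStringLiteral(String::from("foo")),
        ];
        compare(expected, tokens);
    }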
#[test]
fn tokenize_comment() {
let sql = String::from("0--this is a comment\n1");
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::Number("0".to_string()),
Token::Whitespace(Whitespace::SingleLineComment(
"this is a comment\n".to_string(),
)),
Token::Number("1".to_string()),
];
compare(expected, tokens);
}
#[test]
fn tokenize_comment_at_eof() {
let sql = String::from("--this is a comment");
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![Token::Whitespace(Whitespace::SingleLineComment(
"this is a comment".to_string(),
))];
compare(expected, tokens);
}
#[test]
fn tokenize_multiline_comment() {
let sql = String::from("0/*multi-line\n* /comment*/1");
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::Number("0".to_string()),
Token::Whitespace(Whitespace::MultiLineComment(
"multi-line\n* /comment".to_string(),
)),
Token::Number("1".to_string()),
];
compare(expected, tokens);
}
#[test]
fn tokenize_multiline_comment_with_even_asterisks() {
let sql = String::from("\n/** Comment **/\n");
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::Whitespace(Whitespace::Newline),
Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
Token::Whitespace(Whitespace::Newline),
];
compare(expected, tokens);
}
fn compare(expected: Vec<Token>, actual: Vec<Token>) {
assert_eq!(expected, actual);
}
}