use std::io::BufRead;
use std::iter::Peekable;
use utf8_chars::{BufReadCharsExt, Chars};
use super::dialect::keywords::ALL_KEYWORDS;
use super::dialect::Dialect;
use std::collections::VecDeque;
use std::fmt;
/// A single lexical token produced by the SQL tokenizer.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// A keyword or (possibly quoted) identifier; see [`Word`].
    Word(Word),
    /// A numeric literal, kept as its original source text.
    Number(String),
    /// Any single character the tokenizer did not otherwise recognize.
    Char(char),
    /// The contents of a `'...'` literal (surrounding quotes stripped).
    SingleQuotedString(String),
    /// The contents of an `N'...'` national string literal.
    NationalStringLiteral(String),
    /// The contents of an `X'...'` / `x'...'` hex string literal.
    HexStringLiteral(String),
    /// `,`
    Comma,
    /// Whitespace or a comment; see [`Whitespace`].
    Whitespace(Whitespace),
    /// `=`
    Eq,
    /// Not-equal; stores the two source characters (`!=` or `<>`).
    Neq([char; 2]),
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `<=`
    LtEq,
    /// `>=`
    GtEq,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Mult,
    /// `/`
    Div,
    /// `%`
    Mod,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `.`
    Period,
    /// `:`
    Colon,
    /// `::`
    DoubleColon,
    /// `;`
    SemiColon,
    /// `\`
    Backslash,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `&`
    Ampersand,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
}
impl fmt::Display for Token {
    /// Renders the token back into its SQL surface syntax.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Variants carrying data are formatted directly; every other
        // variant has a fixed spelling looked up in the table below.
        match self {
            Token::Word(w) => return write!(f, "{}", w),
            Token::Number(n) => return f.write_str(n),
            Token::Char(c) => return write!(f, "{}", c),
            Token::SingleQuotedString(s) => return write!(f, "'{}'", s),
            Token::NationalStringLiteral(s) => return write!(f, "N'{}'", s),
            Token::HexStringLiteral(s) => return write!(f, "X'{}'", s),
            Token::Whitespace(ws) => return write!(f, "{}", ws),
            Token::Neq(pair) => return write!(f, "{}{}", pair[0], pair[1]),
            _ => {}
        }
        f.write_str(match self {
            Token::Comma => ",",
            Token::Eq => "=",
            Token::Lt => "<",
            Token::Gt => ">",
            Token::LtEq => "<=",
            Token::GtEq => ">=",
            Token::Plus => "+",
            Token::Minus => "-",
            Token::Mult => "*",
            Token::Div => "/",
            Token::Mod => "%",
            Token::LParen => "(",
            Token::RParen => ")",
            Token::Period => ".",
            Token::Colon => ":",
            Token::DoubleColon => "::",
            Token::SemiColon => ";",
            Token::Backslash => "\\",
            Token::LBracket => "[",
            Token::RBracket => "]",
            Token::Ampersand => "&",
            Token::LBrace => "{",
            Token::RBrace => "}",
            // All payload-carrying variants returned above.
            _ => unreachable!(),
        })
    }
}
impl Token {
    /// Builds a `Token::Word` from raw identifier text and an optional
    /// opening-quote character. An unquoted word whose uppercased form is
    /// in `ALL_KEYWORDS` gets that uppercased text stored in `keyword`;
    /// otherwise `keyword` is empty.
    pub fn new(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        // Quoting forces identifier semantics: only unquoted words can be keywords.
        let is_keyword = quote_style.is_none() && ALL_KEYWORDS.contains(&word_uppercase.as_str());
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if is_keyword {
                word_uppercase
            } else {
                String::new()
            },
        })
    }

    /// Returns the token's textual payload: the bare word or string-literal
    /// contents when there is one, otherwise the token's `Display` form.
    pub fn get_value(&self) -> String {
        match self {
            Token::Word(word) => word.value.clone(),
            Token::SingleQuotedString(s)
            | Token::NationalStringLiteral(s)
            | Token::HexStringLiteral(s) => s.clone(),
            _ => self.to_string(),
        }
    }

    /// Parses a `Token::Number` as `f64`; returns `None` for any other
    /// variant or when the stored text does not parse.
    pub fn get_number(&self) -> Option<f64> {
        match self {
            Token::Number(number) => number.parse().ok(),
            _ => None,
        }
    }

    /// True when this token is a word that was recognized as a SQL keyword.
    pub fn is_keyword(&self) -> bool {
        match self {
            Token::Word(word) => !word.keyword.is_empty(),
            _ => false,
        }
    }

    /// Returns the opening-quote character of a quoted identifier, or
    /// `None` for unquoted words and non-word tokens.
    pub fn get_quote_style(&self) -> Option<char> {
        match self {
            Token::Word(word) => word.quote_style,
            _ => None,
        }
    }
}
impl From<Token> for String {
    /// Converts a token into its SQL text via the `Display` impl.
    fn from(token: Token) -> String {
        // `to_string` routes through `Display`; idiomatic replacement
        // for `format!("{}", token)`.
        token.to_string()
    }
}
/// Payload of a [`Token::Word`]: a keyword or identifier.
#[derive(Debug, Clone, PartialEq)]
pub struct Word {
    /// The word's text with any surrounding quotes stripped.
    pub value: String,
    /// Opening quote character (`"`, `[` or a backtick) for delimited
    /// identifiers, `None` for bare words.
    pub quote_style: Option<char>,
    /// Uppercased keyword text when `value` is an unquoted SQL keyword;
    /// the empty string otherwise.
    pub keyword: String,
}
impl fmt::Display for Word {
    /// Writes the word back out, re-wrapping it in its original quoting
    /// style when one is present.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            None => f.write_str(&self.value),
            Some(open) if open == '"' || open == '[' || open == '`' => {
                let close = Word::matching_end_quote(open);
                write!(f, "{}{}{}", open, self.value, close)
            }
            // Any other stored quote character is a programming error.
            Some(_) => panic!("Unexpected quote_style!"),
        }
    }
}
impl Word {
    /// Returns the closing delimiter that pairs with the given opening
    /// quote character (`"` -> `"`, `[` -> `]`, backtick -> backtick).
    /// Panics on any other character.
    fn matching_end_quote(ch: char) -> char {
        if ch == '"' {
            '"'
        } else if ch == '[' {
            ']'
        } else if ch == '`' {
            '`'
        } else {
            panic!("unexpected quoting style!")
        }
    }
}
/// Whitespace and comments preserved by the tokenizer so the original
/// text can be reconstructed.
#[derive(Debug, Clone, PartialEq)]
pub enum Whitespace {
    /// A single space character.
    Space,
    /// A single newline (CR and CRLF are normalized to this).
    Newline,
    /// A single tab character.
    Tab,
    /// A `--` comment; the stored text excludes the `--` prefix but
    /// includes the trailing newline when one was present.
    SingleLineComment(String),
    /// A `/* ... */` comment; the stored text excludes the delimiters.
    MultiLineComment(String),
}
impl fmt::Display for Whitespace {
    /// Writes the whitespace back out exactly as it appears in SQL text.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::SingleLineComment(text) => write!(f, "--{}", text),
            Whitespace::MultiLineComment(text) => write!(f, "/*{}*/", text),
            simple => f.write_str(match simple {
                Whitespace::Space => " ",
                Whitespace::Tab => "\t",
                Whitespace::Newline => "\n",
                // Comment variants were handled in the outer match.
                _ => unreachable!(),
            }),
        }
    }
}
/// An error raised during tokenization, carrying a human-readable message.
#[derive(Debug, PartialEq)]
pub struct TokenizerError(String);
/// A peekable SQL tokenizer over a buffered UTF-8 character stream,
/// parameterized by a [`Dialect`] for identifier/quoting rules.
pub struct Tokenizer<'a, R: BufRead, D: Dialect> {
    // Dialect hooks deciding identifier-start/part and delimited-identifier chars.
    dialect: D,
    /// The underlying character stream (UTF-8 decoded, one-char lookahead).
    pub query: Peekable<Chars<'a, R>>,
    /// Current line, 1-based. NOTE(review): never advanced anywhere in this
    /// file after construction, so error positions always report line 1 —
    /// confirm whether position tracking was intended.
    pub line: u64,
    /// Current column, 1-based. NOTE(review): never advanced — see `line`.
    pub col: u64,
    // FIFO of tokens already lexed by `peek_token` or pushed back via
    // `pushback_token`; drained before new input is lexed.
    peeked_tokens: VecDeque<Token>,
}
impl<'a, R: BufRead, D: Dialect> Tokenizer<'a, R, D> {
    /// Creates a tokenizer reading characters from `query` using the
    /// quoting/identifier rules of `dialect`.
    pub fn new(dialect: D, query: &'a mut R) -> Self {
        Self {
            dialect,
            query: query.chars().peekable(),
            line: 1,
            col: 1,
            peeked_tokens: VecDeque::new(),
        }
    }

    /// Returns a clone of the token `n` positions ahead (0 = the very next
    /// token) without consuming it, lexing and buffering tokens as needed.
    ///
    /// NOTE(review): both end-of-input and a genuine tokenizer error are
    /// collapsed into the same "Unexpected EOF." error here, discarding the
    /// underlying error message — confirm callers don't need it.
    pub fn peek_token(&mut self, n: usize) -> Result<Option<Token>, TokenizerError> {
        if self.peeked_tokens.len() <= n {
            // Lex just enough additional tokens to make index `n` available.
            let tokens_to_peek = n - self.peeked_tokens.len() + 1;
            for _ in 0..tokens_to_peek {
                match self.internal_next_token() {
                    Ok(Some(token)) => {
                        self.peeked_tokens.push_back(token);
                    }
                    _ => return Err(TokenizerError("Unexpected EOF.".to_string())),
                }
            }
        }
        Ok(Some(self.peeked_tokens[n].clone()))
    }

    /// Pushes `token` back so it becomes the next token returned by
    /// `next_token` (LIFO with respect to repeated pushbacks).
    pub fn pushback_token(&mut self, token: Token) {
        self.peeked_tokens.push_front(token);
    }

    /// Returns the next token, consuming it. Buffered (peeked or pushed-back)
    /// tokens are drained first; `Ok(None)` signals end of input.
    pub fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
        if let Some(token) = self.peeked_tokens.pop_front() {
            return Ok(Some(token));
        }
        self.internal_next_token()
    }

    /// Lexes one token directly from the character stream, bypassing the
    /// pushback buffer. Returns `Ok(None)` at end of input — and also when
    /// the underlying reader yields a decode/IO error, which this match
    /// does not distinguish from EOF.
    ///
    /// NOTE(review): `self.line`/`self.col` appear in the `!` error message
    /// below but are never advanced in this impl, so reported positions are
    /// always "Line: 1, Col: 1".
    fn internal_next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
        match self.query.peek() {
            Some(Ok(ch)) => match *ch {
                ' ' => self.consume_and_return(Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Normalize both lone CR and CRLF to a single Newline token.
                    self.query.next();
                    if let Some(Ok('\n')) = self.query.peek() {
                        self.query.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                'N' => {
                    // `N'...'` is a national string literal; a bare `N...`
                    // falls back to an ordinary word starting with 'N'.
                    self.query.next();
                    match self.query.peek() {
                        Some(Ok('\'')) => {
                            let s = self.tokenize_single_quoted_string();
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            let s = self.tokenize_word('N');
                            Ok(Some(Token::new(&s, None)))
                        }
                    }
                }
                x @ 'x' | x @ 'X' => {
                    // `X'...'` / `x'...'` is a hex string literal; otherwise
                    // the x/X starts an ordinary word.
                    self.query.next();
                    match self.query.peek() {
                        Some(Ok('\'')) => {
                            let s = self.tokenize_single_quoted_string();
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            let s = self.tokenize_word(x);
                            Ok(Some(Token::new(&s, None)))
                        }
                    }
                }
                // Dialect decides which characters may begin an identifier.
                ch if self.dialect.is_identifier_start(ch) => {
                    self.query.next();
                    let s = self.tokenize_word(ch);
                    Ok(Some(Token::new(&s, None)))
                }
                '\'' => {
                    let s = self.tokenize_single_quoted_string();
                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // Delimited identifier, e.g. "col", [col] or `col`.
                quote_start if self.dialect.is_delimited_identifier_start(quote_start) => {
                    self.query.next();
                    let quote_end = Word::matching_end_quote(quote_start);
                    let s = self.peeking_take_while(|_tok, ch| ch != quote_end);
                    match self.query.next() {
                        Some(Ok(ch)) if ch == quote_end => {
                            Ok(Some(Token::new(&s, Some(quote_start))))
                        }
                        _ => Err(TokenizerError(format!(
                            "Expected close delimiter '{}' before EOF.",
                            quote_end
                        ))),
                    }
                }
                '0'..='9' => {
                    // Unsigned numeric literal. '.' is accepted anywhere, so
                    // text like "1.2.3" lexes as one Number token; validity
                    // is left to the number parser.
                    let s = self.peeking_take_while(|_tok, ch| match ch {
                        '0'..='9' | '.' => true,
                        _ => false,
                    });
                    Ok(Some(Token::Number(s)))
                }
                '(' => self.consume_and_return(Token::LParen),
                ')' => self.consume_and_return(Token::RParen),
                ',' => self.consume_and_return(Token::Comma),
                '-' => {
                    self.query.next();
                    match self.query.peek() {
                        Some(Ok('-')) => {
                            // `--` single-line comment; the trailing newline
                            // (when present) is included in the comment text.
                            self.query.next();
                            let mut s = self.peeking_take_while(|_tok, ch| ch != '\n');
                            if let Some(Ok(ch)) = self.query.next() {
                                assert_eq!(ch, '\n');
                                s.push(ch);
                            }
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment(s))))
                        }
                        Some(Ok('0'..='9')) => {
                            // NOTE(review): '-' directly followed by a digit
                            // is lexed as a signed Number, so `a-1` becomes
                            // [a, -1] rather than [a, Minus, 1] — confirm the
                            // parser compensates for this.
                            let s = self.peeking_take_while(|_tok, ch| match ch {
                                '0'..='9' | '.' => true,
                                _ => false,
                            });
                            Ok(Some(Token::Number(format!("-{}", s))))
                        }
                        _ => Ok(Some(Token::Minus)),
                    }
                }
                '/' => {
                    self.query.next();
                    match self.query.peek() {
                        Some(Ok('*')) => {
                            // `/*` opens a multi-line comment.
                            self.query.next();
                            self.tokenize_multiline_comment()
                        }
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(Token::Plus),
                '*' => self.consume_and_return(Token::Mult),
                '%' => self.consume_and_return(Token::Mod),
                '=' => self.consume_and_return(Token::Eq),
                '.' => self.consume_and_return(Token::Period),
                '!' => {
                    // Only `!=` is valid; a lone '!' is a tokenizer error.
                    self.query.next();
                    match self.query.peek() {
                        Some(Ok('=')) => self.consume_and_return(Token::Neq(['!', '='])),
                        _ => Err(TokenizerError(format!(
                            "Tokenizer Error at Line: {}, Col: {}",
                            self.line, self.col
                        ))),
                    }
                }
                '<' => {
                    self.query.next();
                    match self.query.peek() {
                        Some(Ok('=')) => self.consume_and_return(Token::LtEq),
                        Some(Ok('>')) => self.consume_and_return(Token::Neq(['<', '>'])),
                        _ => Ok(Some(Token::Lt)),
                    }
                }
                '>' => {
                    self.query.next();
                    match self.query.peek() {
                        Some(Ok('=')) => self.consume_and_return(Token::GtEq),
                        _ => Ok(Some(Token::Gt)),
                    }
                }
                ':' => {
                    self.query.next();
                    match self.query.peek() {
                        Some(Ok(':')) => self.consume_and_return(Token::DoubleColon),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(Token::SemiColon),
                '\\' => self.consume_and_return(Token::Backslash),
                '[' => self.consume_and_return(Token::LBracket),
                ']' => self.consume_and_return(Token::RBracket),
                '&' => self.consume_and_return(Token::Ampersand),
                '{' => self.consume_and_return(Token::LBrace),
                '}' => self.consume_and_return(Token::RBrace),
                // Anything unrecognized is passed through as a Char token.
                other => self.consume_and_return(Token::Char(other)),
            },
            // EOF, or a read/decode error from the underlying reader.
            _ => Ok(None),
        }
    }

    /// Collects identifier-part characters (per the dialect) and returns the
    /// full word beginning with `first_char` (already consumed by the caller).
    fn tokenize_word(&mut self, first_char: char) -> String {
        let mut s = first_char.to_string();
        s.push_str(&self.peeking_take_while(|tok, ch| tok.dialect.is_identifier_part(ch)));
        s
    }

    /// Reads a single-quoted string, consuming the opening quote. Doubled
    /// quotes (`''`) and recognized backslash escapes are kept verbatim in
    /// the returned text; the surrounding quotes are stripped.
    ///
    /// NOTE(review): an unterminated string (EOF before the closing quote)
    /// returns silently with whatever was read, and input ending in a lone
    /// `\` panics on the `unwrap` below — confirm these inputs can't occur.
    fn tokenize_single_quoted_string(&mut self) -> String {
        let chars = &mut self.query;
        let mut s = String::new();
        // Consume the opening quote.
        chars.next();
        while let Some(Ok(ch)) = chars.peek() {
            match *ch {
                '\'' => {
                    chars.next();
                    // `''` is an escaped quote; a single `'` ends the string.
                    let escaped_quote = chars
                        .peek()
                        .map(|c| c.as_ref().unwrap() == &'\'')
                        .unwrap_or(false);
                    if escaped_quote {
                        s.push('\'');
                        s.push('\'');
                        chars.next();
                    } else {
                        break;
                    }
                }
                '\\' => {
                    chars.next();
                    let next_char = chars.peek().unwrap().as_ref().unwrap();
                    // Recognized escape sequences are kept verbatim; any
                    // other character after `\` terminates the string early.
                    if next_char == &'\\'
                        || next_char == &'\''
                        || next_char == &'\"'
                        || next_char == &'n'
                        || next_char == &'t'
                        || next_char == &'r'
                        || next_char == &'0'
                    {
                        s.push('\\');
                        s.push(*next_char);
                        chars.next();
                    } else {
                        break;
                    }
                }
                ch => {
                    chars.next();
                    s.push(ch);
                }
            }
        }
        s
    }

    /// Reads the body of a `/* ... */` comment (the opener was consumed by
    /// the caller) up to the first `*/`. Nested comments are not supported.
    fn tokenize_multiline_comment(&mut self) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        // True when the previous character was '*' (possible start of "*/").
        let mut maybe_closing_comment = false;
        loop {
            match self.query.next() {
                Some(Ok(ch)) => {
                    if maybe_closing_comment {
                        if ch == '/' {
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        } else {
                            // The previous '*' was not a terminator; keep it.
                            s.push('*');
                        }
                    }
                    maybe_closing_comment = ch == '*';
                    if !maybe_closing_comment {
                        s.push(ch);
                    }
                }
                _ => {
                    break Err(TokenizerError(
                        "Unexpected EOF while in a multi-line comment".to_string(),
                    ));
                }
            }
        }
    }

    /// Consumes exactly one character and returns `t` — helper for tokens
    /// that map 1:1 to a character already seen via `peek`.
    fn consume_and_return(&mut self, t: Token) -> Result<Option<Token>, TokenizerError> {
        self.query.next();
        Ok(Some(t))
    }

    /// Consumes and collects characters while `predicate` holds, leaving the
    /// first failing character un-consumed. The tokenizer itself is passed to
    /// the predicate so it can consult `self.dialect`.
    fn peeking_take_while(
        &mut self,
        mut predicate: impl FnMut(&mut Tokenizer<'a, R, D>, char) -> bool,
    ) -> String {
        let mut s = String::new();
        while let Some(Ok(ch)) = self.query.peek() {
            let ch = *ch;
            if predicate(self, ch) {
                self.query.next();
                s.push(ch);
            } else {
                break;
            }
        }
        s
    }
}