1use std::io::BufRead;
20use std::iter::Peekable;
21use utf8_chars::{BufReadCharsExt, Chars};
22
23use super::dialect::keywords::ALL_KEYWORDS;
24use super::dialect::Dialect;
25use std::collections::VecDeque;
26use std::fmt;
27
/// A single lexical token produced by the SQL tokenizer.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// A keyword (uppercased match against `ALL_KEYWORDS` when unquoted)
    /// or an optionally quoted identifier.
    Word(Word),
    /// A numeric literal, kept as its source text (may carry a leading `-`).
    Number(String),
    /// Any single character the tokenizer has no specific rule for.
    Char(char),
    /// Single-quoted string literal: `'...'`.
    SingleQuotedString(String),
    /// National string literal: `N'...'`.
    NationalStringLiteral(String),
    /// Hexadecimal string literal: `X'...'` or `x'...'`.
    HexStringLiteral(String),
    /// `,`
    Comma,
    /// Whitespace or a comment (see [`Whitespace`]).
    Whitespace(Whitespace),
    /// `=`
    Eq,
    /// Inequality operator, preserving the original spelling (`!=` or `<>`).
    Neq([char; 2]),
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `<=`
    LtEq,
    /// `>=`
    GtEq,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Mult,
    /// `/`
    Div,
    /// `%`
    Mod,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `.`
    Period,
    /// `:`
    Colon,
    /// `::`
    DoubleColon,
    /// `;`
    SemiColon,
    /// `\`
    Backslash,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `&`
    Ampersand,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
}
94
95impl fmt::Display for Token {
96 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
97 match self {
98 Token::Word(ref w) => write!(f, "{}", w),
99 Token::Number(ref n) => f.write_str(n),
100 Token::Char(ref c) => write!(f, "{}", c),
101 Token::SingleQuotedString(ref s) => write!(f, "'{}'", s),
102 Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
103 Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
104 Token::Comma => f.write_str(","),
105 Token::Whitespace(ws) => write!(f, "{}", ws),
106 Token::Eq => f.write_str("="),
107 Token::Neq(values) => write!(f, "{}{}", values[0], values[1]),
108 Token::Lt => f.write_str("<"),
109 Token::Gt => f.write_str(">"),
110 Token::LtEq => f.write_str("<="),
111 Token::GtEq => f.write_str(">="),
112 Token::Plus => f.write_str("+"),
113 Token::Minus => f.write_str("-"),
114 Token::Mult => f.write_str("*"),
115 Token::Div => f.write_str("/"),
116 Token::Mod => f.write_str("%"),
117 Token::LParen => f.write_str("("),
118 Token::RParen => f.write_str(")"),
119 Token::Period => f.write_str("."),
120 Token::Colon => f.write_str(":"),
121 Token::DoubleColon => f.write_str("::"),
122 Token::SemiColon => f.write_str(";"),
123 Token::Backslash => f.write_str("\\"),
124 Token::LBracket => f.write_str("["),
125 Token::RBracket => f.write_str("]"),
126 Token::Ampersand => f.write_str("&"),
127 Token::LBrace => f.write_str("{"),
128 Token::RBrace => f.write_str("}"),
129 }
130 }
131}
132
133impl Token {
134 pub fn new(word: &str, quote_style: Option<char>) -> Self {
140 let word_uppercase = word.to_uppercase();
141 let is_keyword = quote_style == None && ALL_KEYWORDS.contains(&word_uppercase.as_str());
145 Token::Word(Word {
146 value: word.to_string(),
147 quote_style,
148 keyword: if is_keyword {
149 word_uppercase
150 } else {
151 "".to_string()
152 },
153 })
154 }
155
156 pub fn get_value(&self) -> String {
158 match self {
159 Token::Word(word) => word.value.clone(),
160 Token::SingleQuotedString(ref s)
161 | Token::NationalStringLiteral(ref s)
162 | Token::HexStringLiteral(ref s) => s.clone(),
163 _ => format!("{}", self),
164 }
165 }
166
167 pub fn get_number(&self) -> Option<f64> {
169 match self {
170 Token::Number(number) => {
171 let number = number.parse();
172 match number {
173 Ok(number) => Some(number),
174 Err(_) => None,
175 }
176 }
177 _ => None,
178 }
179 }
180
181 pub fn is_keyword(&self) -> bool {
183 match self {
184 Token::Word(word) if word.keyword != "" => true,
185 _ => false,
186 }
187 }
188
189 pub fn get_quote_style(&self) -> Option<char> {
191 match self {
192 Token::Word(word) => word.quote_style,
193 _ => None,
194 }
195 }
196}
197
198impl From<Token> for String {
199 fn from(token: Token) -> String {
200 format!("{}", token)
201 }
202}
203
/// The payload of a [`Token::Word`]: a keyword or identifier.
#[derive(Debug, Clone, PartialEq)]
pub struct Word {
    // The word's text, without any surrounding quotes.
    pub value: String,
    // Opening quote character (`"`, `[` or a backtick), or `None` when unquoted.
    pub quote_style: Option<char>,
    // Uppercased `value` when it matched `ALL_KEYWORDS` unquoted; empty otherwise.
    pub keyword: String,
}
218
219impl fmt::Display for Word {
220 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
221 match self.quote_style {
222 Some(s) if s == '"' || s == '[' || s == '`' => {
223 write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
224 }
225 None => f.write_str(&self.value),
226 _ => panic!("Unexpected quote_style!"),
227 }
228 }
229}
230impl Word {
231 fn matching_end_quote(ch: char) -> char {
232 match ch {
233 '"' => '"', '[' => ']', '`' => '`', _ => panic!("unexpected quoting style!"),
237 }
238 }
239}
240
/// Whitespace and comments, preserved so token streams can round-trip
/// back to source text.
#[derive(Debug, Clone, PartialEq)]
pub enum Whitespace {
    /// A single space character.
    Space,
    /// A newline (`\n`, or a `\r`/`\r\n` normalized by the tokenizer).
    Newline,
    /// A tab character.
    Tab,
    /// A `--` comment; the stored text excludes the `--` but keeps the
    /// trailing newline when one was present.
    SingleLineComment(String),
    /// A `/* ... */` comment; the stored text excludes the delimiters.
    MultiLineComment(String),
}
249
250impl fmt::Display for Whitespace {
251 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
252 match self {
253 Whitespace::Space => f.write_str(" "),
254 Whitespace::Newline => f.write_str("\n"),
255 Whitespace::Tab => f.write_str("\t"),
256 Whitespace::SingleLineComment(s) => write!(f, "--{}", s),
257 Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
258 }
259 }
260}
261
/// A tokenization failure, carrying a human-readable message.
#[derive(Debug, PartialEq)]
pub struct TokenizerError(String);
265
/// Streaming SQL tokenizer: decodes characters from a buffered reader and
/// produces [`Token`]s according to `dialect`'s identifier rules.
pub struct Tokenizer<'a, R: BufRead, D: Dialect> {
    dialect: D,
    // Peekable stream of UTF-8 chars decoded from the underlying reader.
    pub query: Peekable<Chars<'a, R>>,
    // Current line (1-based). NOTE(review): set to 1 in `new` and never
    // advanced anywhere in this file, so reported positions stay at 1.
    pub line: u64,
    // Current column (1-based). NOTE(review): same — never advanced.
    pub col: u64,
    // Buffer of tokens produced by `peek_token` / `pushback_token`,
    // consumed (front first) before reading more input.
    peeked_tokens: VecDeque<Token>,
}
274
impl<'a, R: BufRead, D: Dialect> Tokenizer<'a, R, D> {
    /// Creates a tokenizer reading from `query` with the quoting and
    /// identifier rules of `dialect`.
    pub fn new(dialect: D, query: &'a mut R) -> Self {
        Self {
            dialect,
            query: query.chars().peekable(),
            // NOTE(review): line/col are initialized here but never updated
            // by any method below, so error messages always report 1:1.
            line: 1,
            col: 1,
            peeked_tokens: VecDeque::new(),
        }
    }

    /// Returns the `n`-th upcoming token (0-based) without consuming it.
    ///
    /// Tokens read ahead are buffered in `peeked_tokens` so later
    /// `next_token` calls return them in order. Hitting end of input (or
    /// an internal tokenizer error) before `n + 1` tokens are available
    /// yields an "Unexpected EOF." error.
    pub fn peek_token(&mut self, n: usize) -> Result<Option<Token>, TokenizerError> {
        if self.peeked_tokens.len() <= n {
            // Read exactly enough tokens to make index `n` valid.
            let tokens_to_peek = n - self.peeked_tokens.len() + 1;
            for _ in 0..tokens_to_peek {
                match self.internal_next_token() {
                    Ok(Some(token)) => {
                        self.peeked_tokens.push_back(token);
                    }
                    // Both Ok(None) (end of input) and Err collapse into
                    // the same error here, discarding the original cause.
                    _ => return Err(TokenizerError("Unexpected EOF.".to_string())),
                }
            }
        }
        Ok(Some(self.peeked_tokens[n].clone()))
    }

    /// Pushes a token back so it becomes the next one `next_token`
    /// returns (front of the peek buffer).
    pub fn pushback_token(&mut self, token: Token) {
        self.peeked_tokens.push_front(token);
    }

    /// Returns the next token, draining the peek buffer before reading
    /// more input. `Ok(None)` signals end of input.
    pub fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
        if let Some(token) = self.peeked_tokens.pop_front() {
            return Ok(Some(token));
        }

        self.internal_next_token()
    }

    /// Reads one token straight from the character stream, bypassing the
    /// peek buffer. Dispatches on the next (peeked) character; each arm is
    /// responsible for consuming exactly the characters it tokenizes.
    fn internal_next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
        match self.query.peek() {
            Some(Ok(ch)) => match *ch {
                ' ' => self.consume_and_return(Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    self.query.next();
                    // Fold a Windows "\r\n" pair into one Newline token.
                    if let Some(Ok('\n')) = self.query.peek() {
                        self.query.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // `N'...'` is a national string literal; a bare `N` starts
                // an ordinary word.
                'N' => {
                    self.query.next(); match self.query.peek() {
                        Some(Ok('\'')) => {
                            let s = self.tokenize_single_quoted_string();
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            let s = self.tokenize_word('N');
                            Ok(Some(Token::new(&s, None)))
                        }
                    }
                }
                // `X'...'` / `x'...'` is a hex string literal; otherwise the
                // `x`/`X` begins an ordinary word.
                x @ 'x' | x @ 'X' => {
                    self.query.next(); match self.query.peek() {
                        Some(Ok('\'')) => {
                            let s = self.tokenize_single_quoted_string();
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            let s = self.tokenize_word(x);
                            Ok(Some(Token::new(&s, None)))
                        }
                    }
                }
                // Identifier or keyword, per the dialect's start/part rules.
                ch if self.dialect.is_identifier_start(ch) => {
                    self.query.next(); let s = self.tokenize_word(ch);
                    Ok(Some(Token::new(&s, None)))
                }
                '\'' => {
                    let s = self.tokenize_single_quoted_string();
                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // Delimited (quoted) identifier, e.g. "..." or [...] — the
                // dialect decides which opening delimiters are accepted.
                quote_start if self.dialect.is_delimited_identifier_start(quote_start) => {
                    self.query.next(); let quote_end = Word::matching_end_quote(quote_start);
                    let s = self.peeking_take_while(|_tok, ch| ch != quote_end);
                    match self.query.next() {
                        Some(Ok(ch)) if ch == quote_end => {
                            Ok(Some(Token::new(&s, Some(quote_start))))
                        }
                        _ => Err(TokenizerError(format!(
                            "Expected close delimiter '{}' before EOF.",
                            quote_end
                        ))),
                    }
                }
                // Number literal: digits and dots, kept verbatim as text.
                // NOTE(review): this accepts multiple dots (e.g. "1.2.3").
                '0'..='9' => {
                    let s = self.peeking_take_while(|_tok, ch| match ch {
                        '0'..='9' | '.' => true,
                        _ => false,
                    });
                    Ok(Some(Token::Number(s)))
                }
                '(' => self.consume_and_return(Token::LParen),
                ')' => self.consume_and_return(Token::RParen),
                ',' => self.consume_and_return(Token::Comma),
                // `-` may start a `--` comment, a negative number literal,
                // or stand alone as minus.
                '-' => {
                    self.query.next(); match self.query.peek() {
                        Some(Ok('-')) => {
                            // Single-line comment: everything up to and
                            // including the newline (if any) is its text.
                            self.query.next(); let mut s = self.peeking_take_while(|_tok, ch| ch != '\n');
                            if let Some(Ok(ch)) = self.query.next() {
                                assert_eq!(ch, '\n');
                                s.push(ch);
                            }
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment(s))))
                        }
                        // NOTE(review): `-` directly followed by a digit is
                        // always lexed as a negative Number, even where it
                        // is a binary minus (e.g. `1-2` yields 1 then -2).
                        Some(Ok('0'..='9')) => {
                            let s = self.peeking_take_while(|_tok, ch| match ch {
                                '0'..='9' | '.' => true,
                                _ => false,
                            });
                            Ok(Some(Token::Number(format!("-{}", s))))
                        }
                        _ => Ok(Some(Token::Minus)),
                    }
                }
                // `/` may start a `/* ... */` comment or be division.
                '/' => {
                    self.query.next(); match self.query.peek() {
                        Some(Ok('*')) => {
                            self.query.next(); self.tokenize_multiline_comment()
                        }
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(Token::Plus),
                '*' => self.consume_and_return(Token::Mult),
                '%' => self.consume_and_return(Token::Mod),
                '=' => self.consume_and_return(Token::Eq),
                '.' => self.consume_and_return(Token::Period),
                // `!` is only valid as part of `!=`; a lone `!` is an error.
                '!' => {
                    self.query.next(); match self.query.peek() {
                        Some(Ok('=')) => self.consume_and_return(Token::Neq(['!', '='])),
                        // NOTE(review): line/col are never updated, so this
                        // always reports Line: 1, Col: 1.
                        _ => Err(TokenizerError(format!(
                            "Tokenizer Error at Line: {}, Col: {}",
                            self.line, self.col
                        ))),
                    }
                }
                // `<`, `<=`, or the `<>` inequality spelling.
                '<' => {
                    self.query.next(); match self.query.peek() {
                        Some(Ok('=')) => self.consume_and_return(Token::LtEq),
                        Some(Ok('>')) => self.consume_and_return(Token::Neq(['<', '>'])),
                        _ => Ok(Some(Token::Lt)),
                    }
                }
                '>' => {
                    self.query.next(); match self.query.peek() {
                        Some(Ok('=')) => self.consume_and_return(Token::GtEq),
                        _ => Ok(Some(Token::Gt)),
                    }
                }
                // `:` or the `::` cast operator.
                ':' => {
                    self.query.next();
                    match self.query.peek() {
                        Some(Ok(':')) => self.consume_and_return(Token::DoubleColon),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(Token::SemiColon),
                '\\' => self.consume_and_return(Token::Backslash),
                '[' => self.consume_and_return(Token::LBracket),
                ']' => self.consume_and_return(Token::RBracket),
                '&' => self.consume_and_return(Token::Ampersand),
                '{' => self.consume_and_return(Token::LBrace),
                '}' => self.consume_and_return(Token::RBrace),
                // Anything unrecognized is passed through as a Char token.
                other => self.consume_and_return(Token::Char(other)),
            },
            // End of input, or a read/decode error from the underlying
            // reader — both yield Ok(None) here.
            _ => Ok(None),
        }
    }

    /// Reads the rest of a word whose first (already consumed) character
    /// is `first_char`; continues while the dialect accepts the character
    /// as an identifier part.
    fn tokenize_word(&mut self, first_char: char) -> String {
        let mut s = first_char.to_string();
        s.push_str(&self.peeking_take_while(|tok, ch| tok.dialect.is_identifier_part(ch)));
        s
    }

    /// Reads a single-quoted string, consuming the opening quote through
    /// the closing quote. The returned text keeps escape sequences intact:
    /// a doubled quote stays as `''` and a recognized backslash escape
    /// (`\\ \' \" \n \t \r \0`) stays as its two characters.
    ///
    /// NOTE(review): the `unwrap()`s below panic if the input ends right
    /// after a quote/backslash or the reader yields an error there, and an
    /// unterminated string returns silently with what was read so far.
    fn tokenize_single_quoted_string(&mut self) -> String {
        let chars = &mut self.query;
        let mut s = String::new();
        // Consume the opening quote, then scan to the closing quote.
        chars.next(); while let Some(Ok(ch)) = chars.peek() {
            match *ch {
                '\'' => {
                    // A quote either ends the string or, when doubled,
                    // represents an escaped quote kept verbatim as `''`.
                    chars.next(); let escaped_quote = chars
                        .peek()
                        .map(|c| c.as_ref().unwrap() == &'\'')
                        .unwrap_or(false);
                    if escaped_quote {
                        s.push('\'');
                        s.push('\'');
                        chars.next();
                    } else {
                        break;
                    }
                }
                '\\' => {
                    // Keep recognized backslash escapes as-is; any other
                    // character after `\` terminates the scan.
                    chars.next(); let next_char = chars.peek().unwrap().as_ref().unwrap();
                    if next_char == &'\\'
                        || next_char == &'\''
                        || next_char == &'\"'
                        || next_char == &'n'
                        || next_char == &'t'
                        || next_char == &'r'
                        || next_char == &'0'
                    {
                        s.push('\\');
                        s.push(*next_char);
                        chars.next();
                    } else {
                        break;
                    }
                }
                ch => {
                    chars.next(); s.push(ch);
                }
            }
        }
        s
    }

    /// Reads the body of a `/* ... */` comment; the `/*` has already been
    /// consumed by the caller. Returns the comment text without delimiters,
    /// or an error on EOF before `*/`. Nested comments are not supported.
    fn tokenize_multiline_comment(&mut self) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        // True when the previous character was a `*` that might begin `*/`.
        let mut maybe_closing_comment = false;
        loop {
            match self.query.next() {
                Some(Ok(ch)) => {
                    if maybe_closing_comment {
                        if ch == '/' {
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        } else {
                            // The pending `*` was not part of `*/`; emit it.
                            s.push('*');
                        }
                    }
                    maybe_closing_comment = ch == '*';
                    // `*` is held back until we know it doesn't close the
                    // comment; everything else is appended immediately.
                    if !maybe_closing_comment {
                        s.push(ch);
                    }
                }
                _ => {
                    break Err(TokenizerError(
                        "Unexpected EOF while in a multi-line comment".to_string(),
                    ));
                }
            }
        }
    }

    /// Consumes exactly one character and returns `t` — helper for
    /// single-character tokens that were matched by peeking.
    fn consume_and_return(&mut self, t: Token) -> Result<Option<Token>, TokenizerError> {
        self.query.next();
        Ok(Some(t))
    }

    /// Collects characters while `predicate` holds, consuming only the
    /// accepted ones; the first rejected character stays in the stream.
    /// The predicate also receives `self` so it can consult the dialect.
    /// NOTE(review): a read/decode error stops the scan silently.
    fn peeking_take_while(
        &mut self,
        mut predicate: impl FnMut(&mut Tokenizer<'a, R, D>, char) -> bool,
    ) -> String {
        let mut s = String::new();
        while let Some(Ok(ch)) = self.query.peek() {
            let ch = *ch;
            if predicate(self, ch) {
                self.query.next(); s.push(ch);
            } else {
                break;
            }
        }
        s
    }
}