use std::iter::Peekable;
use std::str::Chars;

use super::dialect::keywords::ALL_KEYWORDS;
use super::dialect::Dialect;
use std::fmt;

/// SQL tokens produced by the tokenizer.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal
    Number(String),
    /// A character that could not start any other token (the fallback case)
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, newline, or comment)
    Whitespace(Whitespace),
    /// Equality operator `=`
    Eq,
    /// Not-equals operator `<>` (or `!=`)
    Neq,
    /// Less-than operator `<`
    Lt,
    /// Greater-than operator `>`
    Gt,
    /// Less-than-or-equals operator `<=`
    LtEq,
    /// Greater-than-or-equals operator `>=`
    GtEq,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mult,
    /// Division operator `/`
    Div,
    /// Modulo operator `%`
    Mod,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period `.` (used for compound identifiers, e.g. `table.column`)
    Period,
    /// Colon `:`
    Colon,
    /// Double colon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Semicolon `;` (statement terminator)
    SemiColon,
    /// Backslash `\`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::Word(ref w) => write!(f, "{}", w),
            Token::Number(ref n) => f.write_str(n),
            Token::Char(ref c) => write!(f, "{}", c),
            Token::SingleQuotedString(ref s) => write!(f, "'{}'", s),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
            Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{}", ws),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mult => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        // Only unquoted words can be keywords; if the uppercased word is in
        // the keyword list, record its canonical form for the parser.
        let is_keyword = quote_style.is_none() && ALL_KEYWORDS.contains(&word_uppercase.as_str());
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if is_keyword {
                word_uppercase
            } else {
                "".to_string()
            },
        })
    }
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier.
#[derive(Debug, Clone, PartialEq)]
pub struct Word {
    /// The value of the token, without the enclosing quotes and with any
    /// escaped quotes unescaped.
    pub value: String,
    /// The starting quote character, if the identifier was quoted
    /// (double quote, bracket, or backtick); `None` for unquoted words.
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this holds the uppercase keyword; otherwise it is the empty string.
    pub keyword: String,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"',
            '[' => ']',
            '`' => '`',
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq)]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment(String),
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment(s) => write!(f, "--{}", s),
            Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
        }
    }
}

/// Tokenizer error, carrying a human-readable message.
#[derive(Debug, PartialEq)]
pub struct TokenizerError(String);

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    pub query: String,
    pub line: u64,
    pub col: u64,
}

impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement.
    pub fn new(dialect: &'a dyn Dialect, query: &str) -> Self {
        Self {
            dialect,
            query: query.to_string(),
            line: 1,
            col: 1,
        }
    }

    /// Tokenize the statement and produce a vector of tokens.
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let mut peekable = self.query.chars().peekable();

        let mut tokens: Vec<Token> = vec![];

        while let Some(token) = self.next_token(&mut peekable)? {
            // Track the current line/column so error messages can point at
            // the offending location.
            match &token {
                Token::Whitespace(Whitespace::Newline) => {
                    self.line += 1;
                    self.col = 1;
                }

                // A tab is counted as four columns.
                Token::Whitespace(Whitespace::Tab) => self.col += 4,
                Token::Word(w) if w.quote_style.is_none() => self.col += w.value.len() as u64,
                // Quoted words account for the two enclosing delimiters.
                Token::Word(w) if w.quote_style.is_some() => self.col += w.value.len() as u64 + 2,
                Token::Number(s) => self.col += s.len() as u64,
                Token::SingleQuotedString(s) => self.col += s.len() as u64,
                _ => self.col += 1,
            }

            tokens.push(token);
        }
        Ok(tokens)
    }

    /// Get the next token, or return None at end of input.
    fn next_token(&self, chars: &mut Peekable<Chars<'_>>) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Newline token for both `\r` and `\r\n`.
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                'N' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let s = self.tokenize_single_quoted_string(chars);
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word('N', chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a <hexadecimal string literal>
                            let s = self.tokenize_single_quoted_string(chars);
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "x" or "X"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    chars.next(); // consume the first char
                    let s = self.tokenize_word(ch, chars);
                    Ok(Some(Token::make_word(&s, None)))
                }
                // string literal
                '\'' => {
                    let s = self.tokenize_single_quoted_string(chars);
                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if self.dialect.is_delimited_identifier_start(quote_start) => {
                    chars.next(); // consume the opening quote
                    let quote_end = Word::matching_end_quote(quote_start);
                    let s = peeking_take_while(chars, |ch| ch != quote_end);
                    if chars.next() == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        Err(TokenizerError(format!(
                            "Expected close delimiter '{}' before EOF.",
                            quote_end
                        )))
                    }
                }
                // numeric literal
                '0'..='9' => {
                    let s = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
                    Ok(Some(Token::Number(s)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'
                    match chars.peek() {
                        Some('-') => {
                            chars.next(); // consume the second '-', starting a single-line comment
                            let mut s = peeking_take_while(chars, |ch| ch != '\n');
                            if let Some(ch) = chars.next() {
                                assert_eq!(ch, '\n');
                                s.push(ch);
                            }
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment(s))))
                        }
                        // a regular '-' operator
                        _ => Ok(Some(Token::Minus)),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mult),
                '%' => self.consume_and_return(chars, Token::Mod),
                '=' => self.consume_and_return(chars, Token::Eq),
                '.' => self.consume_and_return(chars, Token::Period),
                '!' => {
                    chars.next(); // consume the '!'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        _ => Err(TokenizerError(format!(
                            "Tokenizer Error at Line: {}, Col: {}",
                            self.line, self.col
                        ))),
                    }
                }
                '<' => {
                    chars.next(); // consume the '<'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::LtEq),
                        Some('>') => self.consume_and_return(chars, Token::Neq),
                        _ => Ok(Some(Token::Lt)),
                    }
                }
                '>' => {
                    chars.next(); // consume the '>'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::GtEq),
                        _ => Ok(Some(Token::Gt)),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => self.consume_and_return(chars, Token::Ampersand),
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    /// Tokenize an identifier or keyword, after the first char is already consumed.
    fn tokenize_word(&self, first_char: char, chars: &mut Peekable<Chars<'_>>) -> String {
        let mut s = first_char.to_string();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

    /// Read a single-quoted string, starting with the opening quote.
    fn tokenize_single_quoted_string(&self, chars: &mut Peekable<Chars<'_>>) -> String {
        let mut s = String::new();
        chars.next(); // consume the opening quote
        while let Some(&ch) = chars.peek() {
            match ch {
                '\'' => {
                    chars.next(); // consume
                    // A doubled quote ('') is an escaped single quote; a lone
                    // quote terminates the string.
                    let escaped_quote = chars.peek().map(|c| *c == '\'').unwrap_or(false);
                    if escaped_quote {
                        s.push('\'');
                        chars.next();
                    } else {
                        break;
                    }
                }
                _ => {
                    chars.next(); // consume
                    s.push(ch);
                }
            }
        }
        s
    }

    fn tokenize_multiline_comment(
        &self,
        chars: &mut Peekable<Chars<'_>>,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut maybe_closing_comment = false;
        // Note: nested comments are not handled; the comment ends at the
        // first `*/`.
        loop {
            match chars.next() {
                Some(ch) => {
                    if maybe_closing_comment {
                        if ch == '/' {
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        } else {
                            s.push('*');
                        }
                    }
                    maybe_closing_comment = ch == '*';
                    if !maybe_closing_comment {
                        s.push(ch);
                    }
                }
                None => {
                    break Err(TokenizerError(
                        "Unexpected EOF while in a multi-line comment".to_string(),
                    ));
                }
            }
        }
    }

    fn consume_and_return(
        &self,
        chars: &mut Peekable<Chars<'_>>,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Return the characters read as a String, leaving the first non-matching
/// char available as `chars.next()`.
fn peeking_take_while(
    chars: &mut Peekable<Chars<'_>>,
    mut predicate: impl FnMut(char) -> bool,
) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

#[cfg(test)]
mod tests {
    use super::super::dialect::GenericDialect;
    use super::*;

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1")),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mult,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1")),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mult,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }
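
    // A supplementary test (not in the original suite): a minimal sketch
    // exercising the N'...' and X'...' literal branches of `next_token`.
    #[test]
    fn tokenize_national_and_hex_string_literals() {
        let sql = String::from("N'national' X'deadbeef'");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::NationalStringLiteral(String::from("national")),
            Token::Whitespace(Whitespace::Space),
            Token::HexStringLiteral(String::from("deadbeef")),
        ];

        compare(expected, tokens);
    }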

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\nمصطفىh");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mult,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string()),
            Token::Whitespace(Whitespace::SingleLineComment(
                "this is a comment\n".to_string(),
            )),
            Token::Number("1".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment(
            "this is a comment".to_string(),
        ))];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string()),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }
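
    // A supplementary test (not in the original suite): a minimal sketch of
    // the ':' branch, checking that `::` yields DoubleColon while a lone ':'
    // yields Colon. Assumes "foo", "bar", and "baz" are not in ALL_KEYWORDS.
    #[test]
    fn tokenize_colons() {
        let sql = String::from("foo::bar:baz");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_word("foo", None),
            Token::DoubleColon,
            Token::make_word("bar", None),
            Token::Colon,
            Token::make_word("baz", None),
        ];

        compare(expected, tokens);
    }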

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError(
                "Expected close delimiter '\"' before EOF.".to_string(),
            ))
        );
    }
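
    // A supplementary test (not in the original suite): the successful
    // counterpart to tokenize_mismatched_quotes above. Assumes GenericDialect
    // treats a double quote as the start of a delimited identifier, as that
    // test implies.
    #[test]
    fn tokenize_delimited_identifier() {
        let sql = String::from("SELECT \"col name\" FROM tbl");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col name", Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("tbl", None),
        ];

        compare(expected, tokens);
    }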

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }
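
    // A supplementary test (not in the original suite): checks that
    // concatenating the Display output of each token reconstructs the input.
    // This round-trip holds for inputs without `\r` newlines (normalized to
    // `\n`) or escaped quotes (unescaped during tokenization).
    #[test]
    fn tokenize_display_roundtrip() {
        let sql = String::from("SELECT * FROM customer WHERE salary <> 'Not Provided'");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let reconstructed: String = tokens.iter().map(|t| t.to_string()).collect();
        assert_eq!(sql, reconstructed);
    }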

    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
        assert_eq!(expected, actual);
    }
}