#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::fmt;
use core::iter::Peekable;
use core::str::Chars;

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

use crate::dialect::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use crate::dialect::Dialect;
use crate::dialect::SnowflakeDialect;

/// SQL token enumeration
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the flag records a trailing `L` suffix
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, newline, comment)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not-equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less-than operator `<`
    Lt,
    /// Greater-than operator `>`
    Gt,
    /// Less-than-or-equals operator `<=`
    LtEq,
    /// Greater-than-or-equals operator `>=`
    GtEq,
    /// Spaceship operator `<=>`
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mult,
    /// Division operator `/`
    Div,
    /// Modulo operator `%`
    Mod,
    /// String concatenation operator `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period `.` (used for compound identifiers)
    Period,
    /// Colon `:`
    Colon,
    /// Double colon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Semicolon `;`, the statement separator
    SemiColon,
    /// Backslash `\`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right arrow `=>`
    RArrow,
    /// Sharp `#`, the PostgreSQL bitwise XOR operator
    Sharp,
    /// Tilde `~`, PostgreSQL bitwise NOT or case-sensitive regex match
    Tilde,
    /// `~*`, the PostgreSQL case-insensitive regex match operator
    TildeAsterisk,
    /// `!~`, the PostgreSQL case-sensitive regex non-match operator
    ExclamationMarkTilde,
    /// `!~*`, the PostgreSQL case-insensitive regex non-match operator
    ExclamationMarkTildeAsterisk,
    /// `<<`, the PostgreSQL bitwise shift-left operator
    ShiftLeft,
    /// `>>`, the PostgreSQL bitwise shift-right operator
    ShiftRight,
    /// Exclamation mark `!`, the PostgreSQL factorial operator
    ExclamationMark,
    /// Double exclamation mark `!!`, the PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// At sign `@`, the PostgreSQL abs operator
    AtSign,
    /// `|/`, the PostgreSQL square root operator
    PGSquareRoot,
    /// `||/`, the PostgreSQL cube root operator
    PGCubeRoot,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{}", w),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{}", c),
            Token::SingleQuotedString(ref s) => write!(f, "'{}'", s),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
            Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{}", ws),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mult => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::AtSign => f.write_str("@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

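    /// Make a word token from `word`, resolving the keyword for unquoted
    /// words via binary search over `ALL_KEYWORDS`. For example (values
    /// illustrative, assuming the generated keyword list has a
    /// `Keyword::SELECT` variant): `Token::make_word("select", None)`
    /// resolves to `Keyword::SELECT`, while
    /// `Token::make_word("select", Some('"'))` is a quoted identifier and
    /// keeps `Keyword::NoKeyword`.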
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style == None {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Word {
    /// The value of the token, without the enclosing quotes
    pub value: String,
    /// An identifier can be "quoted" (a delimited identifier in ANSI
    /// parlance). Most implementations use double quotes for this, but some
    /// support other styles as well (e.g. MS SQL brackets, MySQL backticks).
    pub quote_style: Option<char>,
    /// If the word was not quoted and matched a known keyword, that keyword;
    /// otherwise `Keyword::NoKeyword`
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"', // ANSI and most dialects
            '[' => ']', // MS SQL
            '`' => '`', // MySQL
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{}{}", prefix, comment),
            Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
        }
    }
}

/// Tokenizer error
#[derive(Debug, PartialEq)]
pub struct TokenizerError {
    pub message: String,
    pub line: u64,
    pub col: u64,
}

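/// SQL tokenizer: breaks a query string into a sequence of [`Token`]s.
///
/// A minimal usage sketch (the `sqlparser` crate/module paths below are
/// assumptions about the published crate layout; adjust as needed):
///
/// ```ignore
/// use sqlparser::dialect::GenericDialect;
/// use sqlparser::tokenizer::{Token, Tokenizer};
///
/// let dialect = GenericDialect {};
/// let mut tokenizer = Tokenizer::new(&dialect, "SELECT 1");
/// let tokens = tokenizer.tokenize().unwrap();
/// assert_eq!(tokens[0], Token::make_keyword("SELECT"));
/// ```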
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    line: u64,
    col: u64,
}

impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            line: 1,
            col: 1,
        }
    }

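    /// Tokenize the statement and produce a vector of tokens. Line and
    /// column are tracked for error reporting; the column bookkeeping is
    /// approximate (a tab advances the column by a fixed 4, and quoted
    /// words add 2 for their delimiters).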
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let mut peekable = self.query.chars().peekable();

        let mut tokens: Vec<Token> = vec![];

        while let Some(token) = self.next_token(&mut peekable)? {
            match &token {
                Token::Whitespace(Whitespace::Newline) => {
                    self.line += 1;
                    self.col = 1;
                }

                Token::Whitespace(Whitespace::Tab) => self.col += 4,
                Token::Word(w) if w.quote_style == None => self.col += w.value.len() as u64,
                Token::Word(w) if w.quote_style != None => self.col += w.value.len() as u64 + 2,
                Token::Number(s, _) => self.col += s.len() as u64,
                Token::SingleQuotedString(s) => self.col += s.len() as u64,
                _ => self.col += 1,
            }

            tokens.push(token);
        }
        Ok(tokens)
    }

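    /// Get the next token, or `None` at end of input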
    fn next_token(&self, chars: &mut Peekable<Chars<'_>>) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    chars.next();
                    // emit a single Whitespace::Newline token for \r and \r\n
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                'N' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let s = self.tokenize_single_quoted_string(chars)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word('N', chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // The spec only allows an uppercase 'X' to introduce a hex
                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a <hexadecimal character string literal>
                            let s = self.tokenize_single_quoted_string(chars)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "x"/"X"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    chars.next(); // consume the first char
                    let s = self.tokenize_word(ch, chars);

                    // a "word" consisting solely of digits and periods is
                    // reinterpreted as a number, for dialects whose
                    // identifiers may start with a digit
                    if s.chars().all(|x| ('0'..='9').contains(&x) || x == '.') {
                        let mut s = peeking_take_while(&mut s.chars().peekable(), |ch| {
                            matches!(ch, '0'..='9' | '.')
                        });
                        let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
                        s += s2.as_str();
                        return Ok(Some(Token::Number(s, false)));
                    }
                    Ok(Some(Token::make_word(&s, None)))
                }
                // string
                '\'' => {
                    let s = self.tokenize_single_quoted_string(chars)?;
                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if self.dialect.is_delimited_identifier_start(quote_start) => {
                    chars.next(); // consume the opening quote
                    let quote_end = Word::matching_end_quote(quote_start);
                    let s = peeking_take_while(chars, |ch| ch != quote_end);
                    if chars.next() == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        self.tokenizer_error(
                            format!("Expected close delimiter '{}' before EOF.", quote_end)
                                .as_str(),
                        )
                    }
                }
                // numbers and period
                '0'..='9' | '.' => {
                    let mut s = peeking_take_while(chars, |ch| matches!(ch, '0'..='9'));

                    // match a hex literal introduced by `0x`
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next(); // consume the 'x'
                        let s2 = peeking_take_while(
                            chars,
                            |ch| matches!(ch, '0'..='9' | 'A'..='F' | 'a'..='f'),
                        );
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }
                    s += &peeking_take_while(chars, |ch| matches!(ch, '0'..='9'));

                    // a lone period is the Period token, not a number
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // an optional trailing `L` marks a long literal
                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'
                    match chars.peek() {
                        Some('-') => {
                            chars.next(); // consume the second '-', starting a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "--".to_owned(),
                                comment,
                            })))
                        }
                        // a regular '-' operator
                        _ => Ok(Some(Token::Minus)),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a Snowflake single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mult),
                '%' => self.consume_and_return(chars, Token::Mod),
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_and_return(chars, Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => self.consume_and_return(chars, Token::PGCubeRoot),
                                _ => Ok(Some(Token::StringConcat)),
                            }
                        }
                        // bitshift '|' operator
                        _ => Ok(Some(Token::Pipe)),
                    }
                }
                '=' => {
                    chars.next(); // consume the '='
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume the '!'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume the '<'
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_and_return(chars, Token::Spaceship),
                                _ => Ok(Some(Token::LtEq)),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::Neq),
                        Some('<') => self.consume_and_return(chars, Token::ShiftLeft),
                        _ => Ok(Some(Token::Lt)),
                    }
                }
                '>' => {
                    chars.next(); // consume the '>'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::GtEq),
                        Some('>') => self.consume_and_return(chars, Token::ShiftRight),
                        _ => Ok(Some(Token::Gt)),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => self.consume_and_return(chars, Token::Ampersand),
                '^' => self.consume_and_return(chars, Token::Caret),
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect) => {
                    chars.next(); // consume the '#', starting a Snowflake single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); // consume the '~'
                    match chars.peek() {
                        Some('*') => self.consume_and_return(chars, Token::TildeAsterisk),
                        _ => Ok(Some(Token::Tilde)),
                    }
                }
                '#' => self.consume_and_return(chars, Token::Sharp),
                '@' => self.consume_and_return(chars, Token::AtSign),
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    fn tokenizer_error<R>(&self, message: &str) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.to_string(),
            col: self.col,
            line: self.line,
        })
    }

    /// Consume characters until newline; the newline itself is consumed
    /// and kept as part of the comment
    fn tokenize_single_line_comment(&self, chars: &mut Peekable<Chars<'_>>) -> String {
        let mut comment = peeking_take_while(chars, |ch| ch != '\n');
        if let Some(ch) = chars.next() {
            assert_eq!(ch, '\n');
            comment.push(ch);
        }
        comment
    }

    /// Tokenize an identifier or keyword, after the first char is already consumed
    fn tokenize_word(&self, first_char: char, chars: &mut Peekable<Chars<'_>>) -> String {
        let mut s = first_char.to_string();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

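    /// Read a single-quoted string, treating a doubled quote (`''`) as an
    /// escaped literal quote per the SQL standard. The opening quote is
    /// consumed here; the closing quote terminates the string.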
    fn tokenize_single_quoted_string(
        &self,
        chars: &mut Peekable<Chars<'_>>,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        chars.next(); // consume the opening quote
        while let Some(&ch) = chars.peek() {
            match ch {
                '\'' => {
                    chars.next(); // consume
                    let escaped_quote = chars.peek().map(|c| *c == '\'').unwrap_or(false);
                    if escaped_quote {
                        s.push('\'');
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                _ => {
                    chars.next(); // consume
                    s.push(ch);
                }
            }
        }
        self.tokenizer_error("Unterminated string literal")
    }

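    /// Consume a multi-line comment body; the caller has already consumed
    /// the leading `/*`. Nested block comments are not supported: the
    /// first `*/` ends the comment.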
    fn tokenize_multiline_comment(
        &self,
        chars: &mut Peekable<Chars<'_>>,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut maybe_closing_comment = false;
        loop {
            match chars.next() {
                Some(ch) => {
                    if maybe_closing_comment {
                        if ch == '/' {
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        } else {
                            s.push('*');
                        }
                    }
                    maybe_closing_comment = ch == '*';
                    if !maybe_closing_comment {
                        s.push(ch);
                    }
                }
                None => break self.tokenizer_error("Unexpected EOF while in a multi-line comment"),
            }
        }
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut Peekable<Chars<'_>>,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

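/// Read from `chars` until `predicate` returns `false`, returning the
/// characters read as a `String`. Because `chars` is `Peekable`, the first
/// non-matching character is left un-consumed; e.g. on `"123abc"` with an
/// ASCII-digit predicate this returns `"123"` and leaves `'a'` peekable.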
fn peeking_take_while(
    chars: &mut Peekable<Chars<'_>>,
    mut predicate: impl FnMut(char) -> bool,
) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next();
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

#[cfg(test)]
mod tests {
    use super::super::dialect::GenericDialect;
    use super::super::dialect::MsSqlDialect;
    use super::*;

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mult,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mult,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mult,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mult,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\nمصطفىh");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                line: 1,
                col: 8
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mult,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "this is a comment\n".to_string(),
            }),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                line: 1,
                col: 1
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
        assert_eq!(expected, actual);
    }
}