//! SQL Tokenizer
//!
//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
//!
//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).

#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::fmt;
use core::iter::Peekable;
use core::str::Chars;

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

use crate::dialect::SnowflakeDialect;
use crate::dialect::{Dialect, MySqlDialect};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};

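/// SQL token enumeration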
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the flag is true when the literal has a trailing `L` suffix
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Spaceship operator `<=>`
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Modulo operator `%`
    Mod,
    /// String concatenation `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
    /// Sharp `#` used for PostgreSQL Bitwise XOR operator
    Sharp,
    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
    Tilde,
    /// `~*`, a case insensitive match regular expression operator in PostgreSQL
    TildeAsterisk,
    /// `!~`, a case sensitive not-match regular expression operator in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*`, a case insensitive not-match regular expression operator in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `<<`, a bitwise shift left operator in PostgreSQL
    ShiftLeft,
    /// `>>`, a bitwise shift right operator in PostgreSQL
    ShiftRight,
    /// Exclamation Mark `!` used for PostgreSQL factorial operator
    ExclamationMark,
    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// AtSign `@` used for PostgreSQL abs operator
    AtSign,
    /// `|/`, a square root math operator in PostgreSQL
    PGSquareRoot,
    /// `||/`, a cube root math operator in PostgreSQL
    PGCubeRoot,
    /// `?` or `$`, a prepared statement arg placeholder
    Placeholder(String),
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{}", w),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{}", c),
            Token::SingleQuotedString(ref s) => write!(f, "'{}'", s),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
            Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{}", ws),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::AtSign => f.write_str("@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::Placeholder(ref s) => write!(f, "{}", s),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                // ALL_KEYWORDS is sorted, so a binary search yields an index
                // into ALL_KEYWORDS_INDEX when the word is a known keyword.
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

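/// A keyword (like SELECT) or an optionally quoted SQL identifier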
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Word {
    /// The value of the token, without the enclosing quotes.
    pub value: String,
    /// An identifier can be "quoted" (a delimited identifier in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. MS SQL)
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this will have one of the values from dialect::keywords, otherwise empty
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    /// The closing quote character that matches a given opening quote.
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"',
            '[' => ']',
            '`' => '`',
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{}{}", prefix, comment),
            Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
        }
    }
}

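/// Tokenizer error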
#[derive(Debug, PartialEq)]
pub struct TokenizerError {
    pub message: String,
    pub line: u64,
    pub col: u64,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "{} at Line: {}, Column {}",
            self.message, self.line, self.col
        )
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

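/// SQL Tokenizer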
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    line: u64,
    col: u64,
}

impl<'a> Tokenizer<'a> {
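    /// Create a new SQL tokenizer for the specified SQL statement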
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            line: 1,
            col: 1,
        }
    }

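    /// Tokenize the statement and produce a vector of tokens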
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let mut peekable = self.query.chars().peekable();

        let mut tokens: Vec<Token> = vec![];

        while let Some(token) = self.next_token(&mut peekable)? {
            // Track the current line/column for error reporting.
            match &token {
                Token::Whitespace(Whitespace::Newline) => {
                    self.line += 1;
                    self.col = 1;
                }

                Token::Whitespace(Whitespace::Tab) => self.col += 4,
                Token::Word(w) if w.quote_style.is_none() => self.col += w.value.len() as u64,
                // +2 accounts for the opening and closing quote characters
                Token::Word(w) if w.quote_style.is_some() => self.col += w.value.len() as u64 + 2,
                Token::Number(s, _) => self.col += s.len() as u64,
                Token::SingleQuotedString(s) => self.col += s.len() as u64,
                Token::Placeholder(s) => self.col += s.len() as u64,
                _ => self.col += 1,
            }

            tokens.push(token);
        }
        Ok(tokens)
    }

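    /// Get the next token or return None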
    fn next_token(&self, chars: &mut Peekable<Chars<'_>>) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                'N' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let s = self.tokenize_single_quoted_string(chars)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word('N', chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // The spec only allows an uppercase 'X' to introduce a hex
                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a <hexadecimal string literal>
                            let s = self.tokenize_single_quoted_string(chars)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "x" or "X"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    chars.next(); // consume the first char
                    let s = self.tokenize_word(ch, chars);

                    // If the dialect accepts digits as identifier-start characters,
                    // a run of digits and periods ends up here; re-tokenize it
                    // (plus any following digits/periods) as a number instead.
                    if s.chars().all(|x| ('0'..='9').contains(&x) || x == '.') {
                        let mut s = peeking_take_while(&mut s.chars().peekable(), |ch| {
                            matches!(ch, '0'..='9' | '.')
                        });
                        let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
                        s += s2.as_str();
                        return Ok(Some(Token::Number(s, false)));
                    }
                    Ok(Some(Token::make_word(&s, None)))
                }
                // string literal
                '\'' => {
                    let s = self.tokenize_single_quoted_string(chars)?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if self.dialect.is_delimited_identifier_start(quote_start) => {
                    chars.next(); // consume the opening quote
                    let quote_end = Word::matching_end_quote(quote_start);
                    let s = peeking_take_while(chars, |ch| ch != quote_end);
                    if chars.next() == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        self.tokenizer_error(format!(
                            "Expected close delimiter '{}' before EOF.",
                            quote_end
                        ))
                    }
                }
                // numbers and period
                '0'..='9' | '.' => {
                    let mut s = peeking_take_while(chars, |ch| matches!(ch, '0'..='9'));

                    // special case: a "0" followed by 'x' introduces a hex
                    // literal (as in MySQL's 0xdeadbeef notation)
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_take_while(
                            chars,
                            |ch| matches!(ch, '0'..='9' | 'A'..='F' | 'a'..='f'),
                        );
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }
                    s += &peeking_take_while(chars, |ch| matches!(ch, '0'..='9'));

                    // No digits at all -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // a trailing 'L' marks a "long" numeric literal
                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'
                    match chars.peek() {
                        Some('-') => {
                            chars.next(); // consume the second '-', starting a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "--".to_owned(),
                                comment,
                            })))
                        }
                        // a regular '-' operator
                        _ => Ok(Some(Token::Minus)),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a Snowflake single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => self.consume_and_return(chars, Token::Mod),
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_and_return(chars, Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => self.consume_and_return(chars, Token::PGCubeRoot),
                                _ => Ok(Some(Token::StringConcat)),
                            }
                        }
                        // a regular '|' operator
                        _ => Ok(Some(Token::Pipe)),
                    }
                }
                '=' => {
                    chars.next(); // consume the '='
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume the '!'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume the '<'
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_and_return(chars, Token::Spaceship),
                                _ => Ok(Some(Token::LtEq)),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::Neq),
                        Some('<') => self.consume_and_return(chars, Token::ShiftLeft),
                        _ => Ok(Some(Token::Lt)),
                    }
                }
                '>' => {
                    chars.next(); // consume the '>'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::GtEq),
                        Some('>') => self.consume_and_return(chars, Token::ShiftRight),
                        _ => Ok(Some(Token::Gt)),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => self.consume_and_return(chars, Token::Ampersand),
                '^' => self.consume_and_return(chars, Token::Caret),
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect) => {
                    chars.next(); // consume the '#', starting a Snowflake single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); // consume the '~'
                    match chars.peek() {
                        Some('*') => self.consume_and_return(chars, Token::TildeAsterisk),
                        _ => Ok(Some(Token::Tilde)),
                    }
                }
                '#' => self.consume_and_return(chars, Token::Sharp),
                '@' => self.consume_and_return(chars, Token::AtSign),
                '?' => self.consume_and_return(chars, Token::Placeholder(String::from("?"))),
                '$' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| matches!(ch, '0'..='9'));
                    Ok(Some(Token::Placeholder(String::from("$") + &s)))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    fn tokenizer_error<R>(&self, message: impl Into<String>) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            col: self.col,
            line: self.line,
        })
    }

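    /// Consume characters until (and including) the newline, returning the comment text.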
    fn tokenize_single_line_comment(&self, chars: &mut Peekable<Chars<'_>>) -> String {
        let mut comment = peeking_take_while(chars, |ch| ch != '\n');
        if let Some(ch) = chars.next() {
            assert_eq!(ch, '\n');
            comment.push(ch);
        }
        comment
    }

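    /// Tokenize an identifier or keyword, after the first char is already consumed.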
    fn tokenize_word(&self, first_char: char, chars: &mut Peekable<Chars<'_>>) -> String {
        let mut s = first_char.to_string();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

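    /// Read a single quoted string, starting with the opening quote.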
    fn tokenize_single_quoted_string(
        &self,
        chars: &mut Peekable<Chars<'_>>,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        chars.next(); // consume the opening quote

        // slash escaping is specific to the MySQL dialect
        let mut is_escaped = false;
        while let Some(&ch) = chars.peek() {
            match ch {
                '\'' => {
                    chars.next(); // consume
                    if is_escaped {
                        s.push(ch);
                        is_escaped = false;
                    } else if chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                        // an escaped quote: '' inside the string
                        s.push(ch);
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' => {
                    if dialect_of!(self is MySqlDialect) {
                        is_escaped = !is_escaped;
                    } else {
                        s.push(ch);
                    }
                    chars.next();
                }
                _ => {
                    chars.next(); // consume
                    s.push(ch);
                }
            }
        }
        self.tokenizer_error("Unterminated string literal")
    }

    fn tokenize_multiline_comment(
        &self,
        chars: &mut Peekable<Chars<'_>>,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut maybe_closing_comment = false;
        // Note: nested comments are not handled
        loop {
            match chars.next() {
                Some(ch) => {
                    if maybe_closing_comment {
                        if ch == '/' {
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        } else {
                            // the previous '*' was not part of a "*/", keep it
                            s.push('*');
                        }
                    }
                    maybe_closing_comment = ch == '*';
                    if !maybe_closing_comment {
                        s.push(ch);
                    }
                }
                None => break self.tokenizer_error("Unexpected EOF while in a multi-line comment"),
            }
        }
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut Peekable<Chars<'_>>,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

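/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Return the characters read as String, and keep the first non-matching
/// char available as `chars.peek()`.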
fn peeking_take_while(
    chars: &mut Peekable<Chars<'_>>,
    mut predicate: impl FnMut(char) -> bool,
) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dialect::{GenericDialect, MsSqlDialect};

    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            line: 1,
            col: 1,
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column 1");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\nمصطفىh");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                line: 1,
                col: 8
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "this is a comment\n".to_string(),
            }),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                line: 1,
                col: 1
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
        assert_eq!(expected, actual);
    }
}
1296}