#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::fmt;
use core::iter::Peekable;
use core::str::Chars;

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};

use crate::ast::DollarQuotedString;
use crate::dialect::{
    BigQueryDialect, DuckDbDialect, GenericDialect, HiveDialect, SnowflakeDialect,
};
use crate::dialect::{Dialect, MySqlDialect};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};

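/// SQL Token enumeration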
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the boolean marks a trailing `L` (long) suffix
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag$string$tag$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string'
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Raw string literal: i.e: r'string' or R'string'
    RawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// "Escaped" string literal: i.e: E'string' (a PostgreSQL extension)
    EscapedStringLiteral(String),
    /// Hexadecimal string literal: i.e: X'deadbeef'
    HexStringLiteral(String),
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    Eq,
    Neq,
    Lt,
    Gt,
    LtEq,
    GtEq,
    /// MySQL null-safe equality operator `<=>`
    Spaceship,
    Plus,
    Minus,
    Mul,
    Div,
    /// DuckDB integer division operator `//`
    DuckIntDiv,
    Mod,
    /// String concatenation operator `||`
    StringConcat,
    LParen,
    RParen,
    Period,
    Colon,
    /// `::`, used for casting in PostgreSQL
    DoubleColon,
    /// DuckDB assignment operator `:=`
    DuckAssignment,
    SemiColon,
    Backslash,
    LBracket,
    RBracket,
    Ampersand,
    Pipe,
    Caret,
    LBrace,
    RBrace,
    /// Right arrow `=>`
    RArrow,
    Sharp,
    Tilde,
    /// PostgreSQL case-insensitive regex match `~*`
    TildeAsterisk,
    /// PostgreSQL regex not-match `!~`
    ExclamationMarkTilde,
    /// PostgreSQL case-insensitive regex not-match `!~*`
    ExclamationMarkTildeAsterisk,
    /// PostgreSQL LIKE match `~~`
    DoubleTilde,
    /// PostgreSQL case-insensitive LIKE match `~~*`
    DoubleTildeAsterisk,
    /// PostgreSQL LIKE not-match `!~~`
    ExclamationMarkDoubleTilde,
    /// PostgreSQL case-insensitive LIKE not-match `!~~*`
    ExclamationMarkDoubleTildeAsterisk,
    ShiftLeft,
    ShiftRight,
    /// PostgreSQL array overlap operator `&&`
    Overlap,
    ExclamationMark,
    DoubleExclamationMark,
    AtSign,
    /// PostgreSQL starts-with operator `^@`
    CaretAt,
    /// PostgreSQL square root operator `|/`
    PGSquareRoot,
    /// PostgreSQL cube root operator `||/`
    PGCubeRoot,
    /// A query placeholder such as `?` or `$1`
    Placeholder(String),
    /// JSON access operator `->`
    Arrow,
    /// JSON access (as text) operator `->>`
    LongArrow,
    /// JSON path access operator `#>`
    HashArrow,
    /// JSON path access (as text) operator `#>>`
    HashLongArrow,
    /// JSON containment operator `@>`
    AtArrow,
    /// JSON contained-by operator `<@`
    ArrowAt,
    /// JSON delete-path operator `#-`
    HashMinus,
    /// JSON path exists operator `@?`
    AtQuestion,
    /// JSON path match operator `@@`
    AtAt,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::RawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::DuckAssignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
        }
    }
}

impl Token {
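    /// Build a keyword token from `keyword`; this falls back to a plain,
    /// unquoted word if the string is not a known SQL keyword.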
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

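    /// Build a `Token::Word` from a raw string and an optional quote style.
    /// Only unquoted words are matched against the keyword list. A minimal
    /// sketch of the behavior:
    ///
    /// ```
    /// # use sqlparser::tokenizer::Token;
    /// # use sqlparser::keywords::Keyword;
    /// match Token::make_word("SELECT", None) {
    ///     Token::Word(w) => assert_eq!(w.keyword, Keyword::SELECT),
    ///     _ => unreachable!(),
    /// }
    /// // A quoted word is never treated as a keyword:
    /// match Token::make_word("SELECT", Some('"')) {
    ///     Token::Word(w) => assert_eq!(w.keyword, Keyword::NoKeyword),
    ///     _ => unreachable!(),
    /// }
    /// ```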
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

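/// A keyword (like SELECT) or an optionally quoted SQL identifier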
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with any
    /// escape sequences processed
    pub value: String,
    /// An identifier can be "quoted" (a <delimited identifier> in ANSI
    /// parlance). The standard and most dialects use double quotes for this,
    /// but some dialects accept other styles (e.g. brackets in MS SQL)
    pub quote_style: Option<char>,
    /// If the word was not quoted and matched one of the known keywords,
    /// this holds that `Keyword`; otherwise `Keyword::NoKeyword`
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"', // ANSI and most dialects
            '[' => ']', // MS SQL
            '`' => '`', // MySQL
            _ => panic!("unexpected quoting style!"),
        }
    }
}

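/// A unit of whitespace or an SQL comment, preserved as a token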
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}

/// Location in the input string
#[derive(Debug, Eq, PartialEq, Clone, Copy)]
pub struct Location {
    /// Line number, starting from 1
    pub line: u64,
    /// Line column, starting from 1
    pub column: u64,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.line == 0 {
            return Ok(());
        }
        write!(f, " at Line: {}, Column {}", self.line, self.column)
    }
}

/// A [Token] with a [Location] attached to it
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct TokenWithLocation {
    pub token: Token,
    pub location: Location,
}

impl TokenWithLocation {
    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
        TokenWithLocation {
            token,
            location: Location { line, column },
        }
    }

    pub fn wrap(token: Token) -> TokenWithLocation {
        TokenWithLocation::new(token, 0, 0)
    }
}

impl PartialEq<Token> for TokenWithLocation {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithLocation> for Token {
    fn eq(&self, other: &TokenWithLocation) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithLocation {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

/// Tokenizer error
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}{}", self.message, self.location)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}

impl<'a> State<'a> {
    /// Return the next character and advance the stream,
    /// updating the line and column counters as needed
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    /// Return the next character but do not advance the stream
    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}

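/// SQL Tokenizer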
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    /// If true (the default), the tokenizer will un-escape quoted literals;
    /// see [`Tokenizer::with_unescape`] for details
    unescape: bool,
}

impl<'a> Tokenizer<'a> {
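    /// Create a new SQL tokenizer for the specified SQL statement.
    /// A minimal usage sketch:
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let query = r#"SELECT 'foo'"#;
    ///
    /// // Tokenizing the query
    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
    ///
    /// assert_eq!(tokens, vec![
    ///   Token::make_word("SELECT", None),
    ///   Token::Whitespace(Whitespace::Space),
    ///   Token::SingleQuotedString("foo".to_string()),
    /// ]);
    /// ```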
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }

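    /// Set whether the tokenizer should un-escape quoted values (the
    /// default) or return them exactly as written in the input.
    ///
    /// For example, with unescaping enabled the input `'foo''bar'` yields
    /// the value `foo'bar`, while with it disabled the doubled quote is kept
    /// verbatim. A sketch of the difference, assuming a `GenericDialect`:
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let tokens = Tokenizer::new(&dialect, "'foo''bar'")
    ///     .with_unescape(false)
    ///     .tokenize()
    ///     .unwrap();
    /// assert_eq!(tokens, vec![Token::SingleQuotedString("foo''bar".to_string())]);
    /// ```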
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }

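    /// Tokenize the statement and produce a vector of tokens without locations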
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;
        Ok(twl.into_iter().map(|t| t.token).collect())
    }

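    /// Tokenize the statement and produce a vector of tokens with locations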
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
        let mut tokens: Vec<TokenWithLocation> = vec![];
        self.tokenize_with_location_into_buf(&mut tokens)
            .map(|_| tokens)
    }

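    /// Tokenize the statement and append the tokens with locations to the
    /// provided buffer, avoiding an allocation when the caller already owns one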
    pub fn tokenize_with_location_into_buf(
        &mut self,
        buf: &mut Vec<TokenWithLocation>,
    ) -> Result<(), TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state)? {
            buf.push(TokenWithLocation { token, location });

            location = state.location();
        }
        Ok(())
    }

    /// Tokenize the identifier or keyword starting with the chars in `ch`
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // A word consisting entirely of digits and periods is a number
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }

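    /// Get the next token or return None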
    fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // BigQuery uses b or B for byte string literals
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            let s = self.tokenize_quoted_string(chars, '\"')?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with a "b" or "B"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // BigQuery uses r or R for raw string literals
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::RawStringLiteral(s)))
                        }
                        Some('\"') => {
                            let s = self.tokenize_quoted_string(chars, '\"')?;
                            Ok(Some(Token::RawStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "r" or "R"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // PostgreSQL-style "escape" string constants: E'...'
                x @ 'e' | x @ 'E' => {
                    let starting_loc = chars.location();
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "e" or "E"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a <hexadecimal character string literal>
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "x" or "X"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // single quoted string
                '\'' => {
                    let s = self.tokenize_quoted_string(chars, '\'')?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // double quoted string
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    let s = self.tokenize_quoted_string(chars, '"')?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start
                    if self.dialect.is_delimited_identifier_start(ch)
                        && self
                            .dialect
                            .is_proper_identifier_inside_quotes(chars.peekable.clone()) =>
                {
                    let error_loc = chars.location();
                    chars.next(); // consume the opening quote
                    let quote_end = Word::matching_end_quote(quote_start);
                    let (s, last_char) = self.parse_quoted_ident(chars, quote_end);

                    if last_char == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        )
                    }
                }
                // numbers and period
                '0'..='9' | '.' => {
                    let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // match hex literals that start with 0x
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_take_while(chars, |ch| ch.is_ascii_hexdigit());
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }
                    s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // No number -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    let mut exponent_part = String::new();
                    // Parse exponent as number
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        // Optional sign
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // Definitely an exponent; bring the original iterator
                            // up to speed and use it
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            // Not an exponent, discard the work done
                            _ => (),
                        }
                    }

                    // MySQL and Hive support identifiers that start with a
                    // numeric prefix, as long as they aren't an exponent number
                    if dialect_of!(self is MySqlDialect | HiveDialect) && exponent_part.is_empty() {
                        let word =
                            peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                        if !word.is_empty() {
                            s += word.as_str();
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }

                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'
                    match chars.peek() {
                        Some('-') => {
                            chars.next(); // consume the second '-', starting a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "--".to_owned(),
                                comment,
                            })))
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    chars.next();
                                    Ok(Some(Token::LongArrow))
                                }
                                _ => Ok(Some(Token::Arrow)),
                            }
                        }
                        // a regular '-' operator
                        _ => Ok(Some(Token::Minus)),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a snowflake single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => {
                    chars.next(); // advance past '%'
                    match chars.peek() {
                        Some(' ') => Ok(Some(Token::Mod)),
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::Mod)),
                    }
                }
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_and_return(chars, Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => self.consume_and_return(chars, Token::PGCubeRoot),
                                _ => Ok(Some(Token::StringConcat)),
                            }
                        }
                        // a regular '|' operator
                        _ => Ok(Some(Token::Pipe)),
                    }
                }
                '=' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                Some('~') => {
                                    chars.next();
                                    match chars.peek() {
                                        Some('*') => self.consume_and_return(
                                            chars,
                                            Token::ExclamationMarkDoubleTildeAsterisk,
                                        ),
                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
                                    }
                                }
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_and_return(chars, Token::Spaceship),
                                _ => Ok(Some(Token::LtEq)),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::Neq),
                        Some('<') => self.consume_and_return(chars, Token::ShiftLeft),
                        Some('@') => self.consume_and_return(chars, Token::ArrowAt),
                        _ => Ok(Some(Token::Lt)),
                    }
                }
                '>' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::GtEq),
                        Some('>') => self.consume_and_return(chars, Token::ShiftRight),
                        _ => Ok(Some(Token::Gt)),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::DuckAssignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => {
                    chars.next(); // consume the '&'
                    match chars.peek() {
                        Some('&') => self.consume_and_return(chars, Token::Overlap),
                        // a regular '&' operator
                        _ => Ok(Some(Token::Ampersand)),
                    }
                }
                '^' => {
                    chars.next(); // consume the '^'
                    match chars.peek() {
                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
                        _ => Ok(Some(Token::Caret)),
                    }
                }
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect) => {
                    chars.next(); // consume the '#', starting a snowflake single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('*') => self.consume_and_return(chars, Token::TildeAsterisk),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => {
                                    self.consume_and_return(chars, Token::DoubleTildeAsterisk)
                                }
                                _ => Ok(Some(Token::DoubleTilde)),
                            }
                        }
                        _ => Ok(Some(Token::Tilde)),
                    }
                }
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_and_return(chars, Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    chars.next();
                                    Ok(Some(Token::HashLongArrow))
                                }
                                _ => Ok(Some(Token::HashArrow)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::Sharp)),
                        Some(sch) if self.dialect.is_identifier_start('#') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::Sharp)),
                    }
                }
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => {
                            chars.next();
                            match chars.peek() {
                                Some(' ') => Ok(Some(Token::AtAt)),
                                Some(tch) if self.dialect.is_identifier_start('@') => {
                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
                                }
                                _ => Ok(Some(Token::AtAt)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::AtSign)),
                        Some(sch) if self.dialect.is_identifier_start('@') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                '?' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
                }

                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    self.tokenize_identifier_or_keyword([ch], chars)
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                // whitespace check (including unicode chars) should be last,
                // as it covers some of the chars above
                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

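    /// Read a dollar-quoted string (`$$...$$` or `$tag$...$tag$`) or a
    /// `$`-prefixed placeholder, with the leading `$` still unread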
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        chars.next();

        if let Some('$') = chars.peek() {
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric() || ch == '_'
            }));

            if let Some('$') = chars.peek() {
                chars.next();
                s.push_str(&peeking_take_while(chars, |ch| ch != '$'));

                match chars.peek() {
                    Some('$') => {
                        chars.next();
                        for c in value.chars() {
                            let next_char = chars.next();
                            if Some(c) != next_char {
                                return self.tokenizer_error(
                                    chars.location(),
                                    format!(
                                        "Unterminated dollar-quoted string at or near \"{value}\""
                                    ),
                                );
                            }
                        }

                        if let Some('$') = chars.peek() {
                            chars.next();
                        } else {
                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted string, expected $",
                            );
                        }
                    }
                    _ => {
                        return self.tokenizer_error(
                            chars.location(),
                            "Unterminated dollar-quoted string, expected $",
                        );
                    }
                }
            } else {
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }

    fn tokenizer_error<R>(
        &self,
        loc: Location,
        message: impl Into<String>,
    ) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            location: loc,
        })
    }

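    /// Consume characters until (and including) the newline, returning the
    /// comment body; used for `--`, `//`, and `#` style comments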
    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
        let mut comment = peeking_take_while(chars, |ch| ch != '\n');
        if let Some(ch) = chars.next() {
            assert_eq!(ch, '\n');
            comment.push(ch);
        }
        comment
    }

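    /// Tokenize an identifier or keyword, after the first char is already consumed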
    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
        let mut s = first_chars.into();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

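    /// Read a single-quoted string with backslash escape sequences, as in
    /// PostgreSQL's `E'...'` literals, with the opening quote still unread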
    fn tokenize_escaped_single_quoted_string(
        &self,
        starting_loc: Location,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();

        chars.next(); // consume the opening quote

        let mut is_escaped = false;
        while let Some(&ch) = chars.peek() {
            macro_rules! escape_control_character {
                ($ESCAPED:expr) => {{
                    if is_escaped {
                        s.push($ESCAPED);
                        is_escaped = false;
                    } else {
                        s.push(ch);
                    }

                    chars.next();
                }};
            }

            match ch {
                '\'' => {
                    chars.next(); // consume
                    if is_escaped {
                        s.push(ch);
                        is_escaped = false;
                    } else if chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                        s.push(ch);
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' => {
                    if is_escaped {
                        s.push('\\');
                        is_escaped = false;
                    } else {
                        is_escaped = true;
                    }

                    chars.next();
                }
                'r' => escape_control_character!('\r'),
                'n' => escape_control_character!('\n'),
                't' => escape_control_character!('\t'),
                _ => {
                    is_escaped = false;
                    chars.next();
                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
    }

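    /// Read a quoted string delimited by `quote_style`, handling doubled
    /// quotes and (for MySQL) backslash escapes; the opening quote has not
    /// yet been consumed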
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        quote_style: char,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        chars.next(); // consume the opening quote

        while let Some(&ch) = chars.peek() {
            match ch {
                char if char == quote_style => {
                    chars.next(); // consume
                    if chars.peek().map(|c| *c == quote_style).unwrap_or(false) {
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' => {
                    chars.next(); // consume
                    // backslash escaping is specific to the MySQL dialect
                    if dialect_of!(self is MySqlDialect) {
                        if let Some(next) = chars.peek() {
                            if !self.unescape {
                                // In no-escape mode, the given query has to be
                                // saved completely, including backslashes
                                s.push(ch);
                                s.push(*next);
                                chars.next(); // consume next
                            } else {
                                let n = match next {
                                    '\'' | '\"' | '\\' | '%' | '_' => *next,
                                    '0' => '\0',
                                    'b' => '\u{8}',
                                    'n' => '\n',
                                    'r' => '\r',
                                    't' => '\t',
                                    'Z' => '\u{1a}',
                                    _ => *next,
                                };
                                s.push(n);
                                chars.next(); // consume next
                            }
                        }
                    } else {
                        s.push(ch);
                    }
                }
                _ => {
                    chars.next(); // consume
                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }

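    /// Consume a `/* ... */` comment, tracking nesting depth so that nested
    /// multi-line comments are handled correctly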
    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
        let mut last_ch = ' ';

        loop {
            match chars.next() {
                Some(ch) => {
                    if last_ch == '/' && ch == '*' {
                        nested += 1;
                    } else if last_ch == '*' && ch == '/' {
                        nested -= 1;
                        if nested == 0 {
                            s.pop();
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        }
                    }
                    s.push(ch);
                    last_ch = ch;
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    )
                }
            }
        }
    }

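    /// Read a delimited identifier up to `quote_end`, treating a doubled
    /// end-quote as an escaped quote character; returns the identifier and
    /// the last character consumed (used to detect a missing close quote)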
    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
        let mut last_char = None;
        let mut s = String::new();
        while let Some(ch) = chars.next() {
            if ch == quote_end {
                if chars.peek() == Some(&quote_end) {
                    chars.next();
                    s.push(ch);
                    if !self.unescape {
                        // In no-escape mode, the given query has to be saved completely
                        s.push(ch);
                    }
                } else {
                    last_char = Some(quote_end);
                    break;
                }
            } else {
                s.push(ch);
            }
        }
        (s, last_char)
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut State,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

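/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Return the characters read as String, and keep the first non-matching
/// char available as `chars.next()`.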
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dialect::{ClickHouseDialect, GenericDialect, MsSqlDialect};

    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column 1");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_clickhouse_double_equal() {
        let sql = String::from("SELECT foo=='1'");
        let dialect = ClickHouseDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "foo".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::DoubleEq,
            Token::SingleQuotedString("1".to_string()),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }

    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "this is a comment\n".to_string(),
            }),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_nested_multiline_comment() {
        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                location: Location { line: 1, column: 1 },
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_like_match() {
        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_snowflake_div() {
        let sql = r#"field/1000"#;
        let dialect = SnowflakeDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word(r#"field"#, None),
            Token::Div,
            Token::Number("1000".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithLocation::new(Token::make_keyword("SELECT"), 1, 1),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 1, 7),
            TokenWithLocation::new(Token::make_word("a", None), 1, 8),
            TokenWithLocation::new(Token::Comma, 1, 9),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Newline), 1, 10),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 2, 1),
            TokenWithLocation::new(Token::make_word("b", None), 2, 2),
        ];
        compare(expected, tokens);
    }

    fn compare<T: PartialEq + std::fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        assert_eq!(expected, actual);
    }
}