#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::fmt;
use core::iter::Peekable;
use core::str::Chars;

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};

use crate::ast::DollarQuotedString;
use crate::dialect::{
    BigQueryDialect, DuckDbDialect, GenericDialect, HiveDialect, SnowflakeDialect,
};
use crate::dialect::{Dialect, MySqlDialect};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};

/// SQL tokens produced by the [`Tokenizer`].
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the flag is true when the literal has a
    /// trailing `L` ("long") suffix
    Number(String, bool),
    /// A character that could not start any known token
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag$string$tag$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string' (BigQuery)
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string" (BigQuery)
    DoubleQuotedByteStringLiteral(String),
    /// Raw string literal: i.e: r'string' or R'string' (BigQuery)
    RawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// "Escaped" string literal: i.e: E'string'
    EscapedStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    Comma,
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    Eq,
    /// Not-equals operator `<>` (or `!=` in some dialects)
    Neq,
    Lt,
    Gt,
    LtEq,
    GtEq,
    /// Spaceship operator `<=>`
    Spaceship,
    Plus,
    Minus,
    Mul,
    Div,
    /// Integer division operator `//` in DuckDB
    DuckIntDiv,
    /// Modulo operator `%`
    Mod,
    /// String concatenation operator `||`
    StringConcat,
    LParen,
    RParen,
    Period,
    Colon,
    /// `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Assignment `:=` (used for keyword arguments in DuckDB macros)
    DuckAssignment,
    SemiColon,
    Backslash,
    LBracket,
    RBracket,
    Ampersand,
    Pipe,
    Caret,
    LBrace,
    RBrace,
    /// Right arrow `=>`
    RArrow,
    /// `#`, also the PostgreSQL bitwise XOR operator
    Sharp,
    /// `~`, the PostgreSQL bitwise NOT operator or case sensitive regex match
    Tilde,
    /// `~*`, case insensitive regex match (PostgreSQL)
    TildeAsterisk,
    /// `!~`, case sensitive regex non-match (PostgreSQL)
    ExclamationMarkTilde,
    /// `!~*`, case insensitive regex non-match (PostgreSQL)
    ExclamationMarkTildeAsterisk,
    /// `<<`, bitwise shift left (PostgreSQL)
    ShiftLeft,
    /// `>>`, bitwise shift right (PostgreSQL)
    ShiftRight,
    /// `&&`, array overlap operator (PostgreSQL)
    Overlap,
    ExclamationMark,
    DoubleExclamationMark,
    AtSign,
    /// `|/`, square root operator (PostgreSQL)
    PGSquareRoot,
    /// `||/`, cube root operator (PostgreSQL)
    PGCubeRoot,
    /// `?` or `?n`, a prepared statement argument placeholder
    Placeholder(String),
    /// `->`, extract JSON field (PostgreSQL)
    Arrow,
    /// `->>`, extract JSON field as text (PostgreSQL)
    LongArrow,
    /// `#>`, extract JSON sub-object at the given path (PostgreSQL)
    HashArrow,
    /// `#>>`, extract JSON sub-object at the given path as text (PostgreSQL)
    HashLongArrow,
    /// `@>`, does the left JSON value contain the right one? (PostgreSQL)
    AtArrow,
    /// `<@`, does the right JSON value contain the left one? (PostgreSQL)
    ArrowAt,
    /// `#-`, delete the field or element at the given path (PostgreSQL)
    HashMinus,
    /// `@?`, does the JSON path return any item? (PostgreSQL)
    AtQuestion,
    /// `@@`, result of a JSON path predicate check (PostgreSQL)
    AtAt,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::RawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::DuckAssignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::AtSign => f.write_str("@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

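    /// Make a `Token::Word`, tagging unquoted words that match a known
    /// keyword. A minimal sketch (assuming the crate is consumed under its
    /// published name, `sqlparser`):
    ///
    /// ```
    /// # use sqlparser::keywords::Keyword;
    /// # use sqlparser::tokenizer::Token;
    /// if let Token::Word(w) = Token::make_word("select", None) {
    ///     assert_eq!(w.keyword, Keyword::SELECT);
    /// }
    /// // Quoted words are never classified as keywords.
    /// if let Token::Word(w) = Token::make_word("select", Some('"')) {
    ///     assert_eq!(w.keyword, Keyword::NoKeyword);
    /// }
    /// ```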
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The word's value, without the enclosing quotes and with any escape
    /// sequences processed.
    pub value: String,
    /// The quoting style, if the identifier was quoted (e.g. `"`, `[`, or `` ` ``).
    pub quote_style: Option<char>,
    /// The matching keyword if the word was unquoted and is a known keyword,
    /// otherwise `Keyword::NoKeyword`.
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"',
            '[' => ']',
            '`' => '`',
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}

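/// The location of a token in the source text, tracked as 1-based line and
/// column numbers. A `line` of 0 is the "no location" sentinel used by
/// [`TokenWithLocation::wrap`], in which case the `Display` impl below prints
/// nothing. Illustrative `Display` output for line 1, column 5:
///
/// ```text
///  at Line: 1, Column 5
/// ```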
#[derive(Debug, Eq, PartialEq, Clone, Copy)]
pub struct Location {
    /// Line number, starting from 1
    pub line: u64,
    /// Column number, starting from 1
    pub column: u64,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.line == 0 {
            return Ok(());
        }
        write!(f, " at Line: {}, Column {}", self.line, self.column)
    }
}

/// A [`Token`] paired with the [`Location`] at which it starts in the input.
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct TokenWithLocation {
    pub token: Token,
    pub location: Location,
}

impl TokenWithLocation {
    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
        TokenWithLocation {
            token,
            location: Location { line, column },
        }
    }

    /// Wrap a token with the zero ("no location") position.
    pub fn wrap(token: Token) -> TokenWithLocation {
        TokenWithLocation::new(token, 0, 0)
    }
}

impl PartialEq<Token> for TokenWithLocation {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithLocation> for Token {
    fn eq(&self, other: &TokenWithLocation) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithLocation {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

/// Tokenizer error
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}{}", self.message, self.location)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

/// A cursor over the input characters that tracks the current line and column.
struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}

impl<'a> State<'a> {
    /// Return the next character and advance the stream, updating the
    /// line/column bookkeeping.
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    /// Return the next character without advancing the stream.
    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    /// If true (the default), the tokenizer unescapes quoted literals and
    /// identifiers (e.g. `''` inside a single-quoted string becomes `'`);
    /// if false, escape sequences are preserved verbatim.
    unescape: bool,
}

impl<'a> Tokenizer<'a> {
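    /// Create a new SQL tokenizer for the specified SQL statement. A minimal
    /// usage sketch (assuming the crate is consumed under its published name,
    /// `sqlparser`):
    ///
    /// ```
    /// # use sqlparser::dialect::GenericDialect;
    /// # use sqlparser::tokenizer::Tokenizer;
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 1").tokenize().unwrap();
    /// assert_eq!(tokens.len(), 3); // keyword, whitespace, number
    /// ```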
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }

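    /// Set whether the tokenizer should unescape quoted literals and
    /// identifiers (true by default). A sketch mirroring the
    /// `tokenize_quoted_identifier*` tests below (assuming the crate is
    /// consumed as `sqlparser`):
    ///
    /// ```
    /// # use sqlparser::dialect::GenericDialect;
    /// # use sqlparser::tokenizer::{Token, Tokenizer};
    /// let dialect = GenericDialect {};
    /// // By default, the doubled quote `""` collapses to a single `"`:
    /// let tokens = Tokenizer::new(&dialect, r#""a "" b""#).tokenize().unwrap();
    /// assert_eq!(tokens[0], Token::make_word(r#"a " b"#, Some('"')));
    /// // With unescape disabled, the raw text is preserved:
    /// let tokens = Tokenizer::new(&dialect, r#""a "" b""#)
    ///     .with_unescape(false)
    ///     .tokenize()
    ///     .unwrap();
    /// assert_eq!(tokens[0], Token::make_word(r#"a "" b"#, Some('"')));
    /// ```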
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }

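    /// Tokenize the statement and produce a vector of tokens. A sketch of the
    /// output, mirroring the `tokenize_select_1` test below (assuming the
    /// crate is consumed as `sqlparser`):
    ///
    /// ```
    /// # use sqlparser::dialect::GenericDialect;
    /// # use sqlparser::tokenizer::{Token, Tokenizer, Whitespace};
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 1").tokenize().unwrap();
    /// assert_eq!(
    ///     tokens,
    ///     vec![
    ///         Token::make_keyword("SELECT"),
    ///         Token::Whitespace(Whitespace::Space),
    ///         Token::Number("1".to_string(), false),
    ///     ]
    /// );
    /// ```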
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;

        let mut tokens: Vec<Token> = Vec::with_capacity(twl.len());
        for token_with_location in twl {
            tokens.push(token_with_location.token);
        }
        Ok(tokens)
    }

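    /// Tokenize the statement and produce a vector of tokens with their
    /// starting locations. A minimal sketch, mirroring the
    /// `tokenize_with_location` test below (assuming the crate is consumed as
    /// `sqlparser`):
    ///
    /// ```
    /// # use sqlparser::dialect::GenericDialect;
    /// # use sqlparser::tokenizer::{Token, Tokenizer, TokenWithLocation};
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT a")
    ///     .tokenize_with_location()
    ///     .unwrap();
    /// // The SELECT keyword starts at line 1, column 1.
    /// assert_eq!(
    ///     tokens[0],
    ///     TokenWithLocation::new(Token::make_keyword("SELECT"), 1, 1)
    /// );
    /// ```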
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut tokens: Vec<TokenWithLocation> = vec![];

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state)? {
            tokens.push(TokenWithLocation { token, location });

            location = state.location();
        }
        Ok(tokens)
    }

    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // A "word" made up entirely of digits and dots is really a number;
        // re-tokenize it as such rather than emitting a Word token.
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }

    /// Get the next token, or return `None` at end of input.
    fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // BigQuery uses b or B for byte string literals
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            let s = self.tokenize_quoted_string(chars, '\"')?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "b" or "B"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // BigQuery uses r or R for raw string literals
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::RawStringLiteral(s)))
                        }
                        Some('\"') => {
                            let s = self.tokenize_quoted_string(chars, '\"')?;
                            Ok(Some(Token::RawStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "r" or "R"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // N'...' - a national string literal
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "N" or "n"
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // PostgreSQL-style "escape" string literal, e.g. E'...'
                x @ 'e' | x @ 'E' => {
                    let starting_loc = chars.location();
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "e" or "E"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // X'...' - a hexadecimal string literal
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "x" or "X"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // single-quoted string literal
                '\'' => {
                    let s = self.tokenize_quoted_string(chars, '\'')?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // double-quoted string in dialects where '"' is not an identifier quote
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    let s = self.tokenize_quoted_string(chars, '"')?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start
                    if self.dialect.is_delimited_identifier_start(ch)
                        && self
                            .dialect
                            .is_proper_identifier_inside_quotes(chars.peekable.clone()) =>
                {
                    let error_loc = chars.location();
                    chars.next(); // consume the opening quote
                    let quote_end = Word::matching_end_quote(quote_start);
                    let (s, last_char) = self.parse_quoted_ident(chars, quote_end);

                    if last_char == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        )
                    }
                }
                // numbers and period
                '0'..='9' | '.' => {
                    let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // match hex literals that start with 0x
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_take_while(chars, |ch| {
                            matches!(ch, '0'..='9' | 'A'..='F' | 'a'..='f')
                        });
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }
                    s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // No number -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // Parse an optional exponent
                    let mut exponent_part = String::new();
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        // Optional sign
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // Definitely an exponent; bring the original
                            // iterator up to speed and use it
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            // Not an exponent; discard the work done
                            _ => (),
                        }
                    }

                    // MySQL and Hive support identifiers that start with a
                    // numeric prefix, as long as they aren't exponent numbers.
                    if dialect_of!(self is MySqlDialect | HiveDialect) && exponent_part.is_empty() {
                        let word =
                            peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                        if !word.is_empty() {
                            s += word.as_str();
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }

                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'
                    match chars.peek() {
                        Some('-') => {
                            chars.next(); // consume the second '-', starting a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "--".to_owned(),
                                comment,
                            })))
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    chars.next();
                                    Ok(Some(Token::LongArrow))
                                }
                                _ => Ok(Some(Token::Arrow)),
                            }
                        }
                        // a regular '-' operator
                        _ => Ok(Some(Token::Minus)),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => {
                    chars.next(); // advance past '%'
                    match chars.peek() {
                        Some(' ') => Ok(Some(Token::Mod)),
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::Mod)),
                    }
                }
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_and_return(chars, Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => self.consume_and_return(chars, Token::PGCubeRoot),
                                _ => Ok(Some(Token::StringConcat)),
                            }
                        }
                        // a regular '|' operator
                        _ => Ok(Some(Token::Pipe)),
                    }
                }
                '=' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_and_return(chars, Token::Spaceship),
                                _ => Ok(Some(Token::LtEq)),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::Neq),
                        Some('<') => self.consume_and_return(chars, Token::ShiftLeft),
                        Some('@') => self.consume_and_return(chars, Token::ArrowAt),
                        _ => Ok(Some(Token::Lt)),
                    }
                }
                '>' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::GtEq),
                        Some('>') => self.consume_and_return(chars, Token::ShiftRight),
                        _ => Ok(Some(Token::Gt)),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::DuckAssignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => {
                    chars.next(); // consume the '&'
                    match chars.peek() {
                        Some('&') => self.consume_and_return(chars, Token::Overlap),
                        // a regular '&' operator
                        _ => Ok(Some(Token::Ampersand)),
                    }
                }
                '^' => self.consume_and_return(chars, Token::Caret),
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect) => {
                    chars.next(); // consume the '#', starting a single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('*') => self.consume_and_return(chars, Token::TildeAsterisk),
                        _ => Ok(Some(Token::Tilde)),
                    }
                }
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_and_return(chars, Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    chars.next();
                                    Ok(Some(Token::HashLongArrow))
                                }
                                _ => Ok(Some(Token::HashArrow)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::Sharp)),
                        Some(sch) if self.dialect.is_identifier_start('#') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::Sharp)),
                    }
                }
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => {
                            chars.next();
                            match chars.peek() {
                                Some(' ') => Ok(Some(Token::AtAt)),
                                Some(tch) if self.dialect.is_identifier_start('@') => {
                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
                                }
                                _ => Ok(Some(Token::AtAt)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::AtSign)),
                        Some(sch) if self.dialect.is_identifier_start('@') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                // `?` or `?n` - a positional placeholder
                '?' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
                }

                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    self.tokenize_identifier_or_keyword([ch], chars)
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                // whitespace check (including unicode chars) should be last
                // as it covers some of the chars above
                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

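    /// Tokenize dollar-quoted strings and dollar placeholders, with the
    /// leading `$` about to be consumed. Illustrative inputs and outputs
    /// (derived from the branches below):
    ///
    /// ```text
    /// $$hello$$        -> DollarQuotedString { value: "hello", tag: None }
    /// $tag$hello$tag$  -> DollarQuotedString { value: "hello", tag: Some("tag") }
    /// $1               -> Placeholder("$1")
    /// ```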
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        chars.next(); // consume the leading '$'

        if let Some('$') = chars.peek() {
            // Untagged form: $$ ... $$
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Tagged form ($tag$ ... $tag$) or a placeholder like $1
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric() || ch == '_'
            }));

            if let Some('$') = chars.peek() {
                chars.next();
                s.push_str(&peeking_take_while(chars, |ch| ch != '$'));

                match chars.peek() {
                    Some('$') => {
                        chars.next();
                        // The closing tag must match the opening one
                        for c in value.chars() {
                            let next_char = chars.next();
                            if Some(c) != next_char {
                                return self.tokenizer_error(
                                    chars.location(),
                                    format!(
                                        "Unterminated dollar-quoted string at or near \"{value}\""
                                    ),
                                );
                            }
                        }

                        if let Some('$') = chars.peek() {
                            chars.next();
                        } else {
                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted string, expected $",
                            );
                        }
                    }
                    _ => {
                        return self.tokenizer_error(
                            chars.location(),
                            "Unterminated dollar-quoted, expected $",
                        );
                    }
                }
            } else {
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }

    fn tokenizer_error<R>(
        &self,
        loc: Location,
        message: impl Into<String>,
    ) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            location: loc,
        })
    }

    // Consume characters until newline
    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
        let mut comment = peeking_take_while(chars, |ch| ch != '\n');
        if let Some(ch) = chars.next() {
            assert_eq!(ch, '\n');
            comment.push(ch);
        }
        comment
    }

    /// Tokenize an identifier or keyword, after the first char is already consumed.
    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
        let mut s = first_chars.into();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

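    /// Read a single-quoted string with backslash escapes (the E'...' form);
    /// the caller has consumed the `E` and this function consumes the opening
    /// quote itself. Illustrative behavior, derived from the match arms below:
    ///
    /// ```text
    /// E'a\tb'   -> "a<TAB>b"   (\t, \n, \r are translated)
    /// E'it''s'  -> "it's"      (a doubled quote yields one quote)
    /// ```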
    fn tokenize_escaped_single_quoted_string(
        &self,
        starting_loc: Location,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();

        chars.next(); // consume the opening quote

        let mut is_escaped = false;
        while let Some(&ch) = chars.peek() {
            macro_rules! escape_control_character {
                ($ESCAPED:expr) => {{
                    if is_escaped {
                        s.push($ESCAPED);
                        is_escaped = false;
                    } else {
                        s.push(ch);
                    }

                    chars.next();
                }};
            }

            match ch {
                '\'' => {
                    chars.next(); // consume
                    if is_escaped {
                        s.push(ch);
                        is_escaped = false;
                    } else if chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                        s.push(ch);
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' => {
                    if is_escaped {
                        s.push('\\');
                        is_escaped = false;
                    } else {
                        is_escaped = true;
                    }

                    chars.next();
                }
                'r' => escape_control_character!('\r'),
                'n' => escape_control_character!('\n'),
                't' => escape_control_character!('\t'),
                _ => {
                    is_escaped = false;
                    chars.next(); // consume
                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
    }

    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        quote_style: char,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        chars.next(); // consume the opening quote

        while let Some(&ch) = chars.peek() {
            match ch {
                char if char == quote_style => {
                    chars.next(); // consume
                    if chars.peek().map(|c| *c == quote_style).unwrap_or(false) {
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' => {
                    // consume
                    chars.next();
                    // backslash escaping is specific to the MySQL dialect
                    if dialect_of!(self is MySqlDialect) {
                        if let Some(next) = chars.peek() {
                            if !self.unescape {
                                // In no-escape mode, the given query has to be
                                // saved completely, including backslashes.
                                s.push(ch);
                                s.push(*next);
                                chars.next(); // consume next
                            } else {
                                // See https://dev.mysql.com/doc/refman/8.0/en/string-literals.html#character-escape-sequences
                                let n = match next {
                                    '\'' | '\"' | '\\' | '%' | '_' => *next,
                                    '0' => '\0',
                                    'b' => '\u{8}',
                                    'n' => '\n',
                                    'r' => '\r',
                                    't' => '\t',
                                    'Z' => '\u{1a}',
                                    _ => *next,
                                };
                                s.push(n);
                                chars.next(); // consume next
                            }
                        }
                    } else {
                        s.push(ch);
                    }
                }
                _ => {
                    chars.next(); // consume
                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }

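    /// Consume a multi-line comment, with the opening `/*` already consumed.
    /// A depth counter tracks nesting, so the following input is a single
    /// comment token (see the `tokenize_nested_multiline_comment` test):
    ///
    /// ```text
    /// /* outer /* inner */ still outer */
    /// ```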
    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
        let mut last_ch = ' ';

        loop {
            match chars.next() {
                Some(ch) => {
                    if last_ch == '/' && ch == '*' {
                        nested += 1;
                    } else if last_ch == '*' && ch == '/' {
                        nested -= 1;
                        if nested == 0 {
                            s.pop();
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        }
                    }
                    s.push(ch);
                    last_ch = ch;
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    )
                }
            }
        }
    }

    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
        let mut last_char = None;
        let mut s = String::new();
        while let Some(ch) = chars.next() {
            if ch == quote_end {
                if chars.peek() == Some(&quote_end) {
                    chars.next();
                    s.push(ch);
                    if !self.unescape {
                        // In no-escape mode, the given query has to be saved completely
                        s.push(ch);
                    }
                } else {
                    last_char = Some(quote_end);
                    break;
                }
            } else {
                s.push(ch);
            }
        }
        (s, last_char)
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut State,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

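/// Read from `chars` until `predicate` returns false or EOF is hit; only
/// matching characters are consumed, so the first non-matching character is
/// left in the stream. Illustrative behavior:
///
/// ```text
/// input: "123abc", predicate: is_ascii_digit  ->  returns "123", leaves "abc"
/// ```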
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dialect::{ClickHouseDialect, GenericDialect, MsSqlDialect};

    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column 1");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_clickhouse_double_equal() {
        let sql = String::from("SELECT foo=='1'");
        let dialect = ClickHouseDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "foo".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::DoubleEq,
            Token::SingleQuotedString("1".to_string()),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }

    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "this is a comment\n".to_string(),
            }),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_nested_multiline_comment() {
        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                location: Location { line: 1, column: 1 },
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_snowflake_div() {
        let sql = r#"field/1000"#;
        let dialect = SnowflakeDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word(r#"field"#, None),
            Token::Div,
            Token::Number("1000".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithLocation::new(Token::make_keyword("SELECT"), 1, 1),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 1, 7),
            TokenWithLocation::new(Token::make_word("a", None), 1, 8),
            TokenWithLocation::new(Token::Comma, 1, 9),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Newline), 1, 10),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 2, 1),
            TokenWithLocation::new(Token::make_word("b", None), 2, 2),
        ];
        compare(expected, tokens);
    }

    fn compare<T: PartialEq + std::fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        assert_eq!(expected, actual);
    }
}