1#[cfg(not(feature = "std"))]
25use alloc::{
26 borrow::ToOwned,
27 format,
28 string::{String, ToString},
29 vec,
30 vec::Vec,
31};
32use core::iter::Peekable;
33use core::num::NonZeroU8;
34use core::str::Chars;
35use core::{cmp, fmt};
36
37#[cfg(feature = "serde")]
38use serde::{Deserialize, Serialize};
39
40#[cfg(feature = "visitor")]
41use sqltk_parser_derive::{Visit, VisitMut};
42
43use crate::ast::DollarQuotedString;
44use crate::dialect::Dialect;
45use crate::dialect::{
46 BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
47 SnowflakeDialect,
48};
49use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
50
/// SQL token enumeration: every lexical unit the tokenizer can produce.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the `bool` flags a trailing `L` (long) suffix
    Number(String, bool),
    /// A character that could not start any other token
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted string: i.e: '''string'''
    TripleSingleQuotedString(String),
    /// Triple double quoted string: i.e: """string"""
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: B'string'
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Triple single quoted byte string literal: i.e: B'''string'''
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted byte string literal: i.e: B"""string"""
    TripleDoubleQuotedByteStringLiteral(String),
    /// Raw string literal: i.e: R'string'
    SingleQuotedRawStringLiteral(String),
    /// Raw string literal: i.e: R"string"
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted raw string literal: i.e: R'''string'''
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted raw string literal: i.e: R"""string"""
    TripleDoubleQuotedRawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// "escaped" string literal, which are an extension to the SQL standard: i.e: E'first \n second'
    EscapedStringLiteral(String),
    /// Unicode string literal: i.e: U&'first \000A second'
    UnicodeStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Spaceship operator `<=>`
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Integer division operator `//` in DuckDB
    DuckIntDiv,
    /// Modulo Operator `%`
    Mod,
    /// String concatenation `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Assignment `:=` (used for keyword argument in DuckDB macros)
    Assignment,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
    /// Sharp `#` used for PostgreSQL Bitwise XOR operator
    Sharp,
    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
    Tilde,
    /// `~*` , a case insensitive match regular expression operator in PostgreSQL
    TildeAsterisk,
    /// `!~` , a case sensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*` , a case insensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `~~`, a case sensitive LIKE expression operator in PostgreSQL
    DoubleTilde,
    /// `~~*`, a case insensitive ILIKE regular expression operator in PostgreSQL
    DoubleTildeAsterisk,
    /// `!~~`, a case sensitive NOT LIKE regular expression operator in PostgreSQL
    ExclamationMarkDoubleTilde,
    /// `!~~*`, a case insensitive NOT ILIKE regular expression operator in PostgreSQL
    ExclamationMarkDoubleTildeAsterisk,
    /// `<<`, a bitwise shift left operator in PostgreSQL
    ShiftLeft,
    /// `>>`, a bitwise shift right operator in PostgreSQL
    ShiftRight,
    /// `&&`, an overlap operator in PostgreSQL (and other dialects)
    Overlap,
    /// Exclamation Mark `!` used for PostgreSQL factorial operator
    ExclamationMark,
    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// AtSign `@` used for PostgreSQL abs operator
    AtSign,
    /// `^@`, a "starts with" string operator in PostgreSQL
    CaretAt,
    /// `|/`, a square root math operator in PostgreSQL
    PGSquareRoot,
    /// `||/`, a cube root math operator in PostgreSQL
    PGCubeRoot,
    /// `?` or `$` , a prepared statement arg placeholder
    Placeholder(String),
    /// `->`, used as a operator to extract json field in PostgreSQL
    Arrow,
    /// `->>`, used as a operator to extract json field as text in PostgreSQL
    LongArrow,
    /// `#>`, extracts JSON sub-object at the specified path
    HashArrow,
    /// `#>>`, extracts JSON sub-object at the specified path as text
    HashLongArrow,
    /// `@>`, a "contains" operator in PostgreSQL
    AtArrow,
    /// `<@`, an "is contained by" operator in PostgreSQL
    ArrowAt,
    /// `#-`, deletes a key from a JSON object in PostgreSQL
    HashMinus,
    /// `@?`, a JSON-path predicate check operator in PostgreSQL
    AtQuestion,
    /// `@@`, a text-search match operator in PostgreSQL (also used by other dialects)
    AtAt,
    /// `?`, a JSON key-exists operator in PostgreSQL (when not a placeholder)
    Question,
    /// `?&`, do all of these JSON keys exist (PostgreSQL)
    QuestionAnd,
    /// `?|`, do any of these JSON keys exist (PostgreSQL)
    QuestionPipe,
    /// Dialect-specific custom binary operator, collected by `start_binop`
    CustomBinaryOperator(String),
}
247
248impl fmt::Display for Token {
249 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
250 match self {
251 Token::EOF => f.write_str("EOF"),
252 Token::Word(ref w) => write!(f, "{w}"),
253 Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
254 Token::Char(ref c) => write!(f, "{c}"),
255 Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
256 Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
257 Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
258 Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
259 Token::DollarQuotedString(ref s) => write!(f, "{s}"),
260 Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
261 Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
262 Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
263 Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
264 Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
265 Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
266 Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
267 Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
268 Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
269 Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
270 Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
271 Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
272 Token::Comma => f.write_str(","),
273 Token::Whitespace(ws) => write!(f, "{ws}"),
274 Token::DoubleEq => f.write_str("=="),
275 Token::Spaceship => f.write_str("<=>"),
276 Token::Eq => f.write_str("="),
277 Token::Neq => f.write_str("<>"),
278 Token::Lt => f.write_str("<"),
279 Token::Gt => f.write_str(">"),
280 Token::LtEq => f.write_str("<="),
281 Token::GtEq => f.write_str(">="),
282 Token::Plus => f.write_str("+"),
283 Token::Minus => f.write_str("-"),
284 Token::Mul => f.write_str("*"),
285 Token::Div => f.write_str("/"),
286 Token::DuckIntDiv => f.write_str("//"),
287 Token::StringConcat => f.write_str("||"),
288 Token::Mod => f.write_str("%"),
289 Token::LParen => f.write_str("("),
290 Token::RParen => f.write_str(")"),
291 Token::Period => f.write_str("."),
292 Token::Colon => f.write_str(":"),
293 Token::DoubleColon => f.write_str("::"),
294 Token::Assignment => f.write_str(":="),
295 Token::SemiColon => f.write_str(";"),
296 Token::Backslash => f.write_str("\\"),
297 Token::LBracket => f.write_str("["),
298 Token::RBracket => f.write_str("]"),
299 Token::Ampersand => f.write_str("&"),
300 Token::Caret => f.write_str("^"),
301 Token::Pipe => f.write_str("|"),
302 Token::LBrace => f.write_str("{"),
303 Token::RBrace => f.write_str("}"),
304 Token::RArrow => f.write_str("=>"),
305 Token::Sharp => f.write_str("#"),
306 Token::ExclamationMark => f.write_str("!"),
307 Token::DoubleExclamationMark => f.write_str("!!"),
308 Token::Tilde => f.write_str("~"),
309 Token::TildeAsterisk => f.write_str("~*"),
310 Token::ExclamationMarkTilde => f.write_str("!~"),
311 Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
312 Token::DoubleTilde => f.write_str("~~"),
313 Token::DoubleTildeAsterisk => f.write_str("~~*"),
314 Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
315 Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
316 Token::AtSign => f.write_str("@"),
317 Token::CaretAt => f.write_str("^@"),
318 Token::ShiftLeft => f.write_str("<<"),
319 Token::ShiftRight => f.write_str(">>"),
320 Token::Overlap => f.write_str("&&"),
321 Token::PGSquareRoot => f.write_str("|/"),
322 Token::PGCubeRoot => f.write_str("||/"),
323 Token::Placeholder(ref s) => write!(f, "{s}"),
324 Token::Arrow => write!(f, "->"),
325 Token::LongArrow => write!(f, "->>"),
326 Token::HashArrow => write!(f, "#>"),
327 Token::HashLongArrow => write!(f, "#>>"),
328 Token::AtArrow => write!(f, "@>"),
329 Token::ArrowAt => write!(f, "<@"),
330 Token::HashMinus => write!(f, "#-"),
331 Token::AtQuestion => write!(f, "@?"),
332 Token::AtAt => write!(f, "@@"),
333 Token::Question => write!(f, "?"),
334 Token::QuestionAnd => write!(f, "?&"),
335 Token::QuestionPipe => write!(f, "?|"),
336 Token::CustomBinaryOperator(s) => f.write_str(s),
337 }
338 }
339}
340
341impl Token {
342 pub fn make_keyword(keyword: &str) -> Self {
343 Token::make_word(keyword, None)
344 }
345
346 pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
347 let word_uppercase = word.to_uppercase();
348 Token::Word(Word {
349 value: word.to_string(),
350 quote_style,
351 keyword: if quote_style.is_none() {
352 let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
353 keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
354 } else {
355 Keyword::NoKeyword
356 },
357 })
358 }
359}
360
/// A keyword (like SELECT) or an optionally quoted SQL identifier.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and without
    /// unescaping (applied per the tokenizer's `unescape` setting elsewhere).
    pub value: String,
    /// An identifier can be "quoted" (&lt;delimited identifier&gt; in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well
    /// (e.g. brackets `[` and backticks `` ` `` — see `matching_end_quote`).
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this will have one of the values from dialect_keywords, otherwise empty
    /// (`Keyword::NoKeyword`).
    pub keyword: Keyword,
}
377
378impl fmt::Display for Word {
379 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
380 match self.quote_style {
381 Some(s) if s == '"' || s == '[' || s == '`' => {
382 write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
383 }
384 None => f.write_str(&self.value),
385 _ => panic!("Unexpected quote_style!"),
386 }
387 }
388}
389
390impl Word {
391 fn matching_end_quote(ch: char) -> char {
392 match ch {
393 '"' => '"', '[' => ']', '`' => '`', _ => panic!("unexpected quoting style!"),
397 }
398 }
399}
400
/// The kinds of whitespace (including comments, which the tokenizer
/// treats as whitespace) that can appear between tokens.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    /// A single space character
    Space,
    /// A newline (`\r`, `\n`, and `\r\n` all normalize to this)
    Newline,
    /// A tab character
    Tab,
    /// A comment running to end of line; `prefix` is the comment introducer
    /// actually seen (`--`, `//`, or `#`)
    SingleLineComment { comment: String, prefix: String },
    /// A `/* ... */` comment; the `String` is the body without delimiters
    MultiLineComment(String),
}
411
412impl fmt::Display for Whitespace {
413 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
414 match self {
415 Whitespace::Space => f.write_str(" "),
416 Whitespace::Newline => f.write_str("\n"),
417 Whitespace::Tab => f.write_str("\t"),
418 Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
419 Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
420 }
421 }
422}
423
/// Location in input string: 1-based line and column. The sentinel value
/// `line == 0` means "no location" (see `Location::empty` and the
/// `Display` impl, which prints nothing for it).
#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1 (0 means "empty"/unknown)
    pub line: u64,
    /// Line column, starting from 1
    pub column: u64,
}
456
457impl fmt::Display for Location {
458 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
459 if self.line == 0 {
460 return Ok(());
461 }
462 write!(f, " at Line: {}, Column: {}", self.line, self.column)
463 }
464}
465
466impl fmt::Debug for Location {
467 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
468 write!(f, "Location({},{})", self.line, self.column)
469 }
470}
471
472impl Location {
473 pub fn empty() -> Self {
475 Self { line: 0, column: 0 }
476 }
477
478 pub fn new(line: u64, column: u64) -> Self {
480 Self { line, column }
481 }
482
483 pub fn of(line: u64, column: u64) -> Self {
488 Self::new(line, column)
489 }
490
491 pub fn span_to(self, end: Self) -> Span {
493 Span { start: self, end }
494 }
495}
496
497impl From<(u64, u64)> for Location {
498 fn from((line, column): (u64, u64)) -> Self {
499 Self { line, column }
500 }
501}
502
/// A half-open range of [`Location`]s identifying where a token (or AST
/// node) appeared in the source text.
#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    pub start: Location,
    pub end: Location,
}
513
514impl fmt::Debug for Span {
515 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
516 write!(f, "Span({:?}..{:?})", self.start, self.end)
517 }
518}
519
520impl Span {
521 const EMPTY: Span = Self::empty();
524
525 pub fn new(start: Location, end: Location) -> Span {
527 Span { start, end }
528 }
529
530 pub const fn empty() -> Span {
535 Span {
536 start: Location { line: 0, column: 0 },
537 end: Location { line: 0, column: 0 },
538 }
539 }
540
541 pub fn union(&self, other: &Span) -> Span {
557 match (self, other) {
560 (&Span::EMPTY, _) => *other,
561 (_, &Span::EMPTY) => *self,
562 _ => Span {
563 start: cmp::min(self.start, other.start),
564 end: cmp::max(self.end, other.end),
565 },
566 }
567 }
568
569 pub fn union_opt(&self, other: &Option<Span>) -> Span {
573 match other {
574 Some(other) => self.union(other),
575 None => *self,
576 }
577 }
578
579 pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
597 iter.into_iter()
598 .reduce(|acc, item| acc.union(&item))
599 .unwrap_or(Span::empty())
600 }
601}
602
/// Backwards-compatible alias for the renamed [`TokenWithSpan`].
#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;
606
/// A [`Token`] together with the [`Span`] of source text it was lexed from.
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    pub token: Token,
    pub span: Span,
}
636
637impl TokenWithSpan {
638 pub fn new(token: Token, span: Span) -> Self {
640 Self { token, span }
641 }
642
643 pub fn wrap(token: Token) -> Self {
645 Self::new(token, Span::empty())
646 }
647
648 pub fn at(token: Token, start: Location, end: Location) -> Self {
650 Self::new(token, Span::new(start, end))
651 }
652
653 pub fn new_eof() -> Self {
655 Self::wrap(Token::EOF)
656 }
657}
658
659impl PartialEq<Token> for TokenWithSpan {
660 fn eq(&self, other: &Token) -> bool {
661 &self.token == other
662 }
663}
664
665impl PartialEq<TokenWithSpan> for Token {
666 fn eq(&self, other: &TokenWithSpan) -> bool {
667 self == &other.token
668 }
669}
670
671impl fmt::Display for TokenWithSpan {
672 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
673 self.token.fmt(f)
674 }
675}
676
/// Tokenizer error: a message plus the [`Location`] where lexing failed.
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}
683
684impl fmt::Display for TokenizerError {
685 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
686 write!(f, "{}{}", self.message, self.location,)
687 }
688}
689
// `std::error::Error` is only available with the `std` feature; under
// `no_std` the error type exists but does not implement the trait.
#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}
692
/// Internal cursor over the query text, tracking the current 1-based
/// line/column as characters are consumed (see `State::next`).
struct State<'a> {
    // Peekable character stream over the original query string.
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}
698
699impl State<'_> {
700 pub fn next(&mut self) -> Option<char> {
702 match self.peekable.next() {
703 None => None,
704 Some(s) => {
705 if s == '\n' {
706 self.line += 1;
707 self.col = 1;
708 } else {
709 self.col += 1;
710 }
711 Some(s)
712 }
713 }
714 }
715
716 pub fn peek(&mut self) -> Option<&char> {
718 self.peekable.peek()
719 }
720
721 pub fn location(&self) -> Location {
722 Location {
723 line: self.line,
724 column: self.col,
725 }
726 }
727}
728
/// How many consecutive quote characters delimit a string literal
/// (e.g. `'s'` vs `'''s'''`). NOTE(review): `Many` is presumably used for
/// triple-quoted strings; the consuming code is outside this view — confirm.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// A single quote character on each side.
    One,
    /// A run of this many quote characters on each side.
    Many(NonZeroU8),
}
737
/// Parameters for the quoted-string tokenizing routines.
/// NOTE(review): the routines consuming this struct are outside this view;
/// field meanings below are inferred from names — confirm against them.
struct TokenizeQuotedStringSettings {
    // The quote character delimiting the string (e.g. '\'' or '"').
    quote_style: char,
    // Whether the string is single- or triple(multi)-quoted.
    num_quote_chars: NumStringQuoteChars,
    // How many opening quote characters remain to be consumed from the stream.
    num_opening_quotes_to_consume: u8,
    // Whether backslash escape sequences are honored inside the string.
    backslash_escape: bool,
}
754
/// SQL Tokenizer: splits a query string into [`Token`]s according to the
/// rules of the given [`Dialect`].
pub struct Tokenizer<'a> {
    // Dialect controlling identifier/quoting/operator rules.
    dialect: &'a dyn Dialect,
    // The SQL text being tokenized.
    query: &'a str,
    // Whether escape sequences in string literals are decoded (defaults to
    // true in `Tokenizer::new`; toggled by `with_unescape`).
    unescape: bool,
}
763
764impl<'a> Tokenizer<'a> {
765 pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
782 Self {
783 dialect,
784 query,
785 unescape: true,
786 }
787 }
788
    /// Builder-style setter for the `unescape` flag (defaults to `true`
    /// in [`Tokenizer::new`]).
    ///
    /// NOTE(review): only the flag is stored here; the escape handling that
    /// reads it lives in the quoted-string tokenizing code outside this view.
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }
823
824 pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
826 let twl = self.tokenize_with_location()?;
827 Ok(twl.into_iter().map(|t| t.token).collect())
828 }
829
830 pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
832 let mut tokens: Vec<TokenWithSpan> = vec![];
833 self.tokenize_with_location_into_buf(&mut tokens)
834 .map(|_| tokens)
835 }
836
837 pub fn tokenize_with_location_into_buf(
840 &mut self,
841 buf: &mut Vec<TokenWithSpan>,
842 ) -> Result<(), TokenizerError> {
843 let mut state = State {
844 peekable: self.query.chars().peekable(),
845 line: 1,
846 col: 1,
847 };
848
849 let mut location = state.location();
850 while let Some(token) = self.next_token(&mut state)? {
851 let span = location.span_to(state.location());
852
853 buf.push(TokenWithSpan { token, span });
854
855 location = state.location();
856 }
857 Ok(())
858 }
859
    /// Tokenize an identifier or keyword whose first character(s) the caller
    /// has already matched (passed in as `ch`); the cursor still points at
    /// the first of them, so one `chars.next()` realigns it.
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // Special case: if the "word" turned out to consist entirely of
        // digits and periods, it is actually a number, not an identifier.
        // Re-scan it (plus any further digits/periods in the stream) as a
        // numeric literal.
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            // Inner state only re-reads the already-collected word; its
            // line/col are dummies and never reported.
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }
885
    /// Get the next token, or `None` at end of input.
    ///
    /// Dispatches on the peeked character; many arms peek further before
    /// committing, since prefixes like `B'...'`, `N'...'`, `E'...'` or
    /// multi-char operators are only special in certain contexts/dialects.
    fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    chars.next();
                    // Normalize \r and \r\n into a single Newline token.
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // Byte string literals: B'...' / B"..." (dialect gated).
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
                {
                    chars.next(); match chars.peek() {
                        Some('\'') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '\'',
                                        false,
                                        Token::SingleQuotedByteStringLiteral,
                                        Token::TripleSingleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '"',
                                        false,
                                        Token::DoubleQuotedByteStringLiteral,
                                        Token::TripleDoubleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // Not followed by a quote: a regular identifier starting with b/B.
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Raw string literals: R'...' / R"..." (dialect gated).
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); match chars.peek() {
                        Some('\'') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                false,
                                Token::SingleQuotedRawStringLiteral,
                                Token::TripleSingleQuotedRawStringLiteral,
                            ),
                        Some('\"') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                false,
                                Token::DoubleQuotedRawStringLiteral,
                                Token::TripleDoubleQuotedRawStringLiteral,
                            ),
                        _ => {
                            // Regular identifier starting with r/R.
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // National string literal: N'...'.
                n @ 'N' | n @ 'n' => {
                    chars.next(); match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // Regular identifier starting with n/N.
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // PostgreSQL-style escaped string literal: E'...'.
                x @ 'e' | x @ 'E' => {
                    let starting_loc = chars.location();
                    chars.next(); match chars.peek() {
                        Some('\'') => {
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // Regular identifier starting with e/E.
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Unicode string literal: U&'...' (dialect gated). A clone of
                // the iterator is used to look two chars ahead (for &') without
                // consuming anything until the match is certain.
                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
                    chars.next(); if chars.peek() == Some(&'&') {
                        let mut chars_clone = chars.peekable.clone();
                        chars_clone.next(); if chars_clone.peek() == Some(&'\'') {
                            chars.next(); let s = unescape_unicode_single_quoted_string(chars)?;
                            return Ok(Some(Token::UnicodeStringLiteral(s)));
                        }
                    }
                    // Not U&' — a regular identifier starting with u/U.
                    let s = self.tokenize_word(x, chars);
                    Ok(Some(Token::make_word(&s, None)))
                }
                // Hex string literal: X'...'.
                x @ 'x' | x @ 'X' => {
                    chars.next(); match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // Regular identifier starting with x/X.
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Plain single-quoted string.
                '\'' => {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::SingleQuotedString,
                                Token::TripleSingleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '\'',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // Double-quoted string — only when the dialect does not treat
                // `"` as an identifier delimiter or identifier start.
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::DoubleQuotedString,
                                Token::TripleDoubleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '"',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // Delimited (quoted) identifier, e.g. "ident", [ident], `ident`.
                quote_start
                    if self.dialect.is_delimited_identifier_start(ch)
                        && self
                            .dialect
                            .is_proper_identifier_inside_quotes(chars.peekable.clone()) =>
                {
                    let error_loc = chars.location();
                    chars.next(); let quote_end = Word::matching_end_quote(quote_start);
                    let (s, last_char) = self.parse_quoted_ident(chars, quote_end);

                    if last_char == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        )
                    }
                }
                // Numeric literal (or lone '.' as Period, or 0x... hex).
                '0'..='9' | '.' => {
                    let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // Hex literal of the form 0xdeadbeef.
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_take_while(chars, |ch| ch.is_ascii_hexdigit());
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // Match at most one decimal point.
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }
                    s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // No digits at all -> the '.' is a Period token.
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // Optional exponent part (e.g. 1e10, 1E-5). A clone of the
                    // iterator is used so nothing is consumed unless a full
                    // exponent (digits included) is actually present.
                    let mut exponent_part = String::new();
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        // Optional sign after the exponent marker.
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // Definitely an exponent: bring the real iterator
                            // up to speed and take the digits.
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            _ => (),
                        }
                    }

                    // Some dialects (e.g. MySQL) allow identifiers that start
                    // with a numeric prefix — but not after an exponent.
                    if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
                        let word =
                            peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                        if !word.is_empty() {
                            s += word.as_str();
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }

                    // Optional trailing `L` marks a "long" number literal.
                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // Punctuation.
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // Operators starting with '-': --comment, ->, ->>, or minus.
                '-' => {
                    chars.next(); match chars.peek() {
                        Some('-') => {
                            chars.next(); let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "--".to_owned(),
                                comment,
                            })))
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
                                _ => self.start_binop(chars, "->", Token::Arrow),
                            }
                        }
                        _ => self.start_binop(chars, "-", Token::Minus),
                    }
                }
                // Operators starting with '/': /*comment*/, //comment or //
                // int-div (dialect dependent), or division.
                '/' => {
                    chars.next(); match chars.peek() {
                        Some('*') => {
                            chars.next(); self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                // '%' is modulo, or starts an identifier in some dialects.
                '%' => {
                    chars.next(); match chars.peek() {
                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "%", Token::Mod),
                    }
                }
                // '|', '||', '|/', '||/'.
                '|' => {
                    chars.next(); match chars.peek() {
                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); match chars.peek() {
                                Some('/') => {
                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
                                }
                                _ => self.start_binop(chars, "||", Token::StringConcat),
                            }
                        }
                        _ => self.start_binop(chars, "|", Token::Pipe),
                    }
                }
                // '=', '=>', '=='.
                '=' => {
                    chars.next(); match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                // '!', '!=', '!!', and the regex-match family !~ !~* !~~ !~~*.
                '!' => {
                    chars.next(); match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                Some('~') => {
                                    chars.next();
                                    match chars.peek() {
                                        Some('*') => self.consume_and_return(
                                            chars,
                                            Token::ExclamationMarkDoubleTildeAsterisk,
                                        ),
                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
                                    }
                                }
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                // '<', '<=', '<=>', '<>', '<<', '<@'.
                '<' => {
                    chars.next(); match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
                                _ => self.start_binop(chars, "<=", Token::LtEq),
                            }
                        }
                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
                        _ => self.start_binop(chars, "<", Token::Lt),
                    }
                }
                // '>', '>=', '>>'.
                '>' => {
                    chars.next(); match chars.peek() {
                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
                        _ => self.start_binop(chars, ">", Token::Gt),
                    }
                }
                // ':', '::', ':='.
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::Assignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                // '&', '&&'.
                '&' => {
                    chars.next(); match chars.peek() {
                        Some('&') => {
                            chars.next(); self.start_binop(chars, "&&", Token::Overlap)
                        }
                        _ => self.start_binop(chars, "&", Token::Ampersand),
                    }
                }
                // '^', '^@'.
                '^' => {
                    chars.next(); match chars.peek() {
                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
                        _ => Ok(Some(Token::Caret)),
                    }
                }
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                // '#' starts a single-line comment in some dialects.
                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect) => {
                    chars.next(); let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                // '~' and the LIKE/regex family ~* ~~ ~~*.
                '~' => {
                    chars.next(); match chars.peek() {
                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => {
                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
                                }
                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
                            }
                        }
                        _ => self.start_binop(chars, "~", Token::Tilde),
                    }
                }
                // '#', '#-', '#>', '#>>' — or an identifier in some dialects.
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
                                }
                                _ => self.start_binop(chars, "#>", Token::HashArrow),
                            }
                        }
                        Some(' ') => Ok(Some(Token::Sharp)),
                        Some(sch) if self.dialect.is_identifier_start('#') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "#", Token::Sharp),
                    }
                }
                // '@', '@>', '@?', '@@' — or an identifier in some dialects.
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => {
                            chars.next();
                            match chars.peek() {
                                Some(' ') => Ok(Some(Token::AtAt)),
                                Some(tch) if self.dialect.is_identifier_start('@') => {
                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
                                }
                                _ => Ok(Some(Token::AtAt)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::AtSign)),
                        Some(sch) if self.dialect.is_identifier_start('@') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                // '?' is a JSON operator family in PostgreSQL...
                '?' if dialect_of!(self is PostgreSqlDialect) => {
                    chars.next();
                    match chars.peek() {
                        Some('|') => self.consume_and_return(chars, Token::QuestionPipe),
                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
                        _ => self.consume_and_return(chars, Token::Question),
                    }
                }
                // ...and a (possibly numbered) placeholder elsewhere.
                '?' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
                }

                // Identifier or keyword.
                ch if self.dialect.is_identifier_start(ch) => {
                    self.tokenize_identifier_or_keyword([ch], chars)
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                // Generic whitespace check (including Unicode whitespace)
                // must come late, as it overlaps some of the chars above.
                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }
1435
1436 fn consume_for_binop(
1438 &self,
1439 chars: &mut State,
1440 prefix: &str,
1441 default: Token,
1442 ) -> Result<Option<Token>, TokenizerError> {
1443 chars.next(); self.start_binop(chars, prefix, default)
1445 }
1446
1447 fn start_binop(
1449 &self,
1450 chars: &mut State,
1451 prefix: &str,
1452 default: Token,
1453 ) -> Result<Option<Token>, TokenizerError> {
1454 let mut custom = None;
1455 while let Some(&ch) = chars.peek() {
1456 if !self.dialect.is_custom_operator_part(ch) {
1457 break;
1458 }
1459
1460 custom.get_or_insert_with(|| prefix.to_string()).push(ch);
1461 chars.next();
1462 }
1463
1464 Ok(Some(
1465 custom.map(Token::CustomBinaryOperator).unwrap_or(default),
1466 ))
1467 }
1468
    /// Tokenize a value that starts with `$`: either a dollar-quoted string
    /// (`$$body$$` or `$tag$body$tag$`) or a placeholder (`$1`, `$name`).
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        // `s` accumulates the quoted body; `value` accumulates the tag (or
        // placeholder name).
        let mut s = String::new();
        let mut value = String::new();

        chars.next();

        if let Some('$') = chars.peek() {
            // `$$ ... $$` — untagged dollar-quoted string.
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            // Scan until a `$$` pair; a lone `$` followed by anything else is
            // part of the body.
            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Read a possible tag (or placeholder name): alphanumerics and '_'.
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric() || ch == '_'
            }));

            if let Some('$') = chars.peek() {
                // `$tag$ ... $tag$` — tagged dollar-quoted string.
                chars.next();

                'searching_for_end: loop {
                    s.push_str(&peeking_take_while(chars, |ch| ch != '$'));
                    match chars.peek() {
                        Some('$') => {
                            chars.next();
                            // Candidate closer: check the chars after '$'
                            // against the tag, buffering them in `maybe_s`
                            // so they can be kept as body text on mismatch.
                            let mut maybe_s = String::from("$");
                            for c in value.chars() {
                                if let Some(next_char) = chars.next() {
                                    maybe_s.push(next_char);
                                    if next_char != c {
                                        // Not the closing tag; keep scanning.
                                        s.push_str(&maybe_s);
                                        continue 'searching_for_end;
                                    }
                                } else {
                                    return self.tokenizer_error(
                                        chars.location(),
                                        "Unterminated dollar-quoted, expected $",
                                    );
                                }
                            }
                            if chars.peek() == Some(&'$') {
                                // Full `$tag$` closer found.
                                chars.next();
                                maybe_s.push('$');
                                break 'searching_for_end;
                            } else {
                                // Tag matched but no trailing '$'; body text.
                                s.push_str(&maybe_s);
                                continue 'searching_for_end;
                            }
                        }
                        _ => {
                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted, expected $",
                            )
                        }
                    }
                }
            } else {
                // No second '$': a plain placeholder like `$1` or `$name`.
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }
1569
    /// Build an `Err(TokenizerError)` with the given location and message.
    /// Generic over `R` so it can be returned from any `Result`-producing
    /// tokenizer method.
    fn tokenizer_error<R>(
        &self,
        loc: Location,
        message: impl Into<String>,
    ) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            location: loc,
        })
    }
1580
1581 fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
1583 let mut comment = peeking_take_while(chars, |ch| ch != '\n');
1584 if let Some(ch) = chars.next() {
1585 assert_eq!(ch, '\n');
1586 comment.push(ch);
1587 }
1588 comment
1589 }
1590
1591 fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
1593 let mut s = first_chars.into();
1594 s.push_str(&peeking_take_while(chars, |ch| {
1595 self.dialect.is_identifier_part(ch)
1596 }));
1597 s
1598 }
1599
1600 fn tokenize_escaped_single_quoted_string(
1602 &self,
1603 starting_loc: Location,
1604 chars: &mut State,
1605 ) -> Result<String, TokenizerError> {
1606 if let Some(s) = unescape_single_quoted_string(chars) {
1607 return Ok(s);
1608 }
1609
1610 self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
1611 }
1612
    /// Read either a single-quoted or a triple-quoted string literal,
    /// deciding which by counting consecutive opening quote characters.
    /// `single_quote_token` / `triple_quote_token` wrap the parsed contents
    /// in the appropriate token variant.
    fn tokenize_single_or_triple_quoted_string<F>(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
        single_quote_token: F,
        triple_quote_token: F,
    ) -> Result<Option<Token>, TokenizerError>
    where
        F: Fn(String) -> Token,
    {
        let error_loc = chars.location();

        // Consume and count up to three consecutive opening quotes.
        let mut num_opening_quotes = 0u8;
        for _ in 0..3 {
            if Some(&quote_style) == chars.peek() {
                chars.next();
                num_opening_quotes += 1;
            } else {
                break;
            }
        }

        let (token_fn, num_quote_chars) = match num_opening_quotes {
            1 => (single_quote_token, NumStringQuoteChars::One),
            2 => {
                // Exactly two quotes is an empty single-quoted string:
                // the opener immediately followed by its closer.
                return Ok(Some(single_quote_token("".into())));
            }
            3 => {
                // NonZeroU8::new(3) cannot fail; the `let else` keeps the
                // code panic-free instead of unwrapping.
                let Some(num_quote_chars) = NonZeroU8::new(3) else {
                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
                };
                (
                    triple_quote_token,
                    NumStringQuoteChars::Many(num_quote_chars),
                )
            }
            _ => {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        };

        let settings = TokenizeQuotedStringSettings {
            quote_style,
            num_quote_chars,
            // The opening quotes were already consumed above.
            num_opening_quotes_to_consume: 0,
            backslash_escape,
        };

        self.tokenize_quoted_string(chars, settings)
            .map(token_fn)
            .map(Some)
    }
1669
1670 fn tokenize_single_quoted_string(
1672 &self,
1673 chars: &mut State,
1674 quote_style: char,
1675 backslash_escape: bool,
1676 ) -> Result<String, TokenizerError> {
1677 self.tokenize_quoted_string(
1678 chars,
1679 TokenizeQuotedStringSettings {
1680 quote_style,
1681 num_quote_chars: NumStringQuoteChars::One,
1682 num_opening_quotes_to_consume: 1,
1683 backslash_escape,
1684 },
1685 )
1686 }
1687
    /// Core quoted-string reader: handles doubled-quote escapes (`''`),
    /// optional backslash escapes, and both single- and triple-quote
    /// delimiters as described by `settings`.
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        // Consume any opening quotes the caller left in the stream.
        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        // Run length of quote chars seen so far; used to detect the closing
        // delimiter of triple-quoted strings.
        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            // Would one more quote char complete the closing delimiter?
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next();
                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // The previous `count - 1` quote chars were pushed
                        // into `s` as ordinary content; strip them now that
                        // they turned out to be part of the closer.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        // Doubled quote: an escaped quote character.
                        s.push(ch);
                        if !self.unescape {
                            // Preserve the escape sequence verbatim.
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        // A lone closing quote ends the string.
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    // Consume the backslash itself.
                    chars.next();

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape {
                            // Preserve the backslash escape verbatim.
                            s.push(ch);
                            s.push(*next);
                            chars.next();
                        } else {
                            // Translate the escape sequence; unknown escapes
                            // collapse to the escaped character itself.
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next();
                        }
                    }
                }
                ch => {
                    chars.next();
                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }
1789
    /// Read the body of a `/* ... */` comment whose opening `/*` has already
    /// been consumed, supporting nested comments. Returns the comment text
    /// as whitespace, or an error on EOF before the final `*/`.
    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        // Nesting depth; the already-consumed opener counts as level 1.
        let mut nested = 1;
        let mut last_ch = ' ';

        loop {
            match chars.next() {
                Some(ch) => {
                    if last_ch == '/' && ch == '*' {
                        nested += 1;
                    } else if last_ch == '*' && ch == '/' {
                        nested -= 1;
                        if nested == 0 {
                            // Drop the `*` of the final `*/`, which was
                            // already pushed into the buffer.
                            s.pop();
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        }
                    }
                    s.push(ch);
                    last_ch = ch;
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    )
                }
            }
        }
    }
1822
1823 fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
1824 let mut last_char = None;
1825 let mut s = String::new();
1826 while let Some(ch) = chars.next() {
1827 if ch == quote_end {
1828 if chars.peek() == Some("e_end) {
1829 chars.next();
1830 s.push(ch);
1831 if !self.unescape {
1832 s.push(ch);
1834 }
1835 } else {
1836 last_char = Some(quote_end);
1837 break;
1838 }
1839 } else {
1840 s.push(ch);
1841 }
1842 }
1843 (s, last_char)
1844 }
1845
    /// Consume exactly one character and yield `t`; helper for tokens whose
    /// final character is still pending in the stream.
    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut State,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
1855}
1856
1857fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
1861 let mut s = String::new();
1862 while let Some(&ch) = chars.peek() {
1863 if predicate(ch) {
1864 chars.next(); s.push(ch);
1866 } else {
1867 break;
1868 }
1869 }
1870 s
1871}
1872
/// Read a single-quoted string literal from `chars` and return its contents
/// with escape sequences resolved; `None` when the literal is unterminated
/// or contains an invalid escape.
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
    Unescape::new(chars).unescape()
}
1876
/// Helper that decodes backslash and doubled-quote escape sequences while
/// reading a single-quoted string literal from the tokenizer state.
struct Unescape<'a: 'b, 'b> {
    chars: &'b mut State<'a>,
}
1880
1881impl<'a: 'b, 'b> Unescape<'a, 'b> {
1882 fn new(chars: &'b mut State<'a>) -> Self {
1883 Self { chars }
1884 }
1885 fn unescape(mut self) -> Option<String> {
1886 let mut unescaped = String::new();
1887
1888 self.chars.next();
1889
1890 while let Some(c) = self.chars.next() {
1891 if c == '\'' {
1892 if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
1894 self.chars.next();
1895 unescaped.push('\'');
1896 continue;
1897 }
1898 return Some(unescaped);
1899 }
1900
1901 if c != '\\' {
1902 unescaped.push(c);
1903 continue;
1904 }
1905
1906 let c = match self.chars.next()? {
1907 'b' => '\u{0008}',
1908 'f' => '\u{000C}',
1909 'n' => '\n',
1910 'r' => '\r',
1911 't' => '\t',
1912 'u' => self.unescape_unicode_16()?,
1913 'U' => self.unescape_unicode_32()?,
1914 'x' => self.unescape_hex()?,
1915 c if c.is_digit(8) => self.unescape_octal(c)?,
1916 c => c,
1917 };
1918
1919 unescaped.push(Self::check_null(c)?);
1920 }
1921
1922 None
1923 }
1924
1925 #[inline]
1926 fn check_null(c: char) -> Option<char> {
1927 if c == '\0' {
1928 None
1929 } else {
1930 Some(c)
1931 }
1932 }
1933
1934 #[inline]
1935 fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
1936 match u32::from_str_radix(s, RADIX) {
1938 Err(_) => None,
1939 Ok(n) => {
1940 let n = n & 0xFF;
1941 if n <= 127 {
1942 char::from_u32(n)
1943 } else {
1944 None
1945 }
1946 }
1947 }
1948 }
1949
1950 fn unescape_hex(&mut self) -> Option<char> {
1952 let mut s = String::new();
1953
1954 for _ in 0..2 {
1955 match self.next_hex_digit() {
1956 Some(c) => s.push(c),
1957 None => break,
1958 }
1959 }
1960
1961 if s.is_empty() {
1962 return Some('x');
1963 }
1964
1965 Self::byte_to_char::<16>(&s)
1966 }
1967
1968 #[inline]
1969 fn next_hex_digit(&mut self) -> Option<char> {
1970 match self.chars.peek() {
1971 Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
1972 _ => None,
1973 }
1974 }
1975
1976 fn unescape_octal(&mut self, c: char) -> Option<char> {
1978 let mut s = String::new();
1979
1980 s.push(c);
1981 for _ in 0..2 {
1982 match self.next_octal_digest() {
1983 Some(c) => s.push(c),
1984 None => break,
1985 }
1986 }
1987
1988 Self::byte_to_char::<8>(&s)
1989 }
1990
1991 #[inline]
1992 fn next_octal_digest(&mut self) -> Option<char> {
1993 match self.chars.peek() {
1994 Some(c) if c.is_digit(8) => self.chars.next(),
1995 _ => None,
1996 }
1997 }
1998
1999 fn unescape_unicode_16(&mut self) -> Option<char> {
2001 self.unescape_unicode::<4>()
2002 }
2003
2004 fn unescape_unicode_32(&mut self) -> Option<char> {
2006 self.unescape_unicode::<8>()
2007 }
2008
2009 fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
2010 let mut s = String::new();
2011 for _ in 0..NUM {
2012 s.push(self.chars.next()?);
2013 }
2014 match u32::from_str_radix(&s, 16) {
2015 Err(_) => None,
2016 Ok(n) => char::from_u32(n),
2017 }
2018 }
2019}
2020
/// Parse the body of a Unicode string literal (`U&'...'`), starting at the
/// opening quote, translating `\\`, `\XXXX` (4 hex digits) and `\+XXXXXX`
/// (6 hex digits) escape sequences.
fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
    let mut unescaped = String::new();
    // Consume the opening quote.
    chars.next();
    while let Some(c) = chars.next() {
        match c {
            '\'' => {
                // `''` is an escaped quote; a lone `'` terminates the literal.
                if chars.peek() == Some(&'\'') {
                    chars.next();
                    unescaped.push('\'');
                } else {
                    return Ok(unescaped);
                }
            }
            '\\' => match chars.peek() {
                Some('\\') => {
                    // `\\` is a literal backslash.
                    chars.next();
                    unescaped.push('\\');
                }
                Some('+') => {
                    // `\+XXXXXX`: six-hex-digit code point.
                    chars.next();
                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
                }
                // `\XXXX`: four-hex-digit code point.
                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
            },
            _ => {
                unescaped.push(c);
            }
        }
    }
    Err(TokenizerError {
        message: "Unterminated unicode encoded string literal".to_string(),
        location: chars.location(),
    })
}
2055
2056fn take_char_from_hex_digits(
2057 chars: &mut State<'_>,
2058 max_digits: usize,
2059) -> Result<char, TokenizerError> {
2060 let mut result = 0u32;
2061 for _ in 0..max_digits {
2062 let next_char = chars.next().ok_or_else(|| TokenizerError {
2063 message: "Unexpected EOF while parsing hex digit in escaped unicode string."
2064 .to_string(),
2065 location: chars.location(),
2066 })?;
2067 let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
2068 message: format!("Invalid hex digit in escaped unicode string: {}", next_char),
2069 location: chars.location(),
2070 })?;
2071 result = result * 16 + digit;
2072 }
2073 char::from_u32(result).ok_or_else(|| TokenizerError {
2074 message: format!("Invalid unicode character: {:x}", result),
2075 location: chars.location(),
2076 })
2077}
2078
2079#[cfg(test)]
2080mod tests {
2081 use super::*;
2082 use crate::dialect::{
2083 BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect,
2084 };
2085 use core::fmt::Debug;
2086
    // TokenizerError formats as "<msg> at Line: <l>, Column: <c>" and, with
    // std, implements Error with no underlying source.
    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
    }

    // Simplest query: keyword, space, integer literal.
    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    // A float with no integer part (`.1`) is a single number token.
    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }

    // ClickHouse recognizes `==` as one DoubleEq token.
    #[test]
    fn tokenize_clickhouse_double_equal() {
        let sql = String::from("SELECT foo=='1'");
        let dialect = ClickHouseDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "foo".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::DoubleEq,
            Token::SingleQuotedString("1".to_string()),
        ];

        compare(expected, tokens);
    }
2152
    // Exponent notation: valid forms stay one number token; trailing
    // non-exponent characters split into a number plus a word.
    #[test]
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }

    // Function-call syntax: name, parens, numeric argument.
    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    // `||` is the string-concatenation operator.
    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }
    // Bitwise OR (`|`) and XOR (`^`) are single-character tokens.
    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }
2244
    // XOR is tokenized as a keyword between boolean literals.
    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    // A typical SELECT with WHERE and LIMIT.
    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    // EXPLAIN is just another leading keyword.
    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    // EXPLAIN ANALYZE adds a second keyword before the query.
    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }
2375
    // `!=` lexes as a single Neq token.
    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    // Non-identifier unicode (emoji) falls back to a Char token; unicode
    // identifier characters still form a word.
    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    // Literal newlines inside a quoted string are preserved verbatim.
    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    // An unterminated string errors at the opening quote's location.
    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }

    // Column numbers count characters, not bytes, so multi-byte
    // identifiers still produce the right error location.
    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }

    // Mixed whitespace, keywords, and non-identifier unicode in one input.
    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }
2484
    // Tagged dollar quoting: `$` and partial tags inside the body are
    // content; only the exact `$tag$` closes the string.
    #[test]
    fn tokenize_dollar_quoted_string_tagged() {
        let sql = String::from(
            "SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$",
        );
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
                tag: Some("tag".into()),
            }),
        ];
        compare(expected, tokens);
    }

    // A differently-tagged closer never terminates the string.
    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated() {
        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 91
                }
            })
        );
    }

    // Untagged `$$...$$` strings may contain lone `$` characters.
    #[test]
    fn tokenize_dollar_quoted_string_untagged() {
        let sql =
            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    // An untagged string without a closing `$$` is an error at EOF.
    #[test]
    fn tokenize_dollar_quoted_string_untagged_unterminated() {
        let sql = String::from(
            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
        );
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted string".into(),
                location: Location {
                    line: 1,
                    column: 86
                }
            })
        );
    }
2553
    // `=>` lexes as a single RArrow token.
    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    // IS and NULL are separate keyword tokens.
    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    // A `--` comment runs to and includes the newline.
    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "this is a comment\n".to_string(),
            }),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    // A `--` comment at EOF has no trailing newline.
    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    // `/* ... */` comments may span lines; delimiters are excluded.
    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }
2632
    // Nested `/* */` comments are tracked by depth; only the outermost
    // closer ends the comment.
    #[test]
    fn tokenize_nested_multiline_comment() {
        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    // `/** ... **/` keeps the inner asterisks as comment content.
    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    // Non-ASCII whitespace (EM SPACE) is treated as an ordinary space.
    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    // An unterminated delimited identifier reports the opening quote.
    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                location: Location { line: 1, column: 1 },
            })
        );
    }

    // `\n`, `\r`, and `\r\n` each count as a single newline token.
    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    // MS SQL bracketed identifiers (`[bar]`) become quoted words.
    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }
2731
2732 #[test]
2733 fn tokenize_pg_regex_match() {
2734 let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
2735 let dialect = GenericDialect {};
2736 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2737 let expected = vec![
2738 Token::make_keyword("SELECT"),
2739 Token::Whitespace(Whitespace::Space),
2740 Token::make_word("col", None),
2741 Token::Whitespace(Whitespace::Space),
2742 Token::Tilde,
2743 Token::Whitespace(Whitespace::Space),
2744 Token::SingleQuotedString("^a".into()),
2745 Token::Comma,
2746 Token::Whitespace(Whitespace::Space),
2747 Token::make_word("col", None),
2748 Token::Whitespace(Whitespace::Space),
2749 Token::TildeAsterisk,
2750 Token::Whitespace(Whitespace::Space),
2751 Token::SingleQuotedString("^a".into()),
2752 Token::Comma,
2753 Token::Whitespace(Whitespace::Space),
2754 Token::make_word("col", None),
2755 Token::Whitespace(Whitespace::Space),
2756 Token::ExclamationMarkTilde,
2757 Token::Whitespace(Whitespace::Space),
2758 Token::SingleQuotedString("^a".into()),
2759 Token::Comma,
2760 Token::Whitespace(Whitespace::Space),
2761 Token::make_word("col", None),
2762 Token::Whitespace(Whitespace::Space),
2763 Token::ExclamationMarkTildeAsterisk,
2764 Token::Whitespace(Whitespace::Space),
2765 Token::SingleQuotedString("^a".into()),
2766 ];
2767 compare(expected, tokens);
2768 }
2769
2770 #[test]
2771 fn tokenize_pg_like_match() {
2772 let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
2773 let dialect = GenericDialect {};
2774 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2775 let expected = vec![
2776 Token::make_keyword("SELECT"),
2777 Token::Whitespace(Whitespace::Space),
2778 Token::make_word("col", None),
2779 Token::Whitespace(Whitespace::Space),
2780 Token::DoubleTilde,
2781 Token::Whitespace(Whitespace::Space),
2782 Token::SingleQuotedString("_a%".into()),
2783 Token::Comma,
2784 Token::Whitespace(Whitespace::Space),
2785 Token::make_word("col", None),
2786 Token::Whitespace(Whitespace::Space),
2787 Token::DoubleTildeAsterisk,
2788 Token::Whitespace(Whitespace::Space),
2789 Token::SingleQuotedString("_a%".into()),
2790 Token::Comma,
2791 Token::Whitespace(Whitespace::Space),
2792 Token::make_word("col", None),
2793 Token::Whitespace(Whitespace::Space),
2794 Token::ExclamationMarkDoubleTilde,
2795 Token::Whitespace(Whitespace::Space),
2796 Token::SingleQuotedString("_a%".into()),
2797 Token::Comma,
2798 Token::Whitespace(Whitespace::Space),
2799 Token::make_word("col", None),
2800 Token::Whitespace(Whitespace::Space),
2801 Token::ExclamationMarkDoubleTildeAsterisk,
2802 Token::Whitespace(Whitespace::Space),
2803 Token::SingleQuotedString("_a%".into()),
2804 ];
2805 compare(expected, tokens);
2806 }
2807
2808 #[test]
2809 fn tokenize_quoted_identifier() {
2810 let sql = r#" "a "" b" "a """ "c """"" "#;
2811 let dialect = GenericDialect {};
2812 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2813 let expected = vec![
2814 Token::Whitespace(Whitespace::Space),
2815 Token::make_word(r#"a " b"#, Some('"')),
2816 Token::Whitespace(Whitespace::Space),
2817 Token::make_word(r#"a ""#, Some('"')),
2818 Token::Whitespace(Whitespace::Space),
2819 Token::make_word(r#"c """#, Some('"')),
2820 Token::Whitespace(Whitespace::Space),
2821 ];
2822 compare(expected, tokens);
2823 }
2824
2825 #[test]
2826 fn tokenize_snowflake_div() {
2827 let sql = r#"field/1000"#;
2828 let dialect = SnowflakeDialect {};
2829 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2830 let expected = vec![
2831 Token::make_word(r#"field"#, None),
2832 Token::Div,
2833 Token::Number("1000".to_string(), false),
2834 ];
2835 compare(expected, tokens);
2836 }
2837
2838 #[test]
2839 fn tokenize_quoted_identifier_with_no_escape() {
2840 let sql = r#" "a "" b" "a """ "c """"" "#;
2841 let dialect = GenericDialect {};
2842 let tokens = Tokenizer::new(&dialect, sql)
2843 .with_unescape(false)
2844 .tokenize()
2845 .unwrap();
2846 let expected = vec![
2847 Token::Whitespace(Whitespace::Space),
2848 Token::make_word(r#"a "" b"#, Some('"')),
2849 Token::Whitespace(Whitespace::Space),
2850 Token::make_word(r#"a """#, Some('"')),
2851 Token::Whitespace(Whitespace::Space),
2852 Token::make_word(r#"c """""#, Some('"')),
2853 Token::Whitespace(Whitespace::Space),
2854 ];
2855 compare(expected, tokens);
2856 }
2857
2858 #[test]
2859 fn tokenize_with_location() {
2860 let sql = "SELECT a,\n b";
2861 let dialect = GenericDialect {};
2862 let tokens = Tokenizer::new(&dialect, sql)
2863 .tokenize_with_location()
2864 .unwrap();
2865 let expected = vec![
2866 TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
2867 TokenWithSpan::at(
2868 Token::Whitespace(Whitespace::Space),
2869 (1, 7).into(),
2870 (1, 8).into(),
2871 ),
2872 TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
2873 TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
2874 TokenWithSpan::at(
2875 Token::Whitespace(Whitespace::Newline),
2876 (1, 10).into(),
2877 (2, 1).into(),
2878 ),
2879 TokenWithSpan::at(
2880 Token::Whitespace(Whitespace::Space),
2881 (2, 1).into(),
2882 (2, 2).into(),
2883 ),
2884 TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
2885 ];
2886 compare(expected, tokens);
2887 }
2888
    /// Assert that the tokenizer's output exactly matches `expected`,
    /// in both content and order.
    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        assert_eq!(expected, actual);
    }
2896
2897 fn check_unescape(s: &str, expected: Option<&str>) {
2898 let s = format!("'{}'", s);
2899 let mut state = State {
2900 peekable: s.chars().peekable(),
2901 line: 0,
2902 col: 0,
2903 };
2904
2905 assert_eq!(
2906 unescape_single_quoted_string(&mut state),
2907 expected.map(|s| s.to_string())
2908 );
2909 }
2910
2911 #[test]
2912 fn test_unescape() {
2913 check_unescape(r"\b", Some("\u{0008}"));
2914 check_unescape(r"\f", Some("\u{000C}"));
2915 check_unescape(r"\t", Some("\t"));
2916 check_unescape(r"\r\n", Some("\r\n"));
2917 check_unescape(r"\/", Some("/"));
2918 check_unescape(r"/", Some("/"));
2919 check_unescape(r"\\", Some("\\"));
2920
2921 check_unescape(r"\u0001", Some("\u{0001}"));
2923 check_unescape(r"\u4c91", Some("\u{4c91}"));
2924 check_unescape(r"\u4c916", Some("\u{4c91}6"));
2925 check_unescape(r"\u4c", None);
2926 check_unescape(r"\u0000", None);
2927 check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
2928 check_unescape(r"\U00110000", None);
2929 check_unescape(r"\U00000000", None);
2930 check_unescape(r"\u", None);
2931 check_unescape(r"\U", None);
2932 check_unescape(r"\U1010FFFF", None);
2933
2934 check_unescape(r"\x4B", Some("\u{004b}"));
2936 check_unescape(r"\x4", Some("\u{0004}"));
2937 check_unescape(r"\x4L", Some("\u{0004}L"));
2938 check_unescape(r"\x", Some("x"));
2939 check_unescape(r"\xP", Some("xP"));
2940 check_unescape(r"\x0", None);
2941 check_unescape(r"\xCAD", None);
2942 check_unescape(r"\xA9", None);
2943
2944 check_unescape(r"\1", Some("\u{0001}"));
2946 check_unescape(r"\12", Some("\u{000a}"));
2947 check_unescape(r"\123", Some("\u{0053}"));
2948 check_unescape(r"\1232", Some("\u{0053}2"));
2949 check_unescape(r"\4", Some("\u{0004}"));
2950 check_unescape(r"\45", Some("\u{0025}"));
2951 check_unescape(r"\450", Some("\u{0028}"));
2952 check_unescape(r"\603", None);
2953 check_unescape(r"\0", None);
2954 check_unescape(r"\080", None);
2955
2956 check_unescape(r"\9", Some("9"));
2958 check_unescape(r"''", Some("'"));
2959 check_unescape(
2960 r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
2961 Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
2962 );
2963 check_unescape(r"Hello\0", None);
2964 check_unescape(r"Hello\xCADRust", None);
2965 }
2966
2967 #[test]
2968 fn tokenize_numeric_prefix_trait() {
2969 #[derive(Debug)]
2970 struct NumericPrefixDialect;
2971
2972 impl Dialect for NumericPrefixDialect {
2973 fn is_identifier_start(&self, ch: char) -> bool {
2974 ch.is_ascii_lowercase()
2975 || ch.is_ascii_uppercase()
2976 || ch.is_ascii_digit()
2977 || ch == '$'
2978 }
2979
2980 fn is_identifier_part(&self, ch: char) -> bool {
2981 ch.is_ascii_lowercase()
2982 || ch.is_ascii_uppercase()
2983 || ch.is_ascii_digit()
2984 || ch == '_'
2985 || ch == '$'
2986 || ch == '{'
2987 || ch == '}'
2988 }
2989
2990 fn supports_numeric_prefix(&self) -> bool {
2991 true
2992 }
2993 }
2994
2995 tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
2996 tokenize_numeric_prefix_inner(&HiveDialect {});
2997 tokenize_numeric_prefix_inner(&MySqlDialect {});
2998 }
2999
3000 fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
3001 let sql = r#"SELECT * FROM 1"#;
3002 let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
3003 let expected = vec![
3004 Token::make_keyword("SELECT"),
3005 Token::Whitespace(Whitespace::Space),
3006 Token::Mul,
3007 Token::Whitespace(Whitespace::Space),
3008 Token::make_keyword("FROM"),
3009 Token::Whitespace(Whitespace::Space),
3010 Token::Number(String::from("1"), false),
3011 ];
3012 compare(expected, tokens);
3013 }
3014
3015 #[test]
3016 fn tokenize_quoted_string_escape() {
3017 let dialect = SnowflakeDialect {};
3018 for (sql, expected, expected_unescaped) in [
3019 (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
3020 (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
3021 (r#"'\\'"#, r#"\\"#, r#"\"#),
3022 (
3023 r#"'\0\a\b\f\n\r\t\Z'"#,
3024 r#"\0\a\b\f\n\r\t\Z"#,
3025 "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
3026 ),
3027 (r#"'\"'"#, r#"\""#, "\""),
3028 (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
3029 (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
3030 (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
3031 ] {
3032 let tokens = Tokenizer::new(&dialect, sql)
3033 .with_unescape(false)
3034 .tokenize()
3035 .unwrap();
3036 let expected = vec![Token::SingleQuotedString(expected.to_string())];
3037 compare(expected, tokens);
3038
3039 let tokens = Tokenizer::new(&dialect, sql)
3040 .with_unescape(true)
3041 .tokenize()
3042 .unwrap();
3043 let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
3044 compare(expected, tokens);
3045 }
3046
3047 for sql in [r#"'\'"#, r#"'ab\'"#] {
3048 let mut tokenizer = Tokenizer::new(&dialect, sql);
3049 assert_eq!(
3050 "Unterminated string literal",
3051 tokenizer.tokenize().unwrap_err().message.as_str(),
3052 );
3053 }
3054
3055 for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
3057 let dialect = GenericDialect {};
3058 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3059
3060 let expected = vec![Token::SingleQuotedString(expected.to_string())];
3061
3062 compare(expected, tokens);
3063 }
3064 }
3065
    #[test]
    fn tokenize_triple_quoted_string() {
        // `q` is the quote character under test, `r` the *other* quote
        // character, and `quote_token` wraps the contents in the matching
        // `Token` variant.
        fn check<F>(
            q: char, r: char, quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            // Each case: (sql, expected raw contents, expected unescaped contents).
            for (sql, expected, expected_unescaped) in [
                // Empty triple-quoted string.
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                // Backslash-escaped quote; unescaping drops the backslash.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                // Plain contents, no escapes.
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                // Runs of the other quote character pass through untouched.
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                // Short quote runs plus an escaped quote inside the body.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                // Escaped single quotes unescape to bare quotes.
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                // Control-character escapes.
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            // Truncated inputs: all of these are unterminated literals.
            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        // `""''` is an empty double-quoted string followed by an empty
        // single-quoted string — not the start of a triple-quoted token.
        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Same, with the quote kinds reversed.
        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Snowflake has no triple-quoted strings: `''''''` is one string
        // literal containing two escaped (doubled) quotes.
        let dialect = SnowflakeDialect {};
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }
3185}