#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::num::NonZeroU8;
use core::str::Chars;
use core::{cmp, fmt};
use core::{iter::Peekable, str};

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};

use crate::dialect::Dialect;
use crate::dialect::{
    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
    SnowflakeDialect,
};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use crate::{
    ast::{DollarQuotedString, QuoteDelimitedString},
    dialect::HiveDialect,
};

/// A SQL token produced by the tokenizer
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the `bool` flags a trailing `L` (long) suffix
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted string: i.e: '''string'''
    TripleSingleQuotedString(String),
    /// Triple double quoted string: i.e: """string"""
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string'
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Triple single quoted byte string literal: i.e: b'''string'''
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted byte string literal: i.e: b"""string"""
    TripleDoubleQuotedByteStringLiteral(String),
    /// Raw string literal: i.e: r'string' or R'string'
    SingleQuotedRawStringLiteral(String),
    /// Raw string literal: i.e: r"string" or R"string"
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted raw string literal: i.e: r'''string'''
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted raw string literal: i.e: r"""string"""
    TripleDoubleQuotedRawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// Quote-delimited string literal: i.e: q'(string)'
    QuoteDelimitedStringLiteral(QuoteDelimitedString),
    /// "National" quote-delimited string literal: i.e: Nq'(string)'
    NationalQuoteDelimitedStringLiteral(QuoteDelimitedString),
    /// "Escaped" string literal: i.e: E'string'
    EscapedStringLiteral(String),
    /// Unicode string literal: i.e: U&'string'
    UnicodeStringLiteral(String),
    /// Hexadecimal string literal: i.e: X'deadbeef'
    HexStringLiteral(String),
    /// Comma `,`
    Comma,
    /// Whitespace (space, tab, newline, or comment)
    Whitespace(Whitespace),
    // The remaining variants are operator and punctuation tokens; the
    // `Display` impl below maps each one to its symbol.
    DoubleEq,
    Eq,
    Neq,
    Lt,
    Gt,
    LtEq,
    GtEq,
    Spaceship,
    Plus,
    Minus,
    Mul,
    Div,
    /// DuckDB integer division `//`
    DuckIntDiv,
    Mod,
    StringConcat,
    LParen,
    RParen,
    Period,
    Colon,
    DoubleColon,
    /// Assignment `:=`
    Assignment,
    SemiColon,
    Backslash,
    LBracket,
    RBracket,
    Ampersand,
    Pipe,
    Caret,
    LBrace,
    RBrace,
    /// Right arrow `=>`
    RArrow,
    Sharp,
    DoubleSharp,
    Tilde,
    TildeAsterisk,
    ExclamationMarkTilde,
    ExclamationMarkTildeAsterisk,
    DoubleTilde,
    DoubleTildeAsterisk,
    ExclamationMarkDoubleTilde,
    ExclamationMarkDoubleTildeAsterisk,
    ShiftLeft,
    ShiftRight,
    Overlap,
    ExclamationMark,
    DoubleExclamationMark,
    AtSign,
    CaretAt,
    PGSquareRoot,
    PGCubeRoot,
    /// A prepared-statement placeholder such as `?` or `$1`
    Placeholder(String),
    Arrow,
    LongArrow,
    HashArrow,
    AtDashAt,
    QuestionMarkDash,
    AmpersandLeftAngleBracket,
    AmpersandRightAngleBracket,
    AmpersandLeftAngleBracketVerticalBar,
    VerticalBarAmpersandRightAngleBracket,
    TwoWayArrow,
    LeftAngleBracketCaret,
    RightAngleBracketCaret,
    QuestionMarkSharp,
    QuestionMarkDashVerticalBar,
    QuestionMarkDoubleVerticalBar,
    TildeEqual,
    ShiftLeftVerticalBar,
    VerticalBarShiftRight,
    VerticalBarRightAngleBracket,
    HashLongArrow,
    AtArrow,
    ArrowAt,
    HashMinus,
    AtQuestion,
    AtAt,
    Question,
    QuestionAnd,
    QuestionPipe,
    /// A custom binary operator defined by the dialect
    CustomBinaryOperator(String),
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::QuoteDelimitedStringLiteral(ref s) => s.fmt(f),
            Token::NationalQuoteDelimitedStringLiteral(ref s) => write!(f, "N{s}"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::Assignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::DoubleSharp => f.write_str("##"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::AtDashAt => f.write_str("@-@"),
            Token::QuestionMarkDash => f.write_str("?-"),
            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
            Token::AmpersandRightAngleBracket => f.write_str("&>"),
            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
            Token::VerticalBarRightAngleBracket => f.write_str("|>"),
            Token::TwoWayArrow => f.write_str("<->"),
            Token::LeftAngleBracketCaret => f.write_str("<^"),
            Token::RightAngleBracketCaret => f.write_str(">^"),
            Token::QuestionMarkSharp => f.write_str("?#"),
            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
            Token::TildeEqual => f.write_str("~="),
            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
            Token::VerticalBarShiftRight => f.write_str("|>>"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
            Token::Question => write!(f, "?"),
            Token::QuestionAnd => write!(f, "?&"),
            Token::QuestionPipe => write!(f, "?|"),
            Token::CustomBinaryOperator(s) => f.write_str(s),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

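    /// Build a [`Token::Word`] from raw text, resolving `keyword` only when
    /// the word is unquoted. A minimal sketch of the behavior:
    ///
    /// ```
    /// # use sqlparser::keywords::Keyword;
    /// # use sqlparser::tokenizer::Token;
    /// if let Token::Word(w) = Token::make_word("select", None) {
    ///     assert_eq!(w.keyword, Keyword::SELECT);
    /// }
    /// // Quoted words never resolve to keywords:
    /// if let Token::Word(w) = Token::make_word("select", Some('"')) {
    ///     assert_eq!(w.keyword, Keyword::NoKeyword);
    /// }
    /// ```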
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes
    pub value: String,
    /// The starting quote character, if the identifier was quoted
    /// (e.g. `"` for ANSI delimited identifiers, `[` for MS SQL,
    /// or a backtick for MySQL)
    pub quote_style: Option<char>,
    /// The keyword this word matches, or [`Keyword::NoKeyword`] if the
    /// word was quoted or does not match any known keyword
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"',
            '[' => ']',
            '`' => '`',
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}

/// Location in the source text, as a line and column number
#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1.
    ///
    /// Note: line 0 is used for empty spans
    pub line: u64,
    /// Column number, starting from 1.
    ///
    /// Note: column 0 is used for empty spans
    pub column: u64,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Empty (unknown) locations print nothing
        if self.line == 0 {
            return Ok(());
        }
        write!(f, " at Line: {}, Column: {}", self.line, self.column)
    }
}

impl fmt::Debug for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Location({},{})", self.line, self.column)
    }
}

impl Location {
    /// Return an "empty" (unknown) location: line 0, column 0
    pub fn empty() -> Self {
        Self { line: 0, column: 0 }
    }

    /// Create a new `Location` for a given line and column
    pub fn new(line: u64, column: u64) -> Self {
        Self { line, column }
    }

    /// Create a new location for a given line and column.
    /// Alias for [`Self::new`]
    pub fn of(line: u64, column: u64) -> Self {
        Self::new(line, column)
    }

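    /// Combine `self` and `end` into a new [`Span`]. A minimal sketch:
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Location, Span};
    /// let span = Location::new(1, 1).span_to(Location::new(1, 8));
    /// assert_eq!(span, Span::new(Location::new(1, 1), Location::new(1, 8)));
    /// ```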
    pub fn span_to(self, end: Self) -> Span {
        Span { start: self, end }
    }
}

impl From<(u64, u64)> for Location {
    fn from((line, column): (u64, u64)) -> Self {
        Self { line, column }
    }
}

/// A span of source text, from a start to an end [`Location`]
#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    pub start: Location,
    pub end: Location,
}

impl fmt::Debug for Span {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Span({:?}..{:?})", self.start, self.end)
    }
}

impl Span {
    /// An empty span `(0, 0)..(0, 0)`, used as a sentinel when the
    /// source location is unknown
    const EMPTY: Span = Self::empty();

    /// Create a new span from a start and end [`Location`]
    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    /// Return an empty span `(0, 0)..(0, 0)`.
    ///
    /// Empty spans are used as placeholders when the location is not known.
    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }

    /// Return the smallest Span that contains both `self` and `other`.
    /// Empty spans are ignored, so that `(0, 0)` does not propagate.
    pub fn union(&self, other: &Span) -> Span {
        // If either span is empty, return the other
        match (self, other) {
            (&Span::EMPTY, _) => *other,
            (_, &Span::EMPTY) => *self,
            _ => Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            },
        }
    }

    /// Same as [`Span::union`], but `None` is treated as an empty span
    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        match other {
            Some(other) => self.union(other),
            None => *self,
        }
    }

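    /// Return the smallest [`Span`] that contains all the given spans, or
    /// [`Span::empty`] if the iterator yields nothing. A minimal sketch:
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Location, Span};
    /// let spans = vec![
    ///     Span::new(Location::new(1, 1), Location::new(1, 5)),
    ///     Span::new(Location::new(2, 1), Location::new(2, 3)),
    /// ];
    /// assert_eq!(
    ///     Span::union_iter(spans),
    ///     Span::new(Location::new(1, 1), Location::new(2, 3))
    /// );
    /// ```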
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        iter.into_iter()
            .reduce(|acc, item| acc.union(&item))
            .unwrap_or(Span::empty())
    }
}

/// Backwards compatibility alias for [`TokenWithSpan`]
#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;

/// A [`Token`] with the corresponding [`Span`] in the source text
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    pub token: Token,
    pub span: Span,
}

impl TokenWithSpan {
    /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`]
    pub fn new(token: Token, span: Span) -> Self {
        Self { token, span }
    }

    /// Wrap a [`Token`] with an empty [`Span`]
    pub fn wrap(token: Token) -> Self {
        Self::new(token, Span::empty())
    }

    /// Wrap a [`Token`] with a [`Span`] from `start` to `end`
    pub fn at(token: Token, start: Location, end: Location) -> Self {
        Self::new(token, Span::new(start, end))
    }

    /// Return an EOF token with an empty [`Span`]
    pub fn new_eof() -> Self {
        Self::wrap(Token::EOF)
    }
}

impl PartialEq<Token> for TokenWithSpan {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithSpan> for Token {
    fn eq(&self, other: &TokenWithSpan) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithSpan {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

/// Tokenizer error
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}{}", self.message, self.location)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}

impl State<'_> {
    /// Return the next character and advance the location,
    /// or None if the end of the query was reached
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    /// Return the next character without advancing the location
    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}

/// Represents how many quote characters enclose a string literal
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// e.g. `"abc"`
    One,
    /// e.g. `"""abc"""`
    Many(NonZeroU8),
}

/// Settings for the `tokenize_quoted_string` helper
struct TokenizeQuotedStringSettings {
    /// The quote character that delimits the string
    quote_style: char,
    /// How many quote characters enclose the string
    num_quote_chars: NumStringQuoteChars,
    /// The number of opening quotes the tokenizer still has to consume
    /// before reading the string body (opening quotes already consumed
    /// by the caller are excluded)
    num_opening_quotes_to_consume: u8,
    /// Whether backslash escape sequences are recognized in the string
    backslash_escape: bool,
}

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    /// If true (the default), the tokenizer will un-escape literal values
    /// (for example, `''` within a single-quoted string becomes `'`)
    unescape: bool,
}

impl<'a> Tokenizer<'a> {
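    /// Create a new SQL tokenizer for the specified SQL statement.
    ///
    /// A minimal usage sketch (illustrative, not exhaustive):
    ///
    /// ```
    /// # use sqlparser::dialect::GenericDialect;
    /// # use sqlparser::tokenizer::{Token, Tokenizer};
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 'foo'").tokenize().unwrap();
    /// assert_eq!(
    ///     tokens.last(),
    ///     Some(&Token::SingleQuotedString("foo".to_string()))
    /// );
    /// ```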
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }

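    /// Set whether to unescape string literals while tokenizing (true by
    /// default). When disabled, tokens keep the raw text of the literal.
    /// A sketch of the difference, assuming the generic dialect:
    ///
    /// ```
    /// # use sqlparser::dialect::GenericDialect;
    /// # use sqlparser::tokenizer::{Token, Tokenizer};
    /// let dialect = GenericDialect {};
    /// // With unescaping (the default): '' collapses to '
    /// let tokens = Tokenizer::new(&dialect, "'a''b'").tokenize().unwrap();
    /// assert_eq!(tokens[0], Token::SingleQuotedString("a'b".to_string()));
    ///
    /// // Without unescaping: the raw text is preserved
    /// let tokens = Tokenizer::new(&dialect, "'a''b'")
    ///     .with_unescape(false)
    ///     .tokenize()
    ///     .unwrap();
    /// assert_eq!(tokens[0], Token::SingleQuotedString("a''b".to_string()));
    /// ```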
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }

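    /// Tokenize the statement and produce a vector of tokens.
    ///
    /// A minimal sketch:
    ///
    /// ```
    /// # use sqlparser::dialect::GenericDialect;
    /// # use sqlparser::tokenizer::{Token, Tokenizer};
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 1").tokenize().unwrap();
    /// assert_eq!(tokens[0], Token::make_keyword("SELECT"));
    /// ```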
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;
        Ok(twl.into_iter().map(|t| t.token).collect())
    }

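    /// Tokenize the statement and produce a vector of tokens with their
    /// source spans.
    ///
    /// A minimal sketch: the first token of a statement starts at
    /// line 1, column 1.
    ///
    /// ```
    /// # use sqlparser::dialect::GenericDialect;
    /// # use sqlparser::tokenizer::Tokenizer;
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 1")
    ///     .tokenize_with_location()
    ///     .unwrap();
    /// assert_eq!(tokens[0].span.start.line, 1);
    /// assert_eq!(tokens[0].span.start.column, 1);
    /// ```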
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
        let mut tokens: Vec<TokenWithSpan> = vec![];
        self.tokenize_with_location_into_buf(&mut tokens)
            .map(|_| tokens)
    }

    /// Tokenize the statement and append tokens with location information
    /// into the provided buffer. If an error occurs, the buffer contains
    /// all tokens that were successfully parsed before the error.
    pub fn tokenize_with_location_into_buf(
        &mut self,
        buf: &mut Vec<TokenWithSpan>,
    ) -> Result<(), TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let span = location.span_to(state.location());

            buf.push(TokenWithSpan { token, span });

            location = state.location();
        }
        Ok(())
    }

    /// Tokenize the identifier or keyword starting with `ch`
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // If the word consists only of digits and dots, it is actually a number
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }

    /// Get the next token, or return None at end of input
    fn next_token(
        &self,
        chars: &mut State,
        prev_token: Option<&Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
                {
                    chars.next(); // consume the 'B' or 'b'
                    match chars.peek() {
                        Some('\'') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '\'',
                                        false,
                                        Token::SingleQuotedByteStringLiteral,
                                        Token::TripleSingleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '"',
                                        false,
                                        Token::DoubleQuotedByteStringLiteral,
                                        Token::TripleDoubleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // Regular identifier starting with a "B" or "b"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume the 'R' or 'r'
                    match chars.peek() {
                        Some('\'') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                false,
                                Token::SingleQuotedRawStringLiteral,
                                Token::TripleSingleQuotedRawStringLiteral,
                            ),
                        Some('\"') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                false,
                                Token::DoubleQuotedRawStringLiteral,
                                Token::TripleDoubleQuotedRawStringLiteral,
                            ),
                        _ => {
                            // Regular identifier starting with an "R" or "r"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume the 'N' or 'n'
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let backslash_escape =
                                self.dialect.supports_string_literal_backslash_escape();
                            let s =
                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        Some(&q @ 'q') | Some(&q @ 'Q')
                            if self.dialect.supports_quote_delimited_string() =>
                        {
                            chars.next(); // consume the 'q' or 'Q'
                            if let Some('\'') = chars.peek() {
                                self.tokenize_quote_delimited_string(chars, &[n, q])
                                    .map(|s| Some(Token::NationalQuoteDelimitedStringLiteral(s)))
                            } else {
                                let s = self.tokenize_word(String::from_iter([n, q]), chars);
                                Ok(Some(Token::make_word(&s, None)))
                            }
                        }
                        _ => {
                            // Regular identifier starting with an "N" or "n"
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                q @ 'Q' | q @ 'q' if self.dialect.supports_quote_delimited_string() => {
                    chars.next(); // consume the 'Q' or 'q'
                    if let Some('\'') = chars.peek() {
                        self.tokenize_quote_delimited_string(chars, &[q])
                            .map(|s| Some(Token::QuoteDelimitedStringLiteral(s)))
                    } else {
                        let s = self.tokenize_word(q, chars);
                        Ok(Some(Token::make_word(&s, None)))
                    }
                }
                // PostgreSQL-style "escape" string constants: E'...'
                x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
                    let starting_loc = chars.location();
                    chars.next(); // consume the 'E' or 'e'
                    match chars.peek() {
                        Some('\'') => {
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // Regular identifier starting with an "E" or "e"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Unicode string literals: U&'...'
                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
                    chars.next(); // consume the 'U' or 'u'
                    if chars.peek() == Some(&'&') {
                        // Look ahead on a clone of the iterator: the '&' may only be
                        // consumed if a quote actually follows it.
                        let mut chars_clone = chars.peekable.clone();
                        chars_clone.next(); // consume the '&' in the clone
                        if chars_clone.peek() == Some(&'\'') {
                            chars.next(); // consume the '&' in the original iterator
                            let s = unescape_unicode_single_quoted_string(chars)?;
                            return Ok(Some(Token::UnicodeStringLiteral(s)));
                        }
                    }
                    // Regular identifier starting with a "U" or "u"
                    let s = self.tokenize_word(x, chars);
                    Ok(Some(Token::make_word(&s, None)))
                }
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume the 'x' or 'X'
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a <hexadecimal character string literal>
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // Regular identifier starting with an "X" or "x"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // single quoted string
                '\'' => {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::SingleQuotedString,
                                Token::TripleSingleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '\'',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // double quoted string
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::DoubleQuotedString,
                                Token::TripleDoubleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '"',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if self.dialect.is_delimited_identifier_start(ch) => {
                    let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                    Ok(Some(Token::make_word(&word, Some(quote_start))))
                }
                // nested delimited (quoted) identifier
                quote_start
                    if self
                        .dialect
                        .is_nested_delimited_identifier_start(quote_start)
                        && self
                            .dialect
                            .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                            .is_some() =>
                {
                    let Some((quote_start, nested_quote_start)) = self
                        .dialect
                        .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                    else {
                        return self.tokenizer_error(
                            chars.location(),
                            format!("Expected nested delimiter '{quote_start}' before EOF."),
                        );
                    };

                    let Some(nested_quote_start) = nested_quote_start else {
                        let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                        return Ok(Some(Token::make_word(&word, Some(quote_start))));
                    };

                    let mut word = vec![];
                    let quote_end = Word::matching_end_quote(quote_start);
                    let nested_quote_end = Word::matching_end_quote(nested_quote_start);
                    let error_loc = chars.location();

                    chars.next(); // skip the first delimiter
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&nested_quote_start) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
                        );
                    }
                    word.push(nested_quote_start.into());
                    word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
                    word.push(nested_quote_end.into());
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&quote_end) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        );
                    }
                    chars.next(); // skip the close delimiter
                    Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
                }
                // numbers and period
                '0'..='9' | '.' => {
                    // Special case: if `._` follows a word, the word is a table
                    // name and the `_` starts the column name, so the `.` is a
                    // Period token rather than the start of a number.
                    if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
                        if let Some(Token::Word(_)) = prev_token {
                            chars.next();
                            return Ok(Some(Token::Period));
                        }

                        return self.tokenizer_error(
                            chars.location(),
                            "Unexpected character '_'".to_string(),
                        );
                    }

                    // Some dialects support underscores as numeric separators;
                    // a separator must be followed by another digit.
                    let is_number_separator = |ch: char, next_char: Option<char>| {
                        self.dialect.supports_numeric_literal_underscores()
                            && ch == '_'
                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
                    };

                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // match hex literals that start with 0x
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
                        });
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }

                    // If the dialect supports identifiers with a numeric prefix
                    // and only a dot was consumed so far, a preceding Word means
                    // this dot separates a table name from a column name, so
                    // yield it as a dedicated Period token.
                    if s == "." && self.dialect.supports_numeric_prefix() {
                        if let Some(Token::Word(_)) = prev_token {
                            return Ok(Some(Token::Period));
                        }
                    }

                    s += &peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // No digits at all -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // Parse an optional exponent part
                    let mut exponent_part = String::new();
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        // Optional sign
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // Definitely an exponent; bring the original iterator
                            // up to speed and use it.
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            // Not an exponent; discard the lookahead work.
                            _ => (),
                        }
                    }

                    // If the dialect supports identifiers with a numeric prefix,
                    // check whether this value is actually an identifier and
                    // must be tokenized as a word.
                    if self.dialect.supports_numeric_prefix() {
                        if exponent_part.is_empty() {
                            // Not a number with an exponent, so it may be an
                            // identifier starting with digits.
                            let word =
                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                            if !word.is_empty() {
                                s += word.as_str();
                                return Ok(Some(Token::make_word(s.as_str(), None)));
                            }
                        } else if prev_token == Some(&Token::Period) {
                            // The previous token was a period, so this value is
                            // part of a compound identifier, not a number.
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }

                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'

                    match chars.peek() {
                        Some('-') => {
                            let mut is_comment = true;
                            if self.dialect.requires_single_line_comment_whitespace() {
                                is_comment = Some(' ') == chars.peekable.clone().nth(1);
                            }

                            if is_comment {
                                chars.next(); // consume the second '-'
                                let comment = self.tokenize_single_line_comment(chars);
                                return Ok(Some(Token::Whitespace(
                                    Whitespace::SingleLineComment {
                                        prefix: "--".to_owned(),
                                        comment,
                                    },
                                )));
                            }

                            self.start_binop(chars, "-", Token::Minus)
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
                                _ => self.start_binop(chars, "->", Token::Arrow),
                            }
                        }
                        // a regular '-' operator
                        _ => self.start_binop(chars, "-", Token::Minus),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => {
                    chars.next(); // consume the '%'
                    match chars.peek() {
                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "%", Token::Mod),
                    }
                }
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => {
                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
                                }
                                _ => self.start_binop(chars, "||", Token::StringConcat),
                            }
                        }
                        Some('&') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '&'
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|&>",
                                    Token::VerticalBarAmpersandRightAngleBracket,
                                ),
                                _ => self.start_binop_opt(chars, "|&", None),
                            }
                        }
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '>'
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|>>",
                                    Token::VerticalBarShiftRight,
                                ),
                                _ => self.start_binop_opt(chars, "|>", None),
                            }
                        }
                        Some('>') if self.dialect.supports_pipe_operator() => {
                            self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
                        }
                        // a regular '|' operator
                        _ => self.start_binop(chars, "|", Token::Pipe),
                    }
                }
                '=' => {
                    chars.next(); // consume the '='
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume the '!'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                Some('~') => {
                                    chars.next();
                                    match chars.peek() {
                                        Some('*') => self.consume_and_return(
                                            chars,
                                            Token::ExclamationMarkDoubleTildeAsterisk,
                                        ),
                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
                                    }
                                }
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume the '<'
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
                                _ => self.start_binop(chars, "<=", Token::LtEq),
                            }
                        }
                        Some('|') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
                        }
                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the second '<'
                            match chars.peek() {
                                Some('|') => self.consume_for_binop(
                                    chars,
                                    "<<|",
                                    Token::ShiftLeftVerticalBar,
                                ),
                                _ => self.start_binop(chars, "<<", Token::ShiftLeft),
                            }
                        }
                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '-'
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
                                }
                                _ => self.start_binop_opt(chars, "<-", None),
                            }
                        }
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
                        }
                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
                        _ => self.start_binop(chars, "<", Token::Lt),
                    }
                }
                '>' => {
                    chars.next(); // consume the '>'
                    match chars.peek() {
                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
                        }
                        _ => self.start_binop(chars, ">", Token::Gt),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::Assignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => {
                    chars.next(); // consume the '&'
                    match chars.peek() {
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
                        }
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '<'
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::AmpersandLeftAngleBracketVerticalBar,
                                ),
                                _ => {
                                    self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
                                }
                            }
                        }
                        Some('&') => {
                            chars.next(); // consume the second '&'
                            self.start_binop(chars, "&&", Token::Overlap)
                        }
                        // a regular '&' operator
                        _ => self.start_binop(chars, "&", Token::Ampersand),
                    }
                }
                '^' => {
                    chars.next(); // consume the '^'
                    match chars.peek() {
                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
                        _ => Ok(Some(Token::Caret)),
                    }
                }
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
                {
                    chars.next(); // consume the '#', starting a single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); // consume the '~'
                    match chars.peek() {
                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
                        Some('=') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "~=", Token::TildeEqual)
                        }
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => {
                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
                                }
                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
                            }
                        }
                        _ => self.start_binop(chars, "~", Token::Tilde),
                    }
                }
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
                                }
                                _ => self.start_binop(chars, "#>", Token::HashArrow),
                            }
                        }
                        Some(' ') => Ok(Some(Token::Sharp)),
                        Some('#') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "##", Token::DoubleSharp)
                        }
                        Some(sch) if self.dialect.is_identifier_start('#') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "#", Token::Sharp),
                    }
                }
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('@') if self.dialect.supports_geometric_types() => {
                            self.consume_and_return(chars, Token::AtAt)
                        }
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            match chars.peek() {
                                Some('@') => self.consume_and_return(chars, Token::AtDashAt),
                                _ => self.start_binop_opt(chars, "@-", None),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => {
                            chars.next();
                            match chars.peek() {
                                Some(' ') => Ok(Some(Token::AtAt)),
                                Some(tch) if self.dialect.is_identifier_start('@') => {
                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
                                }
                                _ => Ok(Some(Token::AtAt)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::AtSign)),
                        // Break on quotes: a quote is never a valid identifier
                        // start, so yield a plain '@' (e.g. MySQL's @'var').
                        Some('\'') => Ok(Some(Token::AtSign)),
                        Some('\"') => Ok(Some(Token::AtSign)),
                        Some('`') => Ok(Some(Token::AtSign)),
                        Some(sch) if self.dialect.is_identifier_start('@') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                '?' if self.dialect.supports_geometric_types() => {
                    chars.next(); // consume the '?'
                    match chars.peek() {
                        Some('|') => {
                            chars.next();
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::QuestionMarkDoubleVerticalBar,
                                ),
                                _ => Ok(Some(Token::QuestionPipe)),
                            }
                        }

                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
                        Some('-') => {
                            chars.next(); // consume the '-'
                            match chars.peek() {
                                Some('|') => self
                                    .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
                                _ => Ok(Some(Token::QuestionMarkDash)),
                            }
                        }
                        Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
                        _ => Ok(Some(Token::Question)),
                    }
                }
                '?' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
                }

                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    self.tokenize_identifier_or_keyword([ch], chars)
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                // The whitespace check (including unicode chars) must come last,
                // as it covers some of the characters matched above.
                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    /// Consume the next character, then parse a custom binary operator
    /// starting with `prefix`; fall back to `default` otherwise
    fn consume_for_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char of the operator
        self.start_binop_opt(chars, prefix, Some(default))
    }

    /// Parse a custom binary operator starting with `prefix`,
    /// falling back to `default`
    fn start_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        self.start_binop_opt(chars, prefix, Some(default))
    }

    /// Parse a custom binary operator starting with `prefix`. If no custom
    /// operator characters follow, return `default`, or error when
    /// `default` is `None`
    fn start_binop_opt(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Option<Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut custom = None;
        while let Some(&ch) = chars.peek() {
            if !self.dialect.is_custom_operator_part(ch) {
                break;
            }

            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
            chars.next();
        }
        match (custom, default) {
            (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
            (None, Some(tok)) => Ok(Some(tok)),
            (None, None) => self.tokenizer_error(
                chars.location(),
                format!("Expected a valid binary operator after '{prefix}'"),
            ),
        }
    }

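    /// Tokenize a dollar-preceded value (a string or a placeholder).
    ///
    /// A short orientation note (paraphrased from the code below): `$$...$$`
    /// and `$tag$...$tag$` produce [`Token::DollarQuotedString`], while `$1`
    /// (and a bare `$name` on dialects that support dollar placeholders)
    /// produces [`Token::Placeholder`].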
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        chars.next(); // consume the '$'

        // A second '$' (when the dialect does not use '$' in placeholders)
        // starts an untagged dollar-quoted string: $$...$$
        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric()
                    || ch == '_'
                    // '$' is allowed inside placeholders on dialects that support it
                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
            }));

            // A following '$' means this is a tagged dollar-quoted string:
            // $tag$...$tag$
            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
                chars.next();

                let mut temp = String::new();
                let end_delimiter = format!("${value}$");

                loop {
                    match chars.next() {
                        Some(ch) => {
                            temp.push(ch);

                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }
                        }
                        None => {
                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }

                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted, expected $",
                            );
                        }
                    }
                }
            } else {
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }

    fn tokenizer_error<R>(
        &self,
        loc: Location,
        message: impl Into<String>,
    ) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            location: loc,
        })
    }

    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
        let mut comment = peeking_take_while(chars, |ch| match ch {
            '\n' => false,                                           // Always stop at \n
            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres
            _ => true,                                               // Keep consuming
        });

        if let Some(ch) = chars.next() {
            assert!(ch == '\n' || ch == '\r');
            comment.push(ch);
        }

        comment
    }

    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
        let mut s = first_chars.into();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

    /// Read a quoted identifier such as `"ident"`, consuming the opening
    /// quote and requiring the matching end quote before EOF
    fn tokenize_quoted_identifier(
        &self,
        quote_start: char,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        let error_loc = chars.location();
        chars.next(); // consume the opening quote
        let quote_end = Word::matching_end_quote(quote_start);
        let (s, last_char) = self.parse_quoted_ident(chars, quote_end);

        if last_char == Some(quote_end) {
            Ok(s)
        } else {
            self.tokenizer_error(
                error_loc,
                format!("Expected close delimiter '{quote_end}' before EOF."),
            )
        }
    }

    /// Read an escaped string literal like `E'...'`, where the leading `E`
    /// has been consumed and `chars` points at the opening quote
    fn tokenize_escaped_single_quoted_string(
        &self,
        starting_loc: Location,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        if let Some(s) = unescape_single_quoted_string(chars) {
            return Ok(s);
        }

        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
    }

    /// Read a string literal that starts with one or three quote characters,
    /// such as `'abc'` or `'''abc'''`
    fn tokenize_single_or_triple_quoted_string<F>(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
        single_quote_token: F,
        triple_quote_token: F,
    ) -> Result<Option<Token>, TokenizerError>
    where
        F: Fn(String) -> Token,
    {
        let error_loc = chars.location();

        let mut num_opening_quotes = 0u8;
        for _ in 0..3 {
            if Some(&quote_style) == chars.peek() {
                chars.next(); // consume the quote
                num_opening_quotes += 1;
            } else {
                break;
            }
        }

        let (token_fn, num_quote_chars) = match num_opening_quotes {
            1 => (single_quote_token, NumStringQuoteChars::One),
            2 => {
                // Exactly two quote characters is an empty string;
                // both quotes have already been consumed.
                return Ok(Some(single_quote_token("".into())));
            }
            3 => {
                let Some(num_quote_chars) = NonZeroU8::new(3) else {
                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
                };
                (
                    triple_quote_token,
                    NumStringQuoteChars::Many(num_quote_chars),
                )
            }
            _ => {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        };

        let settings = TokenizeQuotedStringSettings {
            quote_style,
            num_quote_chars,
            // the opening quotes were already consumed above
            num_opening_quotes_to_consume: 0,
            backslash_escape,
        };

        self.tokenize_quoted_string(chars, settings)
            .map(token_fn)
            .map(Some)
    }

    /// Read a single-quoted string; the opening quote has not yet
    /// been consumed
    fn tokenize_single_quoted_string(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
    ) -> Result<String, TokenizerError> {
        self.tokenize_quoted_string(
            chars,
            TokenizeQuotedStringSettings {
                quote_style,
                num_quote_chars: NumStringQuoteChars::One,
                num_opening_quotes_to_consume: 1,
                backslash_escape,
            },
        )
    }

    /// Read a quote-delimited string literal like `q'(...)'`, assuming the
    /// `literal_prefix` (e.g. `q` or `Nq`) has already been consumed and the
    /// next character is the opening single quote
    fn tokenize_quote_delimited_string(
        &self,
        chars: &mut State,
        literal_prefix: &[char],
    ) -> Result<QuoteDelimitedString, TokenizerError> {
        let literal_start_loc = chars.location();
        chars.next(); // consume the opening single quote

        let start_quote_loc = chars.location();
        let (start_quote, end_quote) = match chars.next() {
            None | Some(' ') | Some('\t') | Some('\r') | Some('\n') => {
                return self.tokenizer_error(
                    start_quote_loc,
                    format!(
                        "Invalid space, tab, newline, or EOF after '{}''",
                        String::from_iter(literal_prefix)
                    ),
                );
            }
            Some(c) => (
                c,
                // Bracket-like delimiters pair with their closing counterpart;
                // any other character closes with itself.
                match c {
                    '[' => ']',
                    '{' => '}',
                    '<' => '>',
                    '(' => ')',
                    c => c,
                },
            ),
        };

        let mut value = String::new();
        while let Some(ch) = chars.next() {
            if ch == end_quote {
                if let Some('\'') = chars.peek() {
                    chars.next(); // consume the closing single quote
                    return Ok(QuoteDelimitedString {
                        start_quote,
                        value,
                        end_quote,
                    });
                }
            }
            value.push(ch);
        }

        self.tokenizer_error(literal_start_loc, "Unterminated string literal")
    }

    /// Read a quoted string according to `settings`, which specify the quote
    /// style, how many opening quotes remain to be consumed, and whether
    /// backslash escapes are honored
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        // consume any opening quotes not yet consumed by the caller
        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume the final quote char
                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // For an n-quoted string the preceding n-1 quote chars
                        // were already pushed into `s` while scanning, so trim
                        // them off the end of the buffer.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be
                            // saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    // consume the backslash
                    chars.next();

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape
                            || (self.dialect.ignores_wildcard_escapes()
                                && (*next == '%' || *next == '_'))
                        {
                            // In no-escape mode the query is saved completely,
                            // including backslashes. Likewise, when wildcard
                            // escapes are ignored, the backslash is kept
                            // before `%` and `_`.
                            s.push(ch);
                            s.push(*next);
                            chars.next(); // consume next
                        } else {
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next(); // consume next
                        }
                    }
                }
                ch => {
                    chars.next(); // consume
                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }

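    /// Consume a multi-line comment body after the opening `/*`, emitting a
    /// [`Whitespace::MultiLineComment`] token. Nested `/* ... */` pairs are
    /// tracked when the dialect supports nested comments.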
    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
        let supports_nested_comments = self.dialect.supports_nested_comments();

        loop {
            match chars.next() {
                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
                    chars.next(); // consume the '*'
                    s.push('/');
                    s.push('*');
                    nested += 1;
                }
                Some('*') if matches!(chars.peek(), Some('/')) => {
                    chars.next(); // consume the '/'
                    nested -= 1;
                    if nested == 0 {
                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                    }
                    s.push('*');
                    s.push('/');
                }
                Some(ch) => {
                    s.push(ch);
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    );
                }
            }
        }
    }

    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
        let mut last_char = None;
        let mut s = String::new();
        while let Some(ch) = chars.next() {
            if ch == quote_end {
                if chars.peek() == Some(&quote_end) {
                    // a doubled end-quote is an escaped quote character
                    chars.next();
                    s.push(ch);
                    if !self.unescape {
                        // In no-escape mode, the given query has to be
                        // saved completely
                        s.push(ch);
                    }
                } else {
                    last_char = Some(quote_end);
                    break;
                }
            } else {
                s.push(ch);
            }
        }
        (s, last_char)
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut State,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

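/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Return the characters read as a String, leaving the first non-matching
/// character available via `chars.next()`.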
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

/// Same as [`peeking_take_while`], but the predicate can also inspect the
/// character following the current one.
fn peeking_next_take_while(
    chars: &mut State,
    mut predicate: impl FnMut(char, Option<char>) -> bool,
) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        let next_char = chars.peekable.clone().nth(1);
        if predicate(ch, next_char) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
    Unescape::new(chars).unescape()
}

struct Unescape<'a: 'b, 'b> {
    chars: &'b mut State<'a>,
}

impl<'a: 'b, 'b> Unescape<'a, 'b> {
    fn new(chars: &'b mut State<'a>) -> Self {
        Self { chars }
    }

    fn unescape(mut self) -> Option<String> {
        let mut unescaped = String::new();

        self.chars.next(); // consume the opening quote

        while let Some(c) = self.chars.next() {
            if c == '\'' {
                // a doubled quote is an escaped quote
                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                    self.chars.next();
                    unescaped.push('\'');
                    continue;
                }
                return Some(unescaped);
            }

            if c != '\\' {
                unescaped.push(c);
                continue;
            }

            let c = match self.chars.next()? {
                'b' => '\u{0008}',
                'f' => '\u{000C}',
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                'u' => self.unescape_unicode_16()?,
                'U' => self.unescape_unicode_32()?,
                'x' => self.unescape_hex()?,
                c if c.is_digit(8) => self.unescape_octal(c)?,
                c => c,
            };

            unescaped.push(Self::check_null(c)?);
        }

        None
    }

    #[inline]
    fn check_null(c: char) -> Option<char> {
        if c == '\0' {
            None
        } else {
            Some(c)
        }
    }

    #[inline]
    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
        // only 7-bit ASCII values are accepted here
        match u32::from_str_radix(s, RADIX) {
            Err(_) => None,
            Ok(n) => {
                let n = n & 0xFF;
                if n <= 127 {
                    char::from_u32(n)
                } else {
                    None
                }
            }
        }
    }

    // Hexadecimal byte escape: \x with up to two hex digits
    fn unescape_hex(&mut self) -> Option<char> {
        let mut s = String::new();

        for _ in 0..2 {
            match self.next_hex_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        // a bare \x with no hex digits is the literal 'x'
        if s.is_empty() {
            return Some('x');
        }

        Self::byte_to_char::<16>(&s)
    }

    #[inline]
    fn next_hex_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
            _ => None,
        }
    }

    // Octal byte escape: one to three octal digits
    fn unescape_octal(&mut self, c: char) -> Option<char> {
        let mut s = String::new();

        s.push(c);
        for _ in 0..2 {
            match self.next_octal_digest() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        Self::byte_to_char::<8>(&s)
    }

    #[inline]
    fn next_octal_digest(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_digit(8) => self.chars.next(),
            _ => None,
        }
    }

    // Unicode escape: \u followed by exactly four hex digits
    fn unescape_unicode_16(&mut self) -> Option<char> {
        self.unescape_unicode::<4>()
    }

    // Unicode escape: \U followed by exactly eight hex digits
    fn unescape_unicode_32(&mut self) -> Option<char> {
        self.unescape_unicode::<8>()
    }

    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
        let mut s = String::new();
        for _ in 0..NUM {
            s.push(self.chars.next()?);
        }
        match u32::from_str_radix(&s, 16) {
            Err(_) => None,
            Ok(n) => char::from_u32(n),
        }
    }
}

fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
    let mut unescaped = String::new();
    chars.next(); // consume the opening quote
    while let Some(c) = chars.next() {
        match c {
            '\'' => {
                if chars.peek() == Some(&'\'') {
                    chars.next();
                    unescaped.push('\'');
                } else {
                    return Ok(unescaped);
                }
            }
            '\\' => match chars.peek() {
                Some('\\') => {
                    chars.next();
                    unescaped.push('\\');
                }
                Some('+') => {
                    chars.next();
                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
                }
                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
            },
            _ => {
                unescaped.push(c);
            }
        }
    }
    Err(TokenizerError {
        message: "Unterminated unicode encoded string literal".to_string(),
        location: chars.location(),
    })
}

fn take_char_from_hex_digits(
    chars: &mut State<'_>,
    max_digits: usize,
) -> Result<char, TokenizerError> {
    let mut result = 0u32;
    for _ in 0..max_digits {
        let next_char = chars.next().ok_or_else(|| TokenizerError {
            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
                .to_string(),
            location: chars.location(),
        })?;
        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
            message: format!("Invalid hex digit in escaped unicode string: {next_char}"),
            location: chars.location(),
        })?;
        result = result * 16 + digit;
    }
    char::from_u32(result).ok_or_else(|| TokenizerError {
        message: format!("Invalid unicode character: {result:x}"),
        location: chars.location(),
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dialect::{
        BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
    };
    use crate::test_utils::{all_dialects_except, all_dialects_where};
    use core::fmt::Debug;

    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_clickhouse_double_equal() {
        let sql = String::from("SELECT foo=='1'");
        let dialect = ClickHouseDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "foo".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::DoubleEq,
            Token::SingleQuotedString("1".to_string()),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_numeric_literal_underscore() {
        let dialect = GenericDialect {};
        let sql = String::from("SELECT 10_000");
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("10".to_string(), false),
            Token::make_word("_000", None),
        ];
        compare(expected, tokens);

        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
            "SELECT 10_000, _10_000, 10_00_, 10___0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_000".to_string(), false),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::make_word("_10_000", None),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_00".to_string(), false),
                Token::make_word("_", None),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10".to_string(), false),
                Token::make_word("___0", None),
            ],
        );
    }

    #[test]
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }

    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged() {
        let test_cases = vec![
            (
                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
                        tag: Some("tag".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$x$ab$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "x$ab".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("0$abc$$abc$1"),
                vec![
                    Token::Number("0".into(), false),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    }),
                    Token::Number("1".into(), false),
                ]
            ),
            (
                String::from("$function$abc$q$data$q$$function$"),
                vec![
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "abc$q$data$q$".into(),
                        tag: Some("function".into()),
                    }),
                ]
            ),
        ];

        let dialect = GenericDialect {};
        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated() {
        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 91
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
        let sql = String::from("SELECT $abc$abc$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 17
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_placeholder() {
        let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
        let dialect = SQLiteDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$ABC$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC".into()),
            ]
        );
    }

    #[test]
    fn tokenize_nested_dollar_quoted_strings() {
        let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "dollar $nested$ string".into(),
                tag: Some("tag".into()),
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_empty() {
        let sql = String::from("SELECT $$$$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged() {
        let sql =
            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_unterminated() {
        let sql = String::from(
            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
        );
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted string".into(),
                location: Location {
                    line: 1,
                    column: 86
                }
            })
        );
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let test_cases = vec![
            (
                String::from("0--this is a comment\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                String::from("0--this is a comment\r1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r1".to_string(),
                    }),
                ],
            ),
            (
                String::from("0--this is a comment\r\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
        ];

        let dialect = GenericDialect {};

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_comment_postgres() {
        let sql = String::from("1--\r0");

        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "\r".to_string(),
            }),
            Token::Number("0".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_nested_multiline_comment() {
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
                )),
                Token::Whitespace(Whitespace::Space),
                Token::Div,
                Token::Word(Word {
                    value: "comment".to_string(),
                    quote_style: None,
                    keyword: Keyword::COMMENT,
                }),
                Token::Mul,
                Token::Div,
                Token::Number("1".to_string(), false),
            ],
        );

        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
                )),
                Token::Number("1".to_string(), false),
            ],
        );

        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "SELECT 1/* a /* b */ c */0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
                Token::Number("0".to_string(), false),
            ],
        );
    }

    #[test]
    fn tokenize_nested_multiline_comment_empty() {
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "select 1/*/**/*/0",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
                Token::Number("0".to_string(), false),
            ],
        );
    }

    #[test]
    fn tokenize_nested_comments_if_not_supported() {
        all_dialects_except(|d| d.supports_nested_comments()).tokenizes_to(
            "SELECT 1/*/* nested comment */*/0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "/* nested comment ".to_string(),
                )),
                Token::Mul,
                Token::Div,
                Token::Number("0".to_string(), false),
            ],
        );
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                location: Location { line: 1, column: 1 },
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_like_match() {
        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_snowflake_div() {
        let sql = r#"field/1000"#;
        let dialect = SnowflakeDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word(r#"field"#, None),
            Token::Div,
            Token::Number("1000".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (1, 7).into(),
                (1, 8).into(),
            ),
            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Newline),
                (1, 10).into(),
                (2, 1).into(),
            ),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (2, 1).into(),
                (2, 2).into(),
            ),
            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
        ];
        compare(expected, tokens);
    }

    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        assert_eq!(expected, actual);
    }

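    /// Wrap `s` in single quotes and feed it to `unescape_single_quoted_string`,
    /// asserting that it unescapes to `expected` (`None` means the escape
    /// sequence is invalid and the whole literal is rejected).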
    fn check_unescape(s: &str, expected: Option<&str>) {
        let s = format!("'{s}'");
        let mut state = State {
            peekable: s.chars().peekable(),
            line: 0,
            col: 0,
        };

        assert_eq!(
            unescape_single_quoted_string(&mut state),
            expected.map(|s| s.to_string())
        );
    }

    #[test]
    fn test_unescape() {
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

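        // unicode escapes: \uXXXX takes exactly 4 hex digits, \UXXXXXXXX exactly 8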
        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None);
        check_unescape(r"\u0000", None);
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
        check_unescape(r"\U00110000", None);
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

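        // hex escapes: \x takes up to 2 hex digits; a bare \x is a literal 'x',
        // and values above 0x7F are rejected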
        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}"));
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x"));
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

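        // octal escapes: up to 3 octal digits; NUL and values outside the
        // ASCII range are rejected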
        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}"));
        check_unescape(r"\1232", Some("\u{0053}2"));
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None);
        check_unescape(r"\080", None);

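        // any other escaped character passes through unchanged, but NUL and
        // invalid hex bytes poison the whole literal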
        check_unescape(r"\9", Some("9"));
        check_unescape(r"''", Some("'"));
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }

    #[test]
    fn tokenize_numeric_prefix_trait() {
        #[derive(Debug)]
        struct NumericPrefixDialect;

        impl Dialect for NumericPrefixDialect {
            fn is_identifier_start(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '$'
            }

            fn is_identifier_part(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '_'
                    || ch == '$'
                    || ch == '{'
                    || ch == '}'
            }

            fn supports_numeric_prefix(&self) -> bool {
                true
            }
        }

        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
        tokenize_numeric_prefix_inner(&HiveDialect {});
        tokenize_numeric_prefix_inner(&MySqlDialect {});
    }

    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
        let sql = r#"SELECT * FROM 1"#;
        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_string_escape() {
        let dialect = SnowflakeDialect {};
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
            (r#"'\\'"#, r#"\\"#, r#"\"#),
            (
                r#"'\0\a\b\f\n\r\t\Z'"#,
                r#"\0\a\b\f\n\r\t\Z"#,
                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
            ),
            (r#"'\"'"#, r#"\""#, "\""),
            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
            (r#"'\q'"#, r#"\q"#, r#"q"#),
            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
        ] {
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected.to_string())];
            compare(expected, tokens);

            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }

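        // in dialects without backslash escaping, a trailing backslash is just
        // a literal character, so these strings tokenize successfully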
        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
            let dialect = GenericDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }

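        // MySQL keeps \% and \_ as-is so they can act as literal wildcard
        // characters in LIKE patterns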
        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
            let dialect = MySqlDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_triple_quoted_string() {
        fn check<F>(
            q: char,
            r: char,
            quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            for (sql, expected, expected_unescaped) in [
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let dialect = SnowflakeDialect {};
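        // Snowflake has no triple-quoted strings: '''''' is a single-quoted
        // string containing two escaped quotes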
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn test_mysql_users_grantees() {
        let dialect = MySqlDialect {};

        let sql = "CREATE USER `root`@`%`";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("CREATE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("USER"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("root", Some('`')),
            Token::AtSign,
            Token::make_word("%", Some('`')),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_string_literal() {
        let dialect = MySqlDialect {};

        let sql = "SELECT @'1'";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::SingleQuotedString("1".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_quoted_column() {
        let dialect = MySqlDialect {};

        let sql = r#"SELECT @"bar" FROM foo"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::DoubleQuotedString("bar".to_string()),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_national_strings_backslash_escape_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\'",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("''\\".to_string()),
                ],
            );
    }

    #[test]
    fn test_national_strings_backslash_escape_supported() {
        all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\''",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("'''".to_string()),
                ],
            );
    }

    #[test]
    fn test_string_escape_constant_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("e", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );

        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("E", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );
    }

    #[test]
    fn test_string_escape_constant_supported() {
        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );
    }

    #[test]
    fn test_whitespace_required_after_single_line_comment() {
        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                    Token::SingleQuotedString("abc".to_string()),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                ],
            );
    }

    #[test]
    fn test_whitespace_not_required_after_single_line_comment() {
        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "".to_string(),
                    }),
                ],
            );
    }

    #[test]
    fn test_tokenize_identifiers_numeric_prefix() {
        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        );
    }

    #[test]
    fn tokenize_period_underscore() {
        let sql = String::from("SELECT table._col");
        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "table".to_string(),
                quote_style: None,
                keyword: Keyword::TABLE,
            }),
            Token::Period,
            Token::Word(Word {
                value: "_col".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
        ];

        compare(expected, tokens);

        let sql = String::from("SELECT ._123");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }

        let sql = String::from("SELECT ._abc");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }
    }

    #[test]
    fn tokenize_question_mark() {
        let dialect = PostgreSqlDialect {};
        let sql = "SELECT x ? y";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        compare(
            tokens,
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("x", None),
                Token::Whitespace(Whitespace::Space),
                Token::Question,
                Token::Whitespace(Whitespace::Space),
                Token::make_word("y", None),
            ],
        )
    }
}