#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::num::NonZeroU8;
use core::str::Chars;
use core::{cmp, fmt};
use core::{iter::Peekable, str};

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};

use crate::dialect::Dialect;
use crate::dialect::{
    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
    SnowflakeDialect,
};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use crate::{
    ast::{DollarQuotedString, QuoteDelimitedString},
    dialect::HiveDialect,
};

/// SQL token enumeration
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the flag is true when the literal
    /// carries a trailing `L` ("long") suffix
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    SingleQuotedString(String),
    DoubleQuotedString(String),
    TripleSingleQuotedString(String),
    TripleDoubleQuotedString(String),
    DollarQuotedString(DollarQuotedString),
    SingleQuotedByteStringLiteral(String),
    DoubleQuotedByteStringLiteral(String),
    TripleSingleQuotedByteStringLiteral(String),
    TripleDoubleQuotedByteStringLiteral(String),
    SingleQuotedRawStringLiteral(String),
    DoubleQuotedRawStringLiteral(String),
    TripleSingleQuotedRawStringLiteral(String),
    TripleDoubleQuotedRawStringLiteral(String),
    NationalStringLiteral(String),
    QuoteDelimitedStringLiteral(QuoteDelimitedString),
    NationalQuoteDelimitedStringLiteral(QuoteDelimitedString),
    EscapedStringLiteral(String),
    UnicodeStringLiteral(String),
    HexStringLiteral(String),
    Comma,
    Whitespace(Whitespace),
    DoubleEq,
    Eq,
    Neq,
    Lt,
    Gt,
    LtEq,
    GtEq,
    Spaceship,
    Plus,
    Minus,
    Mul,
    Div,
    /// `//`, DuckDB-style integer division
    DuckIntDiv,
    Mod,
    StringConcat,
    LParen,
    RParen,
    Period,
    Colon,
    DoubleColon,
    Assignment,
    SemiColon,
    Backslash,
    LBracket,
    RBracket,
    Ampersand,
    Pipe,
    Caret,
    LBrace,
    RBrace,
    RArrow,
    Sharp,
    DoubleSharp,
    Tilde,
    TildeAsterisk,
    ExclamationMarkTilde,
    ExclamationMarkTildeAsterisk,
    DoubleTilde,
    DoubleTildeAsterisk,
    ExclamationMarkDoubleTilde,
    ExclamationMarkDoubleTildeAsterisk,
    ShiftLeft,
    ShiftRight,
    Overlap,
    ExclamationMark,
    DoubleExclamationMark,
    AtSign,
    CaretAt,
    PGSquareRoot,
    PGCubeRoot,
    /// A query parameter placeholder such as `?1` or `$1`
    Placeholder(String),
    Arrow,
    LongArrow,
    HashArrow,
    AtDashAt,
    QuestionMarkDash,
    AmpersandLeftAngleBracket,
    AmpersandRightAngleBracket,
    AmpersandLeftAngleBracketVerticalBar,
    VerticalBarAmpersandRightAngleBracket,
    TwoWayArrow,
    LeftAngleBracketCaret,
    RightAngleBracketCaret,
    QuestionMarkSharp,
    QuestionMarkDashVerticalBar,
    QuestionMarkDoubleVerticalBar,
    TildeEqual,
    ShiftLeftVerticalBar,
    VerticalBarShiftRight,
    VerticalBarRightAngleBracket,
    HashLongArrow,
    AtArrow,
    ArrowAt,
    HashMinus,
    AtQuestion,
    AtAt,
    Question,
    QuestionAnd,
    QuestionPipe,
    /// A dialect-specific operator assembled from the dialect's custom
    /// operator characters
    CustomBinaryOperator(String),
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::QuoteDelimitedStringLiteral(ref s) => s.fmt(f),
            Token::NationalQuoteDelimitedStringLiteral(ref s) => write!(f, "N{s}"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::Assignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::DoubleSharp => f.write_str("##"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::AtDashAt => f.write_str("@-@"),
            Token::QuestionMarkDash => f.write_str("?-"),
            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
            Token::AmpersandRightAngleBracket => f.write_str("&>"),
            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
            Token::VerticalBarRightAngleBracket => f.write_str("|>"),
            Token::TwoWayArrow => f.write_str("<->"),
            Token::LeftAngleBracketCaret => f.write_str("<^"),
            Token::RightAngleBracketCaret => f.write_str(">^"),
            Token::QuestionMarkSharp => f.write_str("?#"),
            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
            Token::TildeEqual => f.write_str("~="),
            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
            Token::VerticalBarShiftRight => f.write_str("|>>"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
            Token::Question => write!(f, "?"),
            Token::QuestionAnd => write!(f, "?&"),
            Token::QuestionPipe => write!(f, "?|"),
            Token::CustomBinaryOperator(s) => f.write_str(s),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

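    /// Makes a `Token::Word` from `word` and an optional quote style,
    /// classifying unquoted words as keywords via `keyword_lookup`.
    ///
    /// A minimal doc-test sketch (assumes this module's usual public
    /// path in the published crate, `sqlparser::tokenizer`):
    ///
    /// ```
    /// # use sqlparser::tokenizer::Token;
    /// // Unquoted words are matched against the keyword list...
    /// assert_eq!(Token::make_word("SELECT", None).to_string(), "SELECT");
    /// // ...while quoted words keep their quotes when displayed.
    /// assert_eq!(Token::make_word("col", Some('"')).to_string(), "\"col\"");
    /// ```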
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        Token::Word(Word {
            keyword: keyword_lookup(word, quote_style),
            value: word.to_string(),
            quote_style,
        })
    }

    fn make_word_owned(word: String, quote_style: Option<char>) -> Self {
        Token::Word(Word {
            keyword: keyword_lookup(&word, quote_style),
            value: word,
            quote_style,
        })
    }
}

/// Looks up `word` in the sorted keyword table, ignoring ASCII case.
/// Quoted words are never keywords.
fn keyword_lookup(word: &str, quote_style: Option<char>) -> Keyword {
    if quote_style.is_some() {
        return Keyword::NoKeyword;
    }
    ALL_KEYWORDS
        .binary_search_by(|probe| {
            // The keyword table is uppercase, so compare against the
            // uppercased candidate byte by byte.
            let probe = probe.as_bytes();
            let word = word.as_bytes();
            for (p, w) in probe.iter().zip(word.iter()) {
                let cmp = p.cmp(&w.to_ascii_uppercase());
                if cmp != core::cmp::Ordering::Equal {
                    return cmp;
                }
            }
            probe.len().cmp(&word.len())
        })
        .map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The word's value, without the enclosing quotes.
    pub value: String,
    /// The starting quote character, if this was a quoted identifier.
    pub quote_style: Option<char>,
    /// The keyword kind, or `Keyword::NoKeyword` if the word is not a
    /// recognized keyword.
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"',
            '[' => ']',
            '`' => '`',
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment {
        comment: String,
        prefix: String,
    },

    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}

/// The location of a token in SQL source, in lines and columns.
#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1 (line 0 marks an empty location).
    pub line: u64,
    /// Column number, starting from 1.
    pub column: u64,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        if self.line == 0 {
            return Ok(());
        }
        write!(f, " at Line: {}, Column: {}", self.line, self.column)
    }
}

impl fmt::Debug for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Location({},{})", self.line, self.column)
    }
}

impl Location {
    pub fn empty() -> Self {
        Self { line: 0, column: 0 }
    }

    pub fn new(line: u64, column: u64) -> Self {
        Self { line, column }
    }

    pub fn of(line: u64, column: u64) -> Self {
        Self::new(line, column)
    }

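    /// Returns the span from `self` to `end`.
    ///
    /// A small doc-test sketch (assumes the crate's usual public path,
    /// `sqlparser::tokenizer`):
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Location, Span};
    /// let span = Location::of(1, 1).span_to(Location::of(1, 5));
    /// assert_eq!(span, Span::new(Location::of(1, 1), Location::of(1, 5)));
    /// ```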
    pub fn span_to(self, end: Self) -> Span {
        Span { start: self, end }
    }
}

impl From<(u64, u64)> for Location {
    fn from((line, column): (u64, u64)) -> Self {
        Self { line, column }
    }
}

/// A span of source locations, from `start` to `end`.
#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    pub start: Location,
    pub end: Location,
}

impl fmt::Debug for Span {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Span({:?}..{:?})", self.start, self.end)
    }
}

impl Span {
    const EMPTY: Span = Self::empty();

    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    /// Returns an empty span, `(0, 0) -> (0, 0)`, used when a location
    /// is not available.
    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }

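    /// Returns the smallest span that contains both `self` and `other`;
    /// empty spans are ignored.
    ///
    /// A doc-test sketch (assuming the `sqlparser::tokenizer` path):
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Location, Span};
    /// let a = Span::new(Location::of(1, 1), Location::of(1, 4));
    /// let b = Span::new(Location::of(2, 1), Location::of(2, 9));
    /// assert_eq!(a.union(&b), Span::new(Location::of(1, 1), Location::of(2, 9)));
    /// ```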
    pub fn union(&self, other: &Span) -> Span {
        // If either span is empty, return the other; otherwise take the
        // outermost start and end locations.
        match (self, other) {
            (&Span::EMPTY, _) => *other,
            (_, &Span::EMPTY) => *self,
            _ => Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            },
        }
    }

    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        match other {
            Some(other) => self.union(other),
            None => *self,
        }
    }

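    /// Returns the union of all spans produced by the iterator, or an
    /// empty span if the iterator is empty.
    ///
    /// Doc-test sketch (same path assumption as above):
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Location, Span};
    /// let spans = [
    ///     Span::new(Location::of(1, 1), Location::of(1, 4)),
    ///     Span::new(Location::of(2, 1), Location::of(2, 9)),
    /// ];
    /// assert_eq!(
    ///     Span::union_iter(spans),
    ///     Span::new(Location::of(1, 1), Location::of(2, 9)),
    /// );
    /// ```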
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        iter.into_iter()
            .reduce(|acc, item| acc.union(&item))
            .unwrap_or(Span::empty())
    }
}

#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;

/// A [`Token`] together with the [`Span`] of source text it was
/// tokenized from.
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    pub token: Token,
    pub span: Span,
}

impl TokenWithSpan {
    pub fn new(token: Token, span: Span) -> Self {
        Self { token, span }
    }

    pub fn wrap(token: Token) -> Self {
        Self::new(token, Span::empty())
    }

    pub fn at(token: Token, start: Location, end: Location) -> Self {
        Self::new(token, Span::new(start, end))
    }

    pub fn new_eof() -> Self {
        Self::wrap(Token::EOF)
    }
}

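// Allow comparing a `TokenWithSpan` directly against a `Token`, ignoring
// the span; this keeps call sites that only care about the token concise.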
impl PartialEq<Token> for TokenWithSpan {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithSpan> for Token {
    fn eq(&self, other: &TokenWithSpan) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithSpan {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

/// Tokenizer error, carrying a message and the location where it occurred.
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}{}", self.message, self.location)
    }
}

impl core::error::Error for TokenizerError {}

struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    line: u64,
    col: u64,
}

impl State<'_> {
    /// Return the next character and advance the stream, updating the
    /// line and column used for spans and error reporting.
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    /// Return the next character without advancing the stream.
    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}

/// The number of quote characters that delimit a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// e.g. `'hello'`
    One,
    /// e.g. `'''hello'''` for triple-quoted strings
    Many(NonZeroU8),
}

/// Settings for reading a quoted string literal.
struct TokenizeQuotedStringSettings {
    /// The quote character, e.g. `'` or `"`.
    quote_style: char,
    /// How many quote characters delimit the string.
    num_quote_chars: NumStringQuoteChars,
    /// How many opening quote characters remain to be consumed before
    /// reading the string body (0 if the caller consumed them already).
    num_opening_quotes_to_consume: u8,
    /// Whether backslash escape sequences are recognized in the body.
    backslash_escape: bool,
}

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    /// If true (the default), the tokenizer unescapes escape sequences
    /// inside literal strings; if false, literals are kept verbatim.
    unescape: bool,
}

impl<'a> Tokenizer<'a> {
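    /// Create a new SQL tokenizer for the given dialect and statement.
    ///
    /// A minimal doc-test sketch (assumes the published crate layout,
    /// i.e. `sqlparser::tokenizer` and `sqlparser::dialect`):
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 'foo'").tokenize().unwrap();
    /// assert_eq!(
    ///     tokens,
    ///     vec![
    ///         Token::make_word("SELECT", None),
    ///         Token::Whitespace(Whitespace::Space),
    ///         Token::SingleQuotedString("foo".to_string()),
    ///     ]
    /// );
    /// ```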
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }

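    /// Set whether string literals are unescaped during tokenization.
    ///
    /// When `false`, escape sequences and doubled quotes are preserved
    /// as written. A doc-test sketch (same path assumptions as above):
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "'a''b'")
    ///     .with_unescape(false)
    ///     .tokenize()
    ///     .unwrap();
    /// // The doubled quote is preserved rather than collapsed to `'`.
    /// assert_eq!(tokens[0], Token::SingleQuotedString("a''b".to_string()));
    /// ```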
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }

    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;
        Ok(twl.into_iter().map(|t| t.token).collect())
    }

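    /// Tokenize the statement and produce a vector of tokens with their
    /// source spans.
    ///
    /// Doc-test sketch (same path assumptions as above; locations are
    /// 1-based):
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Location, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 1")
    ///     .tokenize_with_location()
    ///     .unwrap();
    /// // `SELECT` occupies columns 1..7 on line 1.
    /// assert_eq!(tokens[0].span.start, Location::of(1, 1));
    /// assert_eq!(tokens[0].span.end, Location::of(1, 7));
    /// ```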
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
        let mut tokens: Vec<TokenWithSpan> = vec![];
        self.tokenize_with_location_into_buf(&mut tokens)
            .map(|_| tokens)
    }

    /// Tokenize the statement, appending tokens with their spans to `buf`.
    pub fn tokenize_with_location_into_buf(
        &mut self,
        buf: &mut Vec<TokenWithSpan>,
    ) -> Result<(), TokenizerError> {
        self.tokenize_with_location_into_buf_with_mapper(buf, |token| token)
    }

    /// Like [`Self::tokenize_with_location_into_buf`], but runs every
    /// produced token through `mapper` before appending it.
    pub fn tokenize_with_location_into_buf_with_mapper(
        &mut self,
        buf: &mut Vec<TokenWithSpan>,
        mut mapper: impl FnMut(TokenWithSpan) -> TokenWithSpan,
    ) -> Result<(), TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let span = location.span_to(state.location());

            match &token {
                // Multi-line comments starting with `!` carry optimizer
                // hints in dialects that support them; re-tokenize the
                // hint body so the parser sees ordinary tokens.
                Token::Whitespace(Whitespace::MultiLineComment(comment))
                    if self.dialect.supports_multiline_comment_hints()
                        && comment.starts_with('!') =>
                {
                    self.tokenize_comment_hints(comment, span, buf, &mut mapper)?;
                }
                _ => {
                    buf.push(mapper(TokenWithSpan { token, span }));
                }
            }

            location = state.location();
        }
        Ok(())
    }

    fn tokenize_comment_hints(
        &self,
        comment: &str,
        span: Span,
        buf: &mut Vec<TokenWithSpan>,
        mut mapper: impl FnMut(TokenWithSpan) -> TokenWithSpan,
    ) -> Result<(), TokenizerError> {
        // Strip the leading `!` and any version digits that follow it,
        // leaving only the hint body.
        let hint_content = comment
            .strip_prefix('!')
            .unwrap_or(comment)
            .trim_start_matches(|c: char| c.is_ascii_digit());

        if hint_content.is_empty() {
            return Ok(());
        }

        // Tokenize the hint body with a nested tokenizer, starting from
        // the original comment's location for span reporting.
        let inner = Tokenizer::new(self.dialect, hint_content).with_unescape(self.unescape);

        let mut state = State {
            peekable: hint_content.chars().peekable(),
            line: span.start.line,
            col: span.start.column,
        };

        let mut location = state.location();
        while let Some(token) = inner.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let token_span = location.span_to(state.location());
            buf.push(mapper(TokenWithSpan {
                token,
                span: token_span,
            }));
            location = state.location();
        }

        Ok(())
    }

    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // Words consisting solely of digits and periods are emitted as
        // numbers rather than identifiers.
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word_owned(word, None)))
    }

    /// Get the next token, or return None at end of input.
    fn next_token(
        &self,
        chars: &mut State,
        prev_token: Option<&Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Newline token for both \r and \r\n
                    chars.next(); // consume the carriage return
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // b/B-prefixed byte or bit string literals, in dialects
                // that support them
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
                {
                    chars.next(); // consume the b/B
                    match chars.peek() {
                        Some('\'') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '\'',
                                        false,
                                        Token::SingleQuotedByteStringLiteral,
                                        Token::TripleSingleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '"',
                                        false,
                                        Token::DoubleQuotedByteStringLiteral,
                                        Token::TripleDoubleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // not a byte string literal: continue as a regular word
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word_owned(s, None)))
                        }
                    }
                }
                // r/R-prefixed raw string literals
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume the r/R
                    match chars.peek() {
                        Some('\'') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                false,
                                Token::SingleQuotedRawStringLiteral,
                                Token::TripleSingleQuotedRawStringLiteral,
                            ),
                        Some('\"') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                false,
                                Token::DoubleQuotedRawStringLiteral,
                                Token::TripleDoubleQuotedRawStringLiteral,
                            ),
                        _ => {
                            // not a raw string literal: continue as a regular word
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word_owned(s, None)))
                        }
                    }
                }
                // N'...' national string literals, and N-prefixed
                // quote-delimited strings where supported
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume the n/N
                    match chars.peek() {
                        Some('\'') => {
                            let backslash_escape =
                                self.dialect.supports_string_literal_backslash_escape();
                            let s =
                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        Some(&q @ 'q') | Some(&q @ 'Q')
                            if self.dialect.supports_quote_delimited_string() =>
                        {
                            chars.next(); // consume the q/Q
                            if let Some('\'') = chars.peek() {
                                self.tokenize_quote_delimited_string(chars, &[n, q])
                                    .map(|s| Some(Token::NationalQuoteDelimitedStringLiteral(s)))
                            } else {
                                let s = self.tokenize_word(String::from_iter([n, q]), chars);
                                Ok(Some(Token::make_word_owned(s, None)))
                            }
                        }
                        _ => {
                            // regular identifier starting with an n or N
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word_owned(s, None)))
                        }
                    }
                }
                // q'...' quote-delimited string literals (Oracle-style)
                q @ 'Q' | q @ 'q' if self.dialect.supports_quote_delimited_string() => {
                    chars.next(); // consume the q/Q
                    if let Some('\'') = chars.peek() {
                        self.tokenize_quote_delimited_string(chars, &[q])
                            .map(|s| Some(Token::QuoteDelimitedStringLiteral(s)))
                    } else {
                        let s = self.tokenize_word(q, chars);
                        Ok(Some(Token::make_word_owned(s, None)))
                    }
                }
                // E'...' escaped string literals
                x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
                    let starting_loc = chars.location();
                    chars.next(); // consume the e/E
                    match chars.peek() {
                        Some('\'') => {
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an e or E
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word_owned(s, None)))
                        }
                    }
                }
                // U&'...' unicode string literals
                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
                    chars.next(); // consume the u/U
                    if chars.peek() == Some(&'&') {
                        // Look ahead past the `&` without consuming it,
                        // since `u&` could still start a regular identifier.
                        let mut chars_clone = chars.peekable.clone();
                        chars_clone.next(); // consume the `&` in the clone
                        if chars_clone.peek() == Some(&'\'') {
                            chars.next(); // consume the `&` for real
                            let s = unescape_unicode_single_quoted_string(chars)?;
                            return Ok(Some(Token::UnicodeStringLiteral(s)));
                        }
                    }
                    let s = self.tokenize_word(x, chars);
                    Ok(Some(Token::make_word_owned(s, None)))
                }
                // X'...' hex string literals
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume the x/X
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an x or X
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word_owned(s, None)))
                        }
                    }
                }
                '\'' => {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::SingleQuotedString,
                                Token::TripleSingleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '\'',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::DoubleQuotedString,
                                Token::TripleDoubleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '"',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if self.dialect.is_delimited_identifier_start(ch) => {
                    let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                    Ok(Some(Token::make_word_owned(word, Some(quote_start))))
                }
                // nested delimited (quoted) identifier, in dialects that
                // support one quoting style nested inside another
                quote_start
                    if self
                        .dialect
                        .is_nested_delimited_identifier_start(quote_start)
                        && self
                            .dialect
                            .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                            .is_some() =>
                {
                    let Some((quote_start, nested_quote_start)) = self
                        .dialect
                        .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                    else {
                        return self.tokenizer_error(
                            chars.location(),
                            format!("Expected nested delimiter '{quote_start}' before EOF."),
                        );
                    };

                    let Some(nested_quote_start) = nested_quote_start else {
                        let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                        return Ok(Some(Token::make_word_owned(word, Some(quote_start))));
                    };

                    let mut word = vec![];
                    let quote_end = Word::matching_end_quote(quote_start);
                    let nested_quote_end = Word::matching_end_quote(nested_quote_start);
                    let error_loc = chars.location();

                    chars.next(); // consume the opening quote
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&nested_quote_start) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
                        );
                    }
                    word.push(nested_quote_start.into());
                    word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
                    word.push(nested_quote_end.into());
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&quote_end) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        );
                    }
                    chars.next(); // consume the closing quote

                    Ok(Some(Token::make_word_owned(
                        word.concat(),
                        Some(quote_start),
                    )))
                }
                // numbers and period
                '0'..='9' | '.' => {
                    // `.` followed by `_` is only valid as a Period after
                    // a word token; otherwise the `_` is unexpected here.
                    if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
                        if let Some(Token::Word(_)) = prev_token {
                            chars.next();
                            return Ok(Some(Token::Period));
                        }

                        return self.tokenizer_error(
                            chars.location(),
                            "Unexpected character '_'".to_string(),
                        );
                    }

                    // Some dialects allow `_` as a digit separator, but
                    // only directly before another digit.
                    let is_number_separator = |ch: char, next_char: Option<char>| {
                        self.dialect.supports_numeric_literal_underscores()
                            && ch == '_'
                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
                    };

                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // `0x` followed by hex digits is a hex literal
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
                        });
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }

                    // A lone `.` after a word is a Period when the dialect
                    // allows identifiers with numeric prefixes.
                    if s == "." && self.dialect.supports_numeric_prefix() {
                        if let Some(Token::Word(_)) = prev_token {
                            return Ok(Some(Token::Period));
                        }
                    }

                    s += &peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // no number -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // Parse the exponent using a lookahead clone, so that
                    // nothing is consumed unless a full exponent follows.
                    let mut exponent_part = String::new();
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        // optional sign
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // definitely an exponent: commit the lookahead
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            _ => (),
                        }
                    }

                    // If the dialect supports identifiers with numeric
                    // prefixes, keep consuming identifier characters and
                    // emit a Word instead of a Number.
                    if self.dialect.supports_numeric_prefix() {
                        if exponent_part.is_empty() {
                            let word =
                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                            if !word.is_empty() {
                                s += word.as_str();
                                return Ok(Some(Token::make_word_owned(s, None)));
                            }
                        } else if prev_token == Some(&Token::Period) {
                            // after a period, the numeric-looking part
                            // belongs to a compound identifier
                            return Ok(Some(Token::make_word_owned(s, None)));
                        }
                    }

                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'
                    match chars.peek() {
                        Some('-') => {
                            let mut is_comment = true;
                            // Some dialects require whitespace after `--`
                            // for it to start a comment.
                            if self.dialect.requires_single_line_comment_whitespace() {
                                is_comment = chars
                                    .peekable
                                    .clone()
                                    .nth(1)
                                    .is_some_and(char::is_whitespace);
                            }

                            if is_comment {
                                chars.next(); // consume the second '-'
                                let comment = self.tokenize_single_line_comment(chars);
                                return Ok(Some(Token::Whitespace(
                                    Whitespace::SingleLineComment {
                                        prefix: "--".to_owned(),
                                        comment,
                                    },
                                )));
                            }

                            self.start_binop(chars, "-", Token::Minus)
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
                                _ => self.start_binop(chars, "->", Token::Arrow),
                            }
                        }
                        // a regular '-' operator
                        _ => self.start_binop(chars, "-", Token::Minus),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', beginning a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', beginning a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => {
                    chars.next(); // consume the '%'
                    match chars.peek() {
                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "%", Token::Mod),
                    }
                }
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => {
                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
                                }
                                _ => self.start_binop(chars, "||", Token::StringConcat),
                            }
                        }
                        Some('&') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '&'
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|&>",
                                    Token::VerticalBarAmpersandRightAngleBracket,
                                ),
                                // `|&` alone is not a recognized operator
                                _ => self.start_binop_opt(chars, "|&", None),
                            }
                        }
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '>'
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|>>",
                                    Token::VerticalBarShiftRight,
                                ),
                                // `|>` alone is not a recognized operator here
                                _ => self.start_binop_opt(chars, "|>", None),
                            }
                        }
                        Some('>') if self.dialect.supports_pipe_operator() => {
                            self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
                        }
                        // a regular '|' operator
                        _ => self.start_binop(chars, "|", Token::Pipe),
                    }
                }
                '=' => {
                    chars.next(); // consume the '='
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume the '!'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                Some('~') => {
                                    chars.next();
                                    match chars.peek() {
                                        Some('*') => self.consume_and_return(
                                            chars,
                                            Token::ExclamationMarkDoubleTildeAsterisk,
                                        ),
                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
                                    }
                                }
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume the '<'
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
                                // stop at `<=`: a following `+` or `-` is a
                                // sign, not part of a custom operator
                                Some('+') | Some('-') => Ok(Some(Token::LtEq)),
                                _ => self.start_binop(chars, "<=", Token::LtEq),
                            }
                        }
                        Some('|') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
                        }
                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the second '<'
                            match chars.peek() {
                                Some('|') => self.consume_for_binop(
                                    chars,
                                    "<<|",
                                    Token::ShiftLeftVerticalBar,
                                ),
                                _ => self.start_binop(chars, "<<", Token::ShiftLeft),
                            }
                        }
                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
                        // stop at `<`: a following `+` is a sign
                        Some('+') => Ok(Some(Token::Lt)),
                        Some('-') if self.dialect.supports_geometric_types() => {
                            if chars.peekable.clone().nth(1) == Some('>') {
                                chars.next(); // consume the '-'
                                self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
                            } else {
                                Ok(Some(Token::Lt))
                            }
                        }
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
                        }
                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
                        _ => self.start_binop(chars, "<", Token::Lt),
                    }
                }
                '>' => {
                    chars.next(); // consume the '>'
                    match chars.peek() {
                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
                        }
                        _ => self.start_binop(chars, ">", Token::Gt),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::Assignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => {
                    chars.next(); // consume the '&'
                    match chars.peek() {
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
                        }
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '<'
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::AmpersandLeftAngleBracketVerticalBar,
                                ),
                                _ => {
                                    self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
                                }
                            }
                        }
                        Some('&') => {
                            chars.next(); // consume the second '&'
                            self.start_binop(chars, "&&", Token::Overlap)
                        }
                        // a regular '&' operator
                        _ => self.start_binop(chars, "&", Token::Ampersand),
                    }
                }
                '^' => {
                    chars.next(); // consume the '^'
                    match chars.peek() {
                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
                        _ => Ok(Some(Token::Caret)),
                    }
                }
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
                {
                    chars.next(); // consume the '#', beginning a single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); // consume the '~'
                    match chars.peek() {
                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
                        Some('=') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "~=", Token::TildeEqual)
                        }
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => {
                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
                                }
                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
                            }
                        }
                        _ => self.start_binop(chars, "~", Token::Tilde),
                    }
                }
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
                                }
                                _ => self.start_binop(chars, "#>", Token::HashArrow),
                            }
                        }
                        Some(' ') => Ok(Some(Token::Sharp)),
                        Some('#') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "##", Token::DoubleSharp)
                        }
                        Some(sch) if self.dialect.is_identifier_start('#') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "#", Token::Sharp),
                    }
                }
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('@') if self.dialect.supports_geometric_types() => {
                            self.consume_and_return(chars, Token::AtAt)
                        }
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            match chars.peek() {
                                Some('@') => self.consume_and_return(chars, Token::AtDashAt),
                                // `@-` alone is not a recognized operator
                                _ => self.start_binop_opt(chars, "@-", None),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => {
                            chars.next();
                            match chars.peek() {
                                Some(' ') => Ok(Some(Token::AtAt)),
                                Some(tch) if self.dialect.is_identifier_start('@') => {
                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
                                }
                                _ => Ok(Some(Token::AtAt)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::AtSign)),
                        // when a quote follows, emit `@` on its own so the
                        // quoted token is tokenized separately
                        Some('\'') => Ok(Some(Token::AtSign)),
                        Some('\"') => Ok(Some(Token::AtSign)),
                        Some('`') => Ok(Some(Token::AtSign)),
                        Some(sch) if self.dialect.is_identifier_start('@') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                '?' if self.dialect.supports_geometric_types() => {
                    chars.next(); // consume the '?'
                    match chars.peek() {
                        Some('|') => {
                            chars.next();
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::QuestionMarkDoubleVerticalBar,
                                ),
                                _ => Ok(Some(Token::QuestionPipe)),
                            }
                        }

                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
                        Some('-') => {
                            chars.next(); // consume the '-'
                            match chars.peek() {
                                Some('|') => self
                                    .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
                                _ => Ok(Some(Token::QuestionMarkDash)),
                            }
                        }
                        Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
                        _ => Ok(Some(Token::Question)),
                    }
                }
                '?' => {
                    chars.next();
                    // `?` optionally followed by digits is a placeholder
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(format!("?{s}"))))
                }

                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    self.tokenize_identifier_or_keyword([ch], chars)
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                // whitespace check (including unicode chars) should be last
                // as it covers some of the chars above
                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    /// Consume the character just peeked, then continue collecting a
    /// (possibly custom) binary operator starting with `prefix`.
    fn consume_for_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the peeked char
        self.start_binop_opt(chars, prefix, Some(default))
    }

    fn start_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        self.start_binop_opt(chars, prefix, Some(default))
    }

    /// Extend `prefix` with any further custom-operator characters the
    /// dialect allows; fall back to `default` when none follow.
    fn start_binop_opt(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Option<Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut custom = None;
        while let Some(&ch) = chars.peek() {
            if !self.dialect.is_custom_operator_part(ch) {
                break;
            }

            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
            chars.next();
        }
        match (custom, default) {
            (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
            (None, Some(tok)) => Ok(Some(tok)),
            (None, None) => self.tokenizer_error(
                chars.location(),
                format!("Expected a valid binary operator after '{prefix}'"),
            ),
        }
    }

    /// Tokenize a dollar-preceded value (a dollar-quoted string or a
    /// placeholder).
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        chars.next();

        // `$$...$$`: an untagged dollar-quoted string
        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Collect a tag (for `$tag$...$tag$`) or a placeholder name.
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric()
                    || ch == '_'
                    // allow $ as a placeholder character if the dialect
                    // supports it
                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
            }));

            // When `$` is a money prefix in this dialect, `$12.34` is
            // emitted as a placeholder token carrying the full literal.
            if matches!(chars.peek(), Some('.'))
                && self.dialect.supports_dollar_as_money_prefix()
                && !value.is_empty()
                && value.chars().all(|c| c.is_ascii_digit())
            {
                value.push('.');
                chars.next();
                value.push_str(&peeking_take_while(chars, |ch| ch.is_ascii_digit()));
                return Ok(Token::Placeholder(format!("${value}")));
            }

            // `$tag$...$tag$`: a tagged dollar-quoted string
            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
                chars.next();

                let mut temp = String::new();
                let end_delimiter = format!("${value}$");

                loop {
                    match chars.next() {
                        Some(ch) => {
                            temp.push(ch);

                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }
                        }
                        None => {
                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }

                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted, expected $",
                            );
                        }
                    }
                }
            } else {
                return Ok(Token::Placeholder(format!("${value}")));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }

    fn tokenizer_error<R>(
        &self,
        loc: Location,
        message: impl Into<String>,
    ) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            location: loc,
        })
    }

    // Consume characters until newline
    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
        let mut comment = peeking_take_while(chars, |ch| match ch {
            '\n' => false,                                           // always stop at \n
            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // stop at \r for Postgres
            _ => true,
        });

        // include the trailing newline character in the comment, if any
        if let Some(ch) = chars.next() {
            assert!(ch == '\n' || ch == '\r');
            comment.push(ch);
        }

        comment
    }

    /// Tokenize an identifier or keyword, after the first char is
    /// already consumed.
    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
        let mut s = first_chars.into();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

    /// Read a quoted identifier, consuming both delimiters.
    fn tokenize_quoted_identifier(
        &self,
        quote_start: char,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        let error_loc = chars.location();
        chars.next(); // consume the opening quote
        let quote_end = Word::matching_end_quote(quote_start);
        let (s, last_char) = self.parse_quoted_ident(chars, quote_end);

        if last_char == Some(quote_end) {
            Ok(s)
        } else {
            self.tokenizer_error(
                error_loc,
                format!("Expected close delimiter '{quote_end}' before EOF."),
            )
        }
    }

    fn tokenize_escaped_single_quoted_string(
        &self,
        starting_loc: Location,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        if let Some(s) = unescape_single_quoted_string(chars) {
            return Ok(s);
        }

        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
    }

    /// Read a string literal quoted by one or three quote characters,
    /// dispatching to `single_quote_token` or `triple_quote_token`.
    fn tokenize_single_or_triple_quoted_string<F>(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
        single_quote_token: F,
        triple_quote_token: F,
    ) -> Result<Option<Token>, TokenizerError>
    where
        F: Fn(String) -> Token,
    {
        let error_loc = chars.location();

        // count up to three opening quote characters
        let mut num_opening_quotes = 0u8;
        for _ in 0..3 {
            if Some(&quote_style) == chars.peek() {
                chars.next(); // consume the quote
                num_opening_quotes += 1;
            } else {
                break;
            }
        }

        let (token_fn, num_quote_chars) = match num_opening_quotes {
            1 => (single_quote_token, NumStringQuoteChars::One),
            2 => {
                // exactly two quotes is an empty single-quoted string
                return Ok(Some(single_quote_token("".into())));
            }
            3 => {
                let Some(num_quote_chars) = NonZeroU8::new(3) else {
                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
                };
                (
                    triple_quote_token,
                    NumStringQuoteChars::Many(num_quote_chars),
                )
            }
            _ => {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        };

        let settings = TokenizeQuotedStringSettings {
            quote_style,
            num_quote_chars,
            // the opening quotes were already consumed above
            num_opening_quotes_to_consume: 0,
            backslash_escape,
        };

        self.tokenize_quoted_string(chars, settings)
            .map(token_fn)
            .map(Some)
    }

    fn tokenize_single_quoted_string(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
    ) -> Result<String, TokenizerError> {
        self.tokenize_quoted_string(
            chars,
            TokenizeQuotedStringSettings {
                quote_style,
                num_quote_chars: NumStringQuoteChars::One,
                num_opening_quotes_to_consume: 1,
                backslash_escape,
            },
        )
    }

    /// Read a quote-delimited string literal such as `q'[...]'`, with
    /// `chars` positioned at the quote that follows `literal_prefix`.
    fn tokenize_quote_delimited_string(
        &self,
        chars: &mut State,
        literal_prefix: &[char],
    ) -> Result<QuoteDelimitedString, TokenizerError> {
        let literal_start_loc = chars.location();
        chars.next();

        // The character right after the opening `'` selects the delimiter
        // pair: brackets pair up, anything else closes with itself.
        let start_quote_loc = chars.location();
        let (start_quote, end_quote) = match chars.next() {
            None | Some(' ') | Some('\t') | Some('\r') | Some('\n') => {
                return self.tokenizer_error(
                    start_quote_loc,
                    format!(
                        "Invalid space, tab, newline, or EOF after '{}''",
                        String::from_iter(literal_prefix)
                    ),
                );
            }
            Some(c) => (
                c,
                match c {
                    '[' => ']',
                    '{' => '}',
                    '<' => '>',
                    '(' => ')',
                    c => c,
                },
            ),
        };

        let mut value = String::new();
        while let Some(ch) = chars.next() {
            if ch == end_quote {
                // the literal ends only at the end delimiter followed by `'`
                if let Some('\'') = chars.peek() {
                    chars.next(); // consume the closing single quote
                    return Ok(QuoteDelimitedString {
                        start_quote,
                        value,
                        end_quote,
                    });
                }
            }
            value.push(ch);
        }

        self.tokenizer_error(literal_start_loc, "Unterminated string literal")
    }

    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        // consume any opening quotes the caller has not consumed yet
        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume
                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // For triple-quoted strings the preceding quote
                        // chars were already pushed onto `s`; drop them
                        // from the end before returning.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to
                            // be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    chars.next(); // consume the backslash

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape
                            || (self.dialect.ignores_wildcard_escapes()
                                && (*next == '%' || *next == '_'))
                        {
                            // In no-escape mode, the given query has to
                            // be saved completely, including backslashes.
                            s.push(ch);
                            s.push(*next);
                            chars.next(); // consume the next char
                        } else {
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next(); // consume the next char
                        }
                    }
                }
                ch => {
                    chars.next(); // consume
                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }

    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
        let supports_nested_comments = self.dialect.supports_nested_comments();
        loop {
            match chars.next() {
                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
                    chars.next(); // consume the '*'
                    s.push('/');
                    s.push('*');
                    nested += 1;
                }
                Some('*') if matches!(chars.peek(), Some('/')) => {
                    chars.next(); // consume the '/'
                    nested -= 1;
                    if nested == 0 {
                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                    }
                    s.push('*');
                    s.push('/');
                }
                Some(ch) => {
                    s.push(ch);
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    );
                }
            }
        }
    }

    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
        let mut last_char = None;
        let mut s = String::new();
        while let Some(ch) = chars.next() {
            if ch == quote_end {
                // a doubled end-quote is an escaped quote character
                if chars.peek() == Some(&quote_end) {
                    chars.next();
                    s.push(ch);
                    if !self.unescape {
                        // In no-escape mode, the given query has to be
                        // saved completely
                        s.push(ch);
                    }
                } else {
                    last_char = Some(quote_end);
                    break;
                }
            } else {
                s.push(ch);
            }
        }
        (s, last_char)
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut State,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

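/// Read from `chars` until `predicate` returns false or EOF is hit;
/// return the characters read as a String, leaving the first
/// non-matching char available via `chars.next()`.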
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

/// Same as [`peeking_take_while`], but the predicate also sees the
/// character after the current one.
fn peeking_next_take_while(
    chars: &mut State,
    mut predicate: impl FnMut(char, Option<char>) -> bool,
) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        let next_char = chars.peekable.clone().nth(1);
        if predicate(ch, next_char) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
    Unescape::new(chars).unescape()
}

struct Unescape<'a: 'b, 'b> {
    chars: &'b mut State<'a>,
}

impl<'a: 'b, 'b> Unescape<'a, 'b> {
    fn new(chars: &'b mut State<'a>) -> Self {
        Self { chars }
    }

    fn unescape(mut self) -> Option<String> {
        let mut unescaped = String::new();

        self.chars.next(); // consume the opening quote

        while let Some(c) = self.chars.next() {
            if c == '\'' {
                // a doubled quote is an escaped quote; a single one
                // terminates the string
                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                    self.chars.next();
                    unescaped.push('\'');
                    continue;
                }
                return Some(unescaped);
            }

            if c != '\\' {
                unescaped.push(c);
                continue;
            }

            // handle backslash escape sequences
            let c = match self.chars.next()? {
                'b' => '\u{0008}',
                'f' => '\u{000C}',
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                'u' => self.unescape_unicode_16()?,
                'U' => self.unescape_unicode_32()?,
                'x' => self.unescape_hex()?,
                c if c.is_digit(8) => self.unescape_octal(c)?,
                c => c,
            };

            unescaped.push(Self::check_null(c)?);
        }

        None
    }

2490
2491 #[inline]
2492 fn check_null(c: char) -> Option<char> {
2493 if c == '\0' {
2494 None
2495 } else {
2496 Some(c)
2497 }
2498 }
2499
2500 #[inline]
2501 fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
2502 match u32::from_str_radix(s, RADIX) {
2504 Err(_) => None,
2505 Ok(n) => {
2506 let n = n & 0xFF;
2507 if n <= 127 {
2508 char::from_u32(n)
2509 } else {
2510 None
2511 }
2512 }
2513 }
2514 }
2515
    fn unescape_hex(&mut self) -> Option<char> {
        let mut s = String::new();

        for _ in 0..2 {
            match self.next_hex_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        if s.is_empty() {
            return Some('x');
        }

        Self::byte_to_char::<16>(&s)
    }

    #[inline]
    fn next_hex_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
            _ => None,
        }
    }

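    /// Handles one- to three-digit octal escapes, `c` being the octal digit
    /// already consumed by the caller; at most two more digits are taken,
    /// so `'\1232'` parses as `\123` (`S`) followed by a literal `2`.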
    fn unescape_octal(&mut self, c: char) -> Option<char> {
        let mut s = String::new();

        s.push(c);
        for _ in 0..2 {
            match self.next_octal_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        Self::byte_to_char::<8>(&s)
    }

    #[inline]
    fn next_octal_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_digit(8) => self.chars.next(),
            _ => None,
        }
    }

    fn unescape_unicode_16(&mut self) -> Option<char> {
        self.unescape_unicode::<4>()
    }

    fn unescape_unicode_32(&mut self) -> Option<char> {
        self.unescape_unicode::<8>()
    }

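    /// Reads exactly `NUM` hex digits and converts them to a `char` via
    /// `char::from_u32`, so any Unicode scalar value is accepted (e.g.
    /// `\u4c91`, `\U0010FFFF`); too few digits or an invalid code point
    /// yields `None`.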
    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
        let mut s = String::new();
        for _ in 0..NUM {
            s.push(self.chars.next()?);
        }
        match u32::from_str_radix(&s, 16) {
            Err(_) => None,
            Ok(n) => char::from_u32(n),
        }
    }
}

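/// Unescapes a PostgreSQL-style `U&'...'` literal: `\XXXX` encodes a
/// 4-hex-digit code point, `\+XXXXXX` a 6-digit one, `\\` a backslash, and
/// `''` a quote. For example, the payload `d\0061ta` unescapes to `data`.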
fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
    let mut unescaped = String::new();
    chars.next(); // consume the opening quote
    while let Some(c) = chars.next() {
        match c {
            '\'' => {
                if chars.peek() == Some(&'\'') {
                    chars.next();
                    unescaped.push('\'');
                } else {
                    return Ok(unescaped);
                }
            }
            '\\' => match chars.peek() {
                Some('\\') => {
                    chars.next();
                    unescaped.push('\\');
                }
                Some('+') => {
                    chars.next();
                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
                }
                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
            },
            _ => {
                unescaped.push(c);
            }
        }
    }
    Err(TokenizerError {
        message: "Unterminated unicode encoded string literal".to_string(),
        location: chars.location(),
    })
}

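/// Consumes exactly `max_digits` characters, folding them as hex digits
/// into a code point, and reports a `TokenizerError` (with the current
/// location) on EOF, a non-hex character, or an invalid scalar value.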
fn take_char_from_hex_digits(
    chars: &mut State<'_>,
    max_digits: usize,
) -> Result<char, TokenizerError> {
    let mut result = 0u32;
    for _ in 0..max_digits {
        let next_char = chars.next().ok_or_else(|| TokenizerError {
            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
                .to_string(),
            location: chars.location(),
        })?;
        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
            message: format!("Invalid hex digit in escaped unicode string: {next_char}"),
            location: chars.location(),
        })?;
        result = result * 16 + digit;
    }
    char::from_u32(result).ok_or_else(|| TokenizerError {
        message: format!("Invalid unicode character: {result:x}"),
        location: chars.location(),
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dialect::{
        BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect,
        PostgreSqlDialect, SQLiteDialect,
    };
    use crate::test_utils::{all_dialects, all_dialects_except, all_dialects_where};
    use core::fmt::Debug;

    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        {
            use core::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_with_mapper() {
        let sql = String::from("SELECT ?");
        let dialect = GenericDialect {};
        let mut param_num = 1;

        let mut tokens = vec![];
        Tokenizer::new(&dialect, &sql)
            .tokenize_with_location_into_buf_with_mapper(&mut tokens, |mut token_span| {
                token_span.token = match token_span.token {
                    Token::Placeholder(n) => Token::Placeholder(if n == "?" {
                        let ret = format!("${}", param_num);
                        param_num += 1;
                        ret
                    } else {
                        n
                    }),
                    token => token,
                };
                token_span
            })
            .unwrap();
        let actual = tokens.into_iter().map(|t| t.token).collect();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Placeholder("$1".to_string()),
        ];

        compare(expected, actual);
    }

    #[test]
    fn tokenize_clickhouse_double_equal() {
        let sql = String::from("SELECT foo=='1'");
        let dialect = ClickHouseDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "foo".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::DoubleEq,
            Token::SingleQuotedString("1".to_string()),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_numeric_literal_underscore() {
        let dialect = GenericDialect {};
        let sql = String::from("SELECT 10_000");
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("10".to_string(), false),
            Token::make_word("_000", None),
        ];
        compare(expected, tokens);

        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
            "SELECT 10_000, _10_000, 10_00_, 10___0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_000".to_string(), false),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::make_word("_10_000", None),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_00".to_string(), false),
                Token::make_word("_", None),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10".to_string(), false),
                Token::make_word("___0", None),
            ],
        );
    }

    #[test]
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }

    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged() {
        let test_cases = vec![
            (
                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
                        tag: Some("tag".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$x$ab$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "x$ab".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("0$abc$$abc$1"),
                vec![
                    Token::Number("0".into(), false),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    }),
                    Token::Number("1".into(), false),
                ]
            ),
            (
                String::from("$function$abc$q$data$q$$function$"),
                vec![
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "abc$q$data$q$".into(),
                        tag: Some("function".into()),
                    }),
                ]
            ),
        ];

        let dialect = GenericDialect {};
        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated() {
        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 91
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
        let sql = String::from("SELECT $abc$abc$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 17
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_placeholder() {
        let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
        let dialect = SQLiteDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$ABC$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC".into()),
            ]
        );
    }

    #[test]
    fn tokenize_nested_dollar_quoted_strings() {
        let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "dollar $nested$ string".into(),
                tag: Some("tag".into()),
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_empty() {
        let sql = String::from("SELECT $$$$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged() {
        let sql =
            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_unterminated() {
        let sql = String::from(
            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
        );
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted string".into(),
                location: Location {
                    line: 1,
                    column: 86
                }
            })
        );
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let test_cases = vec![
            (
                String::from("0--this is a comment\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                String::from("0--this is a comment\r1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r1".to_string(),
                    }),
                ],
            ),
            (
                String::from("0--this is a comment\r\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
        ];

        let dialect = GenericDialect {};

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_comment_postgres() {
        let sql = String::from("1--\r0");

        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "\r".to_string(),
            }),
            Token::Number("0".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_nested_multiline_comment() {
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
                )),
                Token::Whitespace(Whitespace::Space),
                Token::Div,
                Token::Word(Word {
                    value: "comment".to_string(),
                    quote_style: None,
                    keyword: Keyword::COMMENT,
                }),
                Token::Mul,
                Token::Div,
                Token::Number("1".to_string(), false),
            ],
        );

        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
                )),
                Token::Number("1".to_string(), false),
            ],
        );

        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "SELECT 1/* a /* b */ c */0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
                Token::Number("0".to_string(), false),
            ],
        );
    }

    #[test]
    fn tokenize_nested_multiline_comment_empty() {
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "select 1/*/**/*/0",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
                Token::Number("0".to_string(), false),
            ],
        );
    }

    #[test]
    fn tokenize_nested_comments_if_not_supported() {
        all_dialects_except(|d| d.supports_nested_comments()).tokenizes_to(
            "SELECT 1/*/* nested comment */*/0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "/* nested comment ".to_string(),
                )),
                Token::Mul,
                Token::Div,
                Token::Number("0".to_string(), false),
            ],
        );
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                location: Location { line: 1, column: 1 },
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_like_match() {
        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_snowflake_div() {
        let sql = r#"field/1000"#;
        let dialect = SnowflakeDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word(r#"field"#, None),
            Token::Div,
            Token::Number("1000".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (1, 7).into(),
                (1, 8).into(),
            ),
            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Newline),
                (1, 10).into(),
                (2, 1).into(),
            ),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (2, 1).into(),
                (2, 2).into(),
            ),
            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
        ];
        compare(expected, tokens);
    }

    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        assert_eq!(expected, actual);
    }

    fn check_unescape(s: &str, expected: Option<&str>) {
        let s = format!("'{s}'");
        let mut state = State {
            peekable: s.chars().peekable(),
            line: 0,
            col: 0,
        };

        assert_eq!(
            unescape_single_quoted_string(&mut state),
            expected.map(|s| s.to_string())
        );
    }

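    // Sanity check for the peeking helpers; the `State` literal mirrors the
    // one in `check_unescape`.
    #[test]
    fn peeking_take_while_stops_at_predicate_boundary() {
        let s = "abc123";
        let mut state = State {
            peekable: s.chars().peekable(),
            line: 0,
            col: 0,
        };
        assert_eq!(
            peeking_take_while(&mut state, |ch| ch.is_ascii_alphabetic()),
            "abc"
        );
        assert_eq!(
            peeking_take_while(&mut state, |ch| ch.is_ascii_digit()),
            "123"
        );
    }
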
    #[test]
    fn test_unescape() {
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

        // unicode escapes
        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None);
        check_unescape(r"\u0000", None);
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
        check_unescape(r"\U00110000", None);
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

        // hex escapes
        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}"));
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x"));
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

        // octal escapes
        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}"));
        check_unescape(r"\1232", Some("\u{0053}2"));
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None);
        check_unescape(r"\080", None);

        // non-escape characters, doubled quotes, and mixed inputs
        check_unescape(r"\9", Some("9"));
        check_unescape(r"''", Some("'"));
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }

    #[test]
    fn tokenize_numeric_prefix_trait() {
        #[derive(Debug)]
        struct NumericPrefixDialect;

        impl Dialect for NumericPrefixDialect {
            fn is_identifier_start(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '$'
            }

            fn is_identifier_part(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '_'
                    || ch == '$'
                    || ch == '{'
                    || ch == '}'
            }

            fn supports_numeric_prefix(&self) -> bool {
                true
            }
        }

        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
        tokenize_numeric_prefix_inner(&HiveDialect {});
        tokenize_numeric_prefix_inner(&MySqlDialect {});
    }

    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
        let sql = r#"SELECT * FROM 1"#;
        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_string_escape() {
        let dialect = SnowflakeDialect {};
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
            (r#"'\\'"#, r#"\\"#, r#"\"#),
            (
                r#"'\0\a\b\f\n\r\t\Z'"#,
                r#"\0\a\b\f\n\r\t\Z"#,
                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
            ),
            (r#"'\"'"#, r#"\""#, "\""),
            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
            (r#"'\q'"#, r#"\q"#, r#"q"#),
            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
        ] {
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected.to_string())];
            compare(expected, tokens);

            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }

        // GenericDialect does not treat backslash as an escape character
        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
            let dialect = GenericDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }

        // MySQL keeps `\%` and `\_` verbatim for LIKE pattern matching
        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
            let dialect = MySqlDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_triple_quoted_string() {
        fn check<F>(q: char, r: char, quote_token: F)
        where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            for (sql, expected, expected_unescaped) in [
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Snowflake: six quotes are one single-quoted string holding two quotes
        let dialect = SnowflakeDialect {};
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn test_mysql_users_grantees() {
        let dialect = MySqlDialect {};

        let sql = "CREATE USER `root`@`%`";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("CREATE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("USER"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("root", Some('`')),
            Token::AtSign,
            Token::make_word("%", Some('`')),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_string_literal() {
        let dialect = MySqlDialect {};

        let sql = "SELECT @'1'";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::SingleQuotedString("1".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_quoted_column() {
        let dialect = MySqlDialect {};

        let sql = r#"SELECT @"bar" FROM foo"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::DoubleQuotedString("bar".to_string()),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_national_strings_backslash_escape_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\'",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("''\\".to_string()),
                ],
            );
    }

    #[test]
    fn test_national_strings_backslash_escape_supported() {
        all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\''",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("'''".to_string()),
                ],
            );
    }

    #[test]
    fn test_string_escape_constant_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("e", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );

        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("E", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );
    }

    #[test]
    fn test_string_escape_constant_supported() {
        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );
    }

    #[test]
    fn test_whitespace_required_after_single_line_comment() {
        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                    Token::SingleQuotedString("abc".to_string()),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                ],
            );

        all_dialects_where(|d| d.requires_single_line_comment_whitespace()).tokenizes_to(
            "--\n-- Table structure for table...\n--\n",
            vec![
                Token::Whitespace(Whitespace::SingleLineComment {
                    prefix: "--".to_string(),
                    comment: "\n".to_string(),
                }),
                Token::Whitespace(Whitespace::SingleLineComment {
                    prefix: "--".to_string(),
                    comment: " Table structure for table...\n".to_string(),
                }),
                Token::Whitespace(Whitespace::SingleLineComment {
                    prefix: "--".to_string(),
                    comment: "\n".to_string(),
                }),
            ],
        );
    }

    #[test]
    fn test_whitespace_not_required_after_single_line_comment() {
        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "".to_string(),
                    }),
                ],
            );
    }

    #[test]
    fn test_tokenize_identifiers_numeric_prefix() {
        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        );
    }

    #[test]
    fn tokenize_period_underscore() {
        let sql = String::from("SELECT table._col");
        // a dot followed by an `_` should not be tokenized as a number
        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "table".to_string(),
                quote_style: None,
                keyword: Keyword::TABLE,
            }),
            Token::Period,
            Token::Word(Word {
                value: "_col".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
        ];

        compare(expected, tokens);

        let sql = String::from("SELECT ._123");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }

        let sql = String::from("SELECT ._abc");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }
    }

    #[test]
    fn tokenize_question_mark() {
        let dialect = PostgreSqlDialect {};
        let sql = "SELECT x ? y";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        compare(
            tokens,
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("x", None),
                Token::Whitespace(Whitespace::Space),
                Token::Question,
                Token::Whitespace(Whitespace::Space),
                Token::make_word("y", None),
            ],
        );
    }

    #[test]
    fn tokenize_multiline_comment_with_comment_hint() {
        let sql = String::from("0/*! word */1");

        let dialect = MySqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "word".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::Whitespace(Whitespace::Space),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment_with_comment_hint_and_version() {
        let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1");
        let dialect = MySqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "KEY_BLOCK_SIZE".to_string(),
                quote_style: None,
                keyword: Keyword::KEY_BLOCK_SIZE,
            }),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number("1024".to_string(), false),
            Token::Whitespace(Whitespace::Space),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);

        let tokens = Tokenizer::new(&dialect, "0 /*!50110 */ 1")
            .tokenize()
            .unwrap();
        compare(
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
            ],
            tokens,
        );

        let tokens = Tokenizer::new(&dialect, "0 /*!*/ 1").tokenize().unwrap();
        compare(
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
            ],
            tokens,
        );
        let tokens = Tokenizer::new(&dialect, "0 /*! */ 1").tokenize().unwrap();
        compare(
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
            ],
            tokens,
        );
    }

    #[test]
    fn tokenize_lt() {
        all_dialects().tokenizes_to(
            "select a <-50",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("a", None),
                Token::Whitespace(Whitespace::Space),
                Token::Lt,
                Token::Minus,
                Token::Number("50".to_string(), false),
            ],
        );
        all_dialects().tokenizes_to(
            "select a <+50",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("a", None),
                Token::Whitespace(Whitespace::Space),
                Token::Lt,
                Token::Plus,
                Token::Number("50".to_string(), false),
            ],
        );
        all_dialects().tokenizes_to(
            "select a <=-50",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("a", None),
                Token::Whitespace(Whitespace::Space),
                Token::LtEq,
                Token::Minus,
                Token::Number("50".to_string(), false),
            ],
        );
        all_dialects().tokenizes_to(
            "select a <=+50",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("a", None),
                Token::Whitespace(Whitespace::Space),
                Token::LtEq,
                Token::Plus,
                Token::Number("50".to_string(), false),
            ],
        );
        all_dialects_where(|d| d.supports_geometric_types()).tokenizes_to(
            "select a <->b",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("a", None),
                Token::Whitespace(Whitespace::Space),
                Token::TwoWayArrow,
                Token::make_word("b", None),
            ],
        );

        all_dialects().tokenizes_to(
            "select a <-b",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("a", None),
                Token::Whitespace(Whitespace::Space),
                Token::Lt,
                Token::Minus,
                Token::make_word("b", None),
            ],
        );
        all_dialects().tokenizes_to(
            "select a <+b",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("a", None),
                Token::Whitespace(Whitespace::Space),
                Token::Lt,
                Token::Plus,
                Token::make_word("b", None),
            ],
        );
    }
}
4530}