1#[cfg(not(feature = "std"))]
25use alloc::{
26 borrow::ToOwned,
27 format,
28 string::{String, ToString},
29 vec,
30 vec::Vec,
31};
32use core::fmt;
33use core::iter::Peekable;
34use core::num::NonZeroU8;
35use core::str::Chars;
36
37#[cfg(feature = "serde")]
38use serde::{Deserialize, Serialize};
39
40#[cfg(feature = "visitor")]
41use sqltk_parser_derive::{Visit, VisitMut};
42
43use crate::ast::DollarQuotedString;
44use crate::dialect::Dialect;
45use crate::dialect::{
46 BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
47 SnowflakeDialect,
48};
49use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
50
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
/// A single SQL token produced by the [`Tokenizer`].
///
/// The `Display` impl below renders each token back to its source text,
/// so the variant docs show the textual form of every token.
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the `bool` flags a trailing `L` suffix
    Number(String, bool),
    /// A character that could not form any other token
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted string: i.e: '''string'''
    TripleSingleQuotedString(String),
    /// Triple double quoted string: i.e: """string"""
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag$string$tag$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: B'string'
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Triple single quoted byte string literal: i.e: B'''string'''
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted byte string literal: i.e: B"""string"""
    TripleDoubleQuotedByteStringLiteral(String),
    /// Raw string literal: i.e: R'string'
    SingleQuotedRawStringLiteral(String),
    /// Raw string literal: i.e: R"string"
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted raw string literal: i.e: R'''string'''
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted raw string literal: i.e: R"""string"""
    TripleDoubleQuotedRawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// "escaped" string literal: i.e: E'string'
    EscapedStringLiteral(String),
    /// Unicode string literal: i.e: U&'string'
    UnicodeStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma `,`
    Comma,
    /// Whitespace (space, tab, newline) or a comment
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not-equals operator `<>`
    Neq,
    /// Less-than operator `<`
    Lt,
    /// Greater-than operator `>`
    Gt,
    /// Less-than-or-equals operator `<=`
    LtEq,
    /// Greater-than-or-equals operator `>=`
    GtEq,
    /// Spaceship operator `<=>`
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// `//` operator (emitted for DuckDB and generic dialects)
    DuckIntDiv,
    /// Modulo operator `%`
    Mod,
    /// String concatenation operator `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period `.`
    Period,
    /// Colon `:`
    Colon,
    /// Double colon `::`
    DoubleColon,
    /// Assignment `:=`
    Assignment,
    /// SemiColon `;`
    SemiColon,
    /// Backslash `\`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right arrow `=>`
    RArrow,
    /// Sharp `#`
    Sharp,
    /// Tilde `~`
    Tilde,
    /// `~*`
    TildeAsterisk,
    /// `!~`
    ExclamationMarkTilde,
    /// `!~*`
    ExclamationMarkTildeAsterisk,
    /// `~~`
    DoubleTilde,
    /// `~~*`
    DoubleTildeAsterisk,
    /// `!~~`
    ExclamationMarkDoubleTilde,
    /// `!~~*`
    ExclamationMarkDoubleTildeAsterisk,
    /// `<<`
    ShiftLeft,
    /// `>>`
    ShiftRight,
    /// `&&`
    Overlap,
    /// Exclamation mark `!`
    ExclamationMark,
    /// Double exclamation mark `!!`
    DoubleExclamationMark,
    /// AtSign `@`
    AtSign,
    /// `^@`
    CaretAt,
    /// `|/`
    PGSquareRoot,
    /// `||/`
    PGCubeRoot,
    /// A placeholder such as `?` or `$1`
    Placeholder(String),
    /// `->`
    Arrow,
    /// `->>`
    LongArrow,
    /// `#>`
    HashArrow,
    /// `#>>`
    HashLongArrow,
    /// `@>`
    AtArrow,
    /// `<@`
    ArrowAt,
    /// `#-`
    HashMinus,
    /// `@?`
    AtQuestion,
    /// `@@`
    AtAt,
    /// `?`
    Question,
    /// `?&`
    QuestionAnd,
    /// `?|`
    QuestionPipe,
    /// An operator assembled from the dialect's custom-operator characters
    /// (see `Tokenizer::start_binop`), e.g. a user-defined operator
    CustomBinaryOperator(String),
}
247
248impl fmt::Display for Token {
249 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
250 match self {
251 Token::EOF => f.write_str("EOF"),
252 Token::Word(ref w) => write!(f, "{w}"),
253 Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
254 Token::Char(ref c) => write!(f, "{c}"),
255 Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
256 Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
257 Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
258 Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
259 Token::DollarQuotedString(ref s) => write!(f, "{s}"),
260 Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
261 Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
262 Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
263 Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
264 Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
265 Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
266 Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
267 Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
268 Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
269 Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
270 Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
271 Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
272 Token::Comma => f.write_str(","),
273 Token::Whitespace(ws) => write!(f, "{ws}"),
274 Token::DoubleEq => f.write_str("=="),
275 Token::Spaceship => f.write_str("<=>"),
276 Token::Eq => f.write_str("="),
277 Token::Neq => f.write_str("<>"),
278 Token::Lt => f.write_str("<"),
279 Token::Gt => f.write_str(">"),
280 Token::LtEq => f.write_str("<="),
281 Token::GtEq => f.write_str(">="),
282 Token::Plus => f.write_str("+"),
283 Token::Minus => f.write_str("-"),
284 Token::Mul => f.write_str("*"),
285 Token::Div => f.write_str("/"),
286 Token::DuckIntDiv => f.write_str("//"),
287 Token::StringConcat => f.write_str("||"),
288 Token::Mod => f.write_str("%"),
289 Token::LParen => f.write_str("("),
290 Token::RParen => f.write_str(")"),
291 Token::Period => f.write_str("."),
292 Token::Colon => f.write_str(":"),
293 Token::DoubleColon => f.write_str("::"),
294 Token::Assignment => f.write_str(":="),
295 Token::SemiColon => f.write_str(";"),
296 Token::Backslash => f.write_str("\\"),
297 Token::LBracket => f.write_str("["),
298 Token::RBracket => f.write_str("]"),
299 Token::Ampersand => f.write_str("&"),
300 Token::Caret => f.write_str("^"),
301 Token::Pipe => f.write_str("|"),
302 Token::LBrace => f.write_str("{"),
303 Token::RBrace => f.write_str("}"),
304 Token::RArrow => f.write_str("=>"),
305 Token::Sharp => f.write_str("#"),
306 Token::ExclamationMark => f.write_str("!"),
307 Token::DoubleExclamationMark => f.write_str("!!"),
308 Token::Tilde => f.write_str("~"),
309 Token::TildeAsterisk => f.write_str("~*"),
310 Token::ExclamationMarkTilde => f.write_str("!~"),
311 Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
312 Token::DoubleTilde => f.write_str("~~"),
313 Token::DoubleTildeAsterisk => f.write_str("~~*"),
314 Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
315 Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
316 Token::AtSign => f.write_str("@"),
317 Token::CaretAt => f.write_str("^@"),
318 Token::ShiftLeft => f.write_str("<<"),
319 Token::ShiftRight => f.write_str(">>"),
320 Token::Overlap => f.write_str("&&"),
321 Token::PGSquareRoot => f.write_str("|/"),
322 Token::PGCubeRoot => f.write_str("||/"),
323 Token::Placeholder(ref s) => write!(f, "{s}"),
324 Token::Arrow => write!(f, "->"),
325 Token::LongArrow => write!(f, "->>"),
326 Token::HashArrow => write!(f, "#>"),
327 Token::HashLongArrow => write!(f, "#>>"),
328 Token::AtArrow => write!(f, "@>"),
329 Token::ArrowAt => write!(f, "<@"),
330 Token::HashMinus => write!(f, "#-"),
331 Token::AtQuestion => write!(f, "@?"),
332 Token::AtAt => write!(f, "@@"),
333 Token::Question => write!(f, "?"),
334 Token::QuestionAnd => write!(f, "?&"),
335 Token::QuestionPipe => write!(f, "?|"),
336 Token::CustomBinaryOperator(s) => f.write_str(s),
337 }
338 }
339}
340
341impl Token {
342 pub fn make_keyword(keyword: &str) -> Self {
343 Token::make_word(keyword, None)
344 }
345
346 pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
347 let word_uppercase = word.to_uppercase();
348 Token::Word(Word {
349 value: word.to_string(),
350 quote_style,
351 keyword: if quote_style.is_none() {
352 let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
353 keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
354 } else {
355 Keyword::NoKeyword
356 },
357 })
358 }
359}
360
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
/// A keyword (like SELECT) or an optionally quoted SQL identifier.
pub struct Word {
    /// The word's text, without the enclosing quotes (if any)
    pub value: String,
    /// The opening quote character when the word was quoted — `"`, `[` or
    /// `` ` `` (see `Word::matching_end_quote`); `None` for unquoted words
    pub quote_style: Option<char>,
    /// The keyword this word matches (looked up only for unquoted words,
    /// see `Token::make_word`); `Keyword::NoKeyword` otherwise
    pub keyword: Keyword,
}
377
378impl fmt::Display for Word {
379 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
380 match self.quote_style {
381 Some(s) if s == '"' || s == '[' || s == '`' => {
382 write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
383 }
384 None => f.write_str(&self.value),
385 _ => panic!("Unexpected quote_style!"),
386 }
387 }
388}
389
390impl Word {
391 fn matching_end_quote(ch: char) -> char {
392 match ch {
393 '"' => '"', '[' => ']', '`' => '`', _ => panic!("unexpected quoting style!"),
397 }
398 }
399}
400
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
/// Whitespace and comments, kept as tokens so the stream can be rendered
/// back to text (see the `Display` impl below).
pub enum Whitespace {
    // A single space character
    Space,
    // A single newline
    Newline,
    // A single tab character
    Tab,
    // A comment running to end of line; `prefix` is the introducer
    // (e.g. "--", "//", "#") and `comment` the remaining text
    SingleLineComment { comment: String, prefix: String },
    // The body of a `/* ... */` comment, without the delimiters
    MultiLineComment(String),
}
411
412impl fmt::Display for Whitespace {
413 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
414 match self {
415 Whitespace::Space => f.write_str(" "),
416 Whitespace::Newline => f.write_str("\n"),
417 Whitespace::Tab => f.write_str("\t"),
418 Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
419 Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
420 }
421 }
422}
423
/// A position (line, column) within the tokenized text.
#[derive(Debug, Eq, PartialEq, Clone, Copy)]
pub struct Location {
    /// Line number, starting from 1; line 0 means "no location" and makes
    /// the `Display` impl print nothing (see `TokenWithLocation::wrap`)
    pub line: u64,
    /// Column number, starting from 1
    pub column: u64,
}
432
433impl fmt::Display for Location {
434 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
435 if self.line == 0 {
436 return Ok(());
437 }
438 write!(
439 f,
440 " at Line: {}, Column: {}",
442 self.line, self.column,
443 )
444 }
445}
446
/// A [`Token`] paired with the [`Location`] of its first character.
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct TokenWithLocation {
    pub token: Token,
    pub location: Location,
}
453
454impl TokenWithLocation {
455 pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
456 TokenWithLocation {
457 token,
458 location: Location { line, column },
459 }
460 }
461
462 pub fn wrap(token: Token) -> TokenWithLocation {
463 TokenWithLocation::new(token, 0, 0)
464 }
465}
466
467impl PartialEq<Token> for TokenWithLocation {
468 fn eq(&self, other: &Token) -> bool {
469 &self.token == other
470 }
471}
472
473impl PartialEq<TokenWithLocation> for Token {
474 fn eq(&self, other: &TokenWithLocation) -> bool {
475 self == &other.token
476 }
477}
478
479impl fmt::Display for TokenWithLocation {
480 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
481 self.token.fmt(f)
482 }
483}
484
/// Tokenizer error, carrying the position at which it occurred.
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    /// Human-readable description of the failure
    pub message: String,
    /// Where the error occurred (line 0 renders as "no location")
    pub location: Location,
}
491
492impl fmt::Display for TokenizerError {
493 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
494 write!(f, "{}{}", self.message, self.location,)
495 }
496}
497
// `std::error::Error` lives in `std`, so this impl is only available with
// the `std` feature; `Display`/`Debug` (required by the trait) are above.
#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}
500
/// Cursor over the query text that tracks line/column while characters
/// are consumed (see `State::next`).
struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    /// Current line, starting at 1
    pub line: u64,
    /// Current column, starting at 1
    pub col: u64,
}
506
507impl<'a> State<'a> {
508 pub fn next(&mut self) -> Option<char> {
510 match self.peekable.next() {
511 None => None,
512 Some(s) => {
513 if s == '\n' {
514 self.line += 1;
515 self.col = 1;
516 } else {
517 self.col += 1;
518 }
519 Some(s)
520 }
521 }
522 }
523
524 pub fn peek(&mut self) -> Option<&char> {
526 self.peekable.peek()
527 }
528
529 pub fn location(&self) -> Location {
530 Location {
531 line: self.line,
532 column: self.col,
533 }
534 }
535}
536
/// How many consecutive quote characters delimit a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// One quote character, e.g. `'...'`
    One,
    /// Several quote characters, e.g. the three in `'''...'''`
    Many(NonZeroU8),
}
545
/// Parameters controlling how a quoted string literal is consumed
/// (passed to `tokenize_quoted_string`).
struct TokenizeQuotedStringSettings {
    /// The quote character itself, e.g. `'` or `"`
    quote_style: char,
    /// How many consecutive quote chars delimit the literal
    num_quote_chars: NumStringQuoteChars,
    /// How many opening quote chars are still unconsumed when the quoted
    /// string tokenizer starts (0 when the caller already consumed them,
    /// as in `tokenize_single_or_triple_quoted_string`)
    num_opening_quotes_to_consume: u8,
    /// Whether backslash escape sequences are recognized in the literal
    backslash_escape: bool,
}
562
/// SQL tokenizer: splits a query string into a sequence of [`Token`]s
/// according to the rules of a [`Dialect`].
pub struct Tokenizer<'a> {
    /// Dialect rules (identifier characters, quoting, custom operators, …)
    dialect: &'a dyn Dialect,
    /// The SQL text being tokenized
    query: &'a str,
    /// Whether escape sequences in string literals are decoded
    /// (defaults to `true`; see `Tokenizer::with_unescape`)
    unescape: bool,
}
571
572impl<'a> Tokenizer<'a> {
573 pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
590 Self {
591 dialect,
592 query,
593 unescape: true,
594 }
595 }
596
597 pub fn with_unescape(mut self, unescape: bool) -> Self {
628 self.unescape = unescape;
629 self
630 }
631
632 pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
634 let twl = self.tokenize_with_location()?;
635 Ok(twl.into_iter().map(|t| t.token).collect())
636 }
637
638 pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
640 let mut tokens: Vec<TokenWithLocation> = vec![];
641 self.tokenize_with_location_into_buf(&mut tokens)
642 .map(|_| tokens)
643 }
644
645 pub fn tokenize_with_location_into_buf(
648 &mut self,
649 buf: &mut Vec<TokenWithLocation>,
650 ) -> Result<(), TokenizerError> {
651 let mut state = State {
652 peekable: self.query.chars().peekable(),
653 line: 1,
654 col: 1,
655 };
656
657 let mut location = state.location();
658 while let Some(token) = self.next_token(&mut state)? {
659 buf.push(TokenWithLocation { token, location });
660
661 location = state.location();
662 }
663 Ok(())
664 }
665
666 fn tokenize_identifier_or_keyword(
668 &self,
669 ch: impl IntoIterator<Item = char>,
670 chars: &mut State,
671 ) -> Result<Option<Token>, TokenizerError> {
672 chars.next(); let ch: String = ch.into_iter().collect();
674 let word = self.tokenize_word(ch, chars);
675
676 if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
678 let mut inner_state = State {
679 peekable: word.chars().peekable(),
680 line: 0,
681 col: 0,
682 };
683 let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
684 let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
685 s += s2.as_str();
686 return Ok(Some(Token::Number(s, false)));
687 }
688
689 Ok(Some(Token::make_word(&word, None)))
690 }
691
    /// Read the next token from `chars`, returning `Ok(None)` once the
    /// input is exhausted. Dispatch is driven by peeking one character;
    /// each arm consumes what it recognizes.
    fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                // --- whitespace ------------------------------------------
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    chars.next();
                    // Fold a Windows-style \r\n pair into a single Newline.
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // --- prefixed string literals ----------------------------
                // Byte string literals B'...' / B"..." (BigQuery/generic),
                // with triple-quoted forms when the dialect allows them.
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); match chars.peek() {
                        Some('\'') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '\'',
                                        false,
                                        Token::SingleQuotedByteStringLiteral,
                                        Token::TripleSingleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '"',
                                        false,
                                        Token::DoubleQuotedByteStringLiteral,
                                        Token::TripleDoubleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // No quote follows: `b`/`B` starts a plain word.
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Raw string literals R'...' / R"..." (BigQuery/generic).
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); match chars.peek() {
                        Some('\'') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                false,
                                Token::SingleQuotedRawStringLiteral,
                                Token::TripleSingleQuotedRawStringLiteral,
                            ),
                        Some('\"') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                false,
                                Token::DoubleQuotedRawStringLiteral,
                                Token::TripleDoubleQuotedRawStringLiteral,
                            ),
                        _ => {
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // National string literal: N'...'.
                n @ 'N' | n @ 'n' => {
                    chars.next(); match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // Not a prefixed literal: regular word.
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Escaped string literal: E'...'.
                x @ 'e' | x @ 'E' => {
                    let starting_loc = chars.location();
                    chars.next(); match chars.peek() {
                        Some('\'') => {
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Unicode string literal: U&'...'. Needs two characters of
                // lookahead, hence the cloned peekable.
                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
                    chars.next(); if chars.peek() == Some(&'&') {
                        let mut chars_clone = chars.peekable.clone();
                        chars_clone.next(); if chars_clone.peek() == Some(&'\'') {
                            chars.next(); let s = unescape_unicode_single_quoted_string(chars)?;
                            return Ok(Some(Token::UnicodeStringLiteral(s)));
                        }
                    }
                    let s = self.tokenize_word(x, chars);
                    Ok(Some(Token::make_word(&s, None)))
                }
                // Hex string literal: X'...'.
                x @ 'x' | x @ 'X' => {
                    chars.next(); match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // --- plain quoted strings --------------------------------
                '\'' => {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::SingleQuotedString,
                                Token::TripleSingleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '\'',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // Double quote is a string only when the dialect does not
                // treat `"` as starting a (delimited) identifier.
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::DoubleQuotedString,
                                Token::TripleDoubleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '"',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // --- delimited (quoted) identifiers ----------------------
                quote_start
                    if self.dialect.is_delimited_identifier_start(ch)
                        && self
                            .dialect
                            .is_proper_identifier_inside_quotes(chars.peekable.clone()) =>
                {
                    let error_loc = chars.location();
                    chars.next(); let quote_end = Word::matching_end_quote(quote_start);
                    let (s, last_char) = self.parse_quoted_ident(chars, quote_end);

                    if last_char == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        )
                    }
                }
                // --- numbers ---------------------------------------------
                '0'..='9' | '.' => {
                    let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // `0x...` is tokenized as a hex string literal.
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_take_while(chars, |ch| ch.is_ascii_hexdigit());
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // Optional decimal point plus fraction digits.
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }
                    s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // A lone `.` is the period token, not a number.
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // Scientific notation: accept e/E [+|-] digits. The
                    // lookahead happens on a clone so `chars` only advances
                    // once a digit confirms a real exponent.
                    let mut exponent_part = String::new();
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            Some(&c) if c.is_ascii_digit() => {
                                // Commit: consume what the clone inspected.
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            _ => (),
                        }
                    }

                    // Dialects with numeric-prefixed identifiers: if
                    // identifier characters follow, this was a word (unless
                    // an exponent was already parsed).
                    if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
                        let word =
                            peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                        if !word.is_empty() {
                            s += word.as_str();
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }

                    // A trailing `L` sets the "long" flag on the number.
                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // --- punctuation and operators ---------------------------
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                '-' => {
                    chars.next(); match chars.peek() {
                        Some('-') => {
                            // `--` single-line comment.
                            chars.next(); let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "--".to_owned(),
                                comment,
                            })))
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
                                _ => self.start_binop(chars, "->", Token::Arrow),
                            }
                        }
                        _ => self.start_binop(chars, "-", Token::Minus),
                    }
                }
                '/' => {
                    chars.next(); match chars.peek() {
                        Some('*') => {
                            // `/* ... */` multi-line comment.
                            chars.next(); self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            // Snowflake `//` single-line comment.
                            chars.next(); let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => {
                    chars.next(); match chars.peek() {
                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "%", Token::Mod),
                    }
                }
                '|' => {
                    chars.next(); match chars.peek() {
                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); match chars.peek() {
                                Some('/') => {
                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
                                }
                                _ => self.start_binop(chars, "||", Token::StringConcat),
                            }
                        }
                        _ => self.start_binop(chars, "|", Token::Pipe),
                    }
                }
                '=' => {
                    chars.next(); match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                Some('~') => {
                                    chars.next();
                                    match chars.peek() {
                                        Some('*') => self.consume_and_return(
                                            chars,
                                            Token::ExclamationMarkDoubleTildeAsterisk,
                                        ),
                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
                                    }
                                }
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
                                _ => self.start_binop(chars, "<=", Token::LtEq),
                            }
                        }
                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
                        _ => self.start_binop(chars, "<", Token::Lt),
                    }
                }
                '>' => {
                    chars.next(); match chars.peek() {
                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
                        _ => self.start_binop(chars, ">", Token::Gt),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::Assignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => {
                    chars.next(); match chars.peek() {
                        Some('&') => {
                            chars.next(); self.start_binop(chars, "&&", Token::Overlap)
                        }
                        _ => self.start_binop(chars, "&", Token::Ampersand),
                    }
                }
                '^' => {
                    chars.next(); match chars.peek() {
                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
                        _ => Ok(Some(Token::Caret)),
                    }
                }
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                // `#` starts a single-line comment in these dialects.
                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect) => {
                    chars.next(); let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); match chars.peek() {
                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => {
                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
                                }
                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
                            }
                        }
                        _ => self.start_binop(chars, "~", Token::Tilde),
                    }
                }
                // `#` in all other dialects: `#-`, `#>`, `#>>`, identifier
                // start, or the bare Sharp token.
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
                                }
                                _ => self.start_binop(chars, "#>", Token::HashArrow),
                            }
                        }
                        Some(' ') => Ok(Some(Token::Sharp)),
                        Some(sch) if self.dialect.is_identifier_start('#') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "#", Token::Sharp),
                    }
                }
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => {
                            chars.next();
                            match chars.peek() {
                                Some(' ') => Ok(Some(Token::AtAt)),
                                Some(tch) if self.dialect.is_identifier_start('@') => {
                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
                                }
                                _ => Ok(Some(Token::AtAt)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::AtSign)),
                        Some(sch) if self.dialect.is_identifier_start('@') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                // PostgreSQL `?`-family operators.
                '?' if dialect_of!(self is PostgreSqlDialect) => {
                    chars.next();
                    match chars.peek() {
                        Some('|') => self.consume_and_return(chars, Token::QuestionPipe),
                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
                        // NOTE(review): `consume_and_return` advances the
                        // cursor, so this arm appears to also consume the
                        // character following a bare `?` — confirm this is
                        // intended (the other dialect branch below does not).
                        _ => self.consume_and_return(chars, Token::Question),
                    }
                }
                // `?` / `?N` positional placeholder in other dialects.
                '?' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
                }

                // Identifiers and keywords.
                ch if self.dialect.is_identifier_start(ch) => {
                    self.tokenize_identifier_or_keyword([ch], chars)
                }
                // `$` placeholders and dollar-quoted strings.
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                // Any other whitespace character collapses to a Space token.
                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                // Fallback: a single unrecognized character.
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }
1240
1241 fn consume_for_binop(
1243 &self,
1244 chars: &mut State,
1245 prefix: &str,
1246 default: Token,
1247 ) -> Result<Option<Token>, TokenizerError> {
1248 chars.next(); self.start_binop(chars, prefix, default)
1250 }
1251
1252 fn start_binop(
1254 &self,
1255 chars: &mut State,
1256 prefix: &str,
1257 default: Token,
1258 ) -> Result<Option<Token>, TokenizerError> {
1259 let mut custom = None;
1260 while let Some(&ch) = chars.peek() {
1261 if !self.dialect.is_custom_operator_part(ch) {
1262 break;
1263 }
1264
1265 custom.get_or_insert_with(|| prefix.to_string()).push(ch);
1266 chars.next();
1267 }
1268
1269 Ok(Some(
1270 custom.map(Token::CustomBinaryOperator).unwrap_or(default),
1271 ))
1272 }
1273
    /// Tokenize something starting with `$`: either a placeholder (`$1`,
    /// `$name`) or a dollar-quoted string (`$$...$$` / `$tag$...$tag$`).
    /// The cursor is on the leading `$` when called.
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        chars.next();

        if let Some('$') = chars.peek() {
            // `$$...$$`: tagless dollar quoting.
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            // Scan until a `$$` pair terminates the string; a lone `$`
            // followed by anything else is ordinary content.
            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Read the tag (or the placeholder name/number).
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric() || ch == '_'
            }));

            if let Some('$') = chars.peek() {
                // `$tag$...$tag$`: tagged dollar quoting.
                chars.next();

                'searching_for_end: loop {
                    // Take content up to the next `$`, then try to match
                    // the full closing delimiter `$tag$`.
                    s.push_str(&peeking_take_while(chars, |ch| ch != '$'));
                    match chars.peek() {
                        Some('$') => {
                            chars.next();
                            let mut maybe_s = String::from("$");
                            for c in value.chars() {
                                if let Some(next_char) = chars.next() {
                                    maybe_s.push(next_char);
                                    if next_char != c {
                                        // Mismatch: what we read was content
                                        // after all; keep it and keep looking.
                                        s.push_str(&maybe_s);
                                        continue 'searching_for_end;
                                    }
                                } else {
                                    return self.tokenizer_error(
                                        chars.location(),
                                        "Unterminated dollar-quoted, expected $",
                                    );
                                }
                            }
                            if chars.peek() == Some(&'$') {
                                // Full `$tag$` matched: string is closed.
                                chars.next();
                                maybe_s.push('$');
                                break 'searching_for_end;
                            } else {
                                // `$tag` without the final `$`: content.
                                s.push_str(&maybe_s);
                                continue 'searching_for_end;
                            }
                        }
                        _ => {
                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted, expected $",
                            )
                        }
                    }
                }
            } else {
                // No second `$`: a plain placeholder like `$1` or `$name`.
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }
1374
1375 fn tokenizer_error<R>(
1376 &self,
1377 loc: Location,
1378 message: impl Into<String>,
1379 ) -> Result<R, TokenizerError> {
1380 Err(TokenizerError {
1381 message: message.into(),
1382 location: loc,
1383 })
1384 }
1385
1386 fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
1388 let mut comment = peeking_take_while(chars, |ch| ch != '\n');
1389 if let Some(ch) = chars.next() {
1390 assert_eq!(ch, '\n');
1391 comment.push(ch);
1392 }
1393 comment
1394 }
1395
1396 fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
1398 let mut s = first_chars.into();
1399 s.push_str(&peeking_take_while(chars, |ch| {
1400 self.dialect.is_identifier_part(ch)
1401 }));
1402 s
1403 }
1404
1405 fn tokenize_escaped_single_quoted_string(
1407 &self,
1408 starting_loc: Location,
1409 chars: &mut State,
1410 ) -> Result<String, TokenizerError> {
1411 if let Some(s) = unescape_single_quoted_string(chars) {
1412 return Ok(s);
1413 }
1414
1415 self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
1416 }
1417
1418 fn tokenize_single_or_triple_quoted_string<F>(
1421 &self,
1422 chars: &mut State,
1423 quote_style: char,
1424 backslash_escape: bool,
1425 single_quote_token: F,
1426 triple_quote_token: F,
1427 ) -> Result<Option<Token>, TokenizerError>
1428 where
1429 F: Fn(String) -> Token,
1430 {
1431 let error_loc = chars.location();
1432
1433 let mut num_opening_quotes = 0u8;
1434 for _ in 0..3 {
1435 if Some("e_style) == chars.peek() {
1436 chars.next(); num_opening_quotes += 1;
1438 } else {
1439 break;
1440 }
1441 }
1442
1443 let (token_fn, num_quote_chars) = match num_opening_quotes {
1444 1 => (single_quote_token, NumStringQuoteChars::One),
1445 2 => {
1446 return Ok(Some(single_quote_token("".into())));
1448 }
1449 3 => {
1450 let Some(num_quote_chars) = NonZeroU8::new(3) else {
1451 return self.tokenizer_error(error_loc, "invalid number of opening quotes");
1452 };
1453 (
1454 triple_quote_token,
1455 NumStringQuoteChars::Many(num_quote_chars),
1456 )
1457 }
1458 _ => {
1459 return self.tokenizer_error(error_loc, "invalid string literal opening");
1460 }
1461 };
1462
1463 let settings = TokenizeQuotedStringSettings {
1464 quote_style,
1465 num_quote_chars,
1466 num_opening_quotes_to_consume: 0,
1467 backslash_escape,
1468 };
1469
1470 self.tokenize_quoted_string(chars, settings)
1471 .map(token_fn)
1472 .map(Some)
1473 }
1474
1475 fn tokenize_single_quoted_string(
1477 &self,
1478 chars: &mut State,
1479 quote_style: char,
1480 backslash_escape: bool,
1481 ) -> Result<String, TokenizerError> {
1482 self.tokenize_quoted_string(
1483 chars,
1484 TokenizeQuotedStringSettings {
1485 quote_style,
1486 num_quote_chars: NumStringQuoteChars::One,
1487 num_opening_quotes_to_consume: 1,
1488 backslash_escape,
1489 },
1490 )
1491 }
1492
    /// Core quoted-string reader: consumes the literal's body and closing
    /// delimiter according to `settings` and returns the contents
    /// (unescaped unless `self.unescape` is false).
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        // Consume any opening quotes the caller left in the stream.
        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            // Would the next quote character complete the closing delimiter?
            // Always for single-quoted strings; for triple-quoted strings
            // only when enough consecutive quotes were already seen.
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume the quote
                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // Multi-quote delimiter completed. The earlier quote
                        // chars of the delimiter were pushed into `s` by the
                        // fall-through arm below, so trim them off the end.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        // Doubled quote: escaped quote character.
                        s.push(ch);
                        if !self.unescape {
                            // Keep the escape sequence as written.
                            s.push(ch);
                        }
                        chars.next(); // consume the second quote
                    } else {
                        // Lone closing quote: literal is complete.
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    chars.next(); // consume the backslash

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape {
                            // Keep the backslash escape verbatim.
                            s.push(ch);
                            s.push(*next);
                            chars.next(); // consume the escaped char
                        } else {
                            // Translate backslash escapes; unrecognized
                            // escapes yield the escaped char itself.
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next(); // consume the escaped char
                        }
                    }
                }
                ch => {
                    chars.next(); // consume
                    // Track runs of quote chars for triple-quote closing.
                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }
1594
    /// Consumes a `/* ... */` comment — the opening `/*` has already been
    /// consumed — and returns it as a whitespace token. Comments may nest:
    /// the comment only ends once every `/*` has a matching `*/`.
    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
        // Previous character, used to spot the 2-char `/*` and `*/` markers.
        let mut last_ch = ' ';

        loop {
            match chars.next() {
                Some(ch) => {
                    if last_ch == '/' && ch == '*' {
                        nested += 1;
                    } else if last_ch == '*' && ch == '/' {
                        nested -= 1;
                        if nested == 0 {
                            // Drop the '*' of the final `*/` already in `s`.
                            s.pop();
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        }
                    }
                    s.push(ch);
                    last_ch = ch;
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    )
                }
            }
        }
    }
1627
1628 fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
1629 let mut last_char = None;
1630 let mut s = String::new();
1631 while let Some(ch) = chars.next() {
1632 if ch == quote_end {
1633 if chars.peek() == Some("e_end) {
1634 chars.next();
1635 s.push(ch);
1636 if !self.unescape {
1637 s.push(ch);
1639 }
1640 } else {
1641 last_char = Some(quote_end);
1642 break;
1643 }
1644 } else {
1645 s.push(ch);
1646 }
1647 }
1648 (s, last_char)
1649 }
1650
    /// Consumes the current character and returns `t` as the token.
    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut State,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
1660}
1661
1662fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
1666 let mut s = String::new();
1667 while let Some(&ch) = chars.peek() {
1668 if predicate(ch) {
1669 chars.next(); s.push(ch);
1671 } else {
1672 break;
1673 }
1674 }
1675 s
1676}
1677
/// Unescapes a backslash-escaped single-quoted string literal starting at
/// the current position. Returns `None` when the literal is unterminated
/// or an escape is invalid (see [`Unescape::unescape`]).
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
    Unescape::new(chars).unescape()
}
1681
/// Helper that walks the tokenizer state while decoding escape sequences
/// in a single-quoted string literal.
struct Unescape<'a: 'b, 'b> {
    // Shared tokenizer character stream; advanced as escapes are decoded.
    chars: &'b mut State<'a>,
}
1685
impl<'a: 'b, 'b> Unescape<'a, 'b> {
    fn new(chars: &'b mut State<'a>) -> Self {
        Self { chars }
    }

    /// Consumes the literal (including the opening quote still in the
    /// stream) and returns its unescaped text. Returns `None` when the
    /// literal is unterminated, an escape is malformed, or an escape
    /// resolves to NUL (see `check_null`).
    fn unescape(mut self) -> Option<String> {
        let mut unescaped = String::new();

        // Skip the opening quote; the caller has only peeked it.
        self.chars.next();

        while let Some(c) = self.chars.next() {
            if c == '\'' {
                // A doubled quote is an escaped quote; a lone quote ends
                // the literal.
                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                    self.chars.next();
                    unescaped.push('\'');
                    continue;
                }
                return Some(unescaped);
            }

            if c != '\\' {
                unescaped.push(c);
                continue;
            }

            // Backslash escape: decode the sequence that follows.
            let c = match self.chars.next()? {
                'b' => '\u{0008}',
                'f' => '\u{000C}',
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                'u' => self.unescape_unicode_16()?,
                'U' => self.unescape_unicode_32()?,
                'x' => self.unescape_hex()?,
                c if c.is_digit(8) => self.unescape_octal(c)?,
                // Unknown escapes yield the escaped character itself.
                c => c,
            };

            unescaped.push(Self::check_null(c)?);
        }

        // Input ended before the closing quote.
        None
    }

    // Rejects NUL characters produced by an escape.
    #[inline]
    fn check_null(c: char) -> Option<char> {
        if c == '\0' {
            None
        } else {
            Some(c)
        }
    }

    // Parses `s` in the given radix, keeps only the low byte, and accepts
    // only ASCII (<= 127) results.
    #[inline]
    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
        match u32::from_str_radix(s, RADIX) {
            Err(_) => None,
            Ok(n) => {
                let n = n & 0xFF;
                if n <= 127 {
                    char::from_u32(n)
                } else {
                    None
                }
            }
        }
    }

    // Handles `\x`: reads up to two hex digits; a bare `\x` with no digits
    // yields a literal 'x'.
    fn unescape_hex(&mut self) -> Option<char> {
        let mut s = String::new();

        for _ in 0..2 {
            match self.next_hex_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        if s.is_empty() {
            return Some('x');
        }

        Self::byte_to_char::<16>(&s)
    }

    // Consumes the next char only if it is a hex digit.
    #[inline]
    fn next_hex_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
            _ => None,
        }
    }

    // Handles `\NNN`: `c` is the first octal digit, up to two more follow.
    fn unescape_octal(&mut self, c: char) -> Option<char> {
        let mut s = String::new();

        s.push(c);
        for _ in 0..2 {
            match self.next_octal_digest() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        Self::byte_to_char::<8>(&s)
    }

    // Consumes the next char only if it is an octal digit.
    #[inline]
    fn next_octal_digest(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_digit(8) => self.chars.next(),
            _ => None,
        }
    }

    // `\uXXXX`: exactly four hex digits.
    fn unescape_unicode_16(&mut self) -> Option<char> {
        self.unescape_unicode::<4>()
    }

    // `\UXXXXXXXX`: exactly eight hex digits.
    fn unescape_unicode_32(&mut self) -> Option<char> {
        self.unescape_unicode::<8>()
    }

    // Reads exactly NUM chars and interprets them as a hex code point.
    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
        let mut s = String::new();
        for _ in 0..NUM {
            s.push(self.chars.next()?);
        }
        match u32::from_str_radix(&s, 16) {
            Err(_) => None,
            Ok(n) => char::from_u32(n),
        }
    }
}
1825
1826fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
1827 let mut unescaped = String::new();
1828 chars.next(); while let Some(c) = chars.next() {
1830 match c {
1831 '\'' => {
1832 if chars.peek() == Some(&'\'') {
1833 chars.next();
1834 unescaped.push('\'');
1835 } else {
1836 return Ok(unescaped);
1837 }
1838 }
1839 '\\' => match chars.peek() {
1840 Some('\\') => {
1841 chars.next();
1842 unescaped.push('\\');
1843 }
1844 Some('+') => {
1845 chars.next();
1846 unescaped.push(take_char_from_hex_digits(chars, 6)?);
1847 }
1848 _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
1849 },
1850 _ => {
1851 unescaped.push(c);
1852 }
1853 }
1854 }
1855 Err(TokenizerError {
1856 message: "Unterminated unicode encoded string literal".to_string(),
1857 location: chars.location(),
1858 })
1859}
1860
1861fn take_char_from_hex_digits(
1862 chars: &mut State<'_>,
1863 max_digits: usize,
1864) -> Result<char, TokenizerError> {
1865 let mut result = 0u32;
1866 for _ in 0..max_digits {
1867 let next_char = chars.next().ok_or_else(|| TokenizerError {
1868 message: "Unexpected EOF while parsing hex digit in escaped unicode string."
1869 .to_string(),
1870 location: chars.location(),
1871 })?;
1872 let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
1873 message: format!("Invalid hex digit in escaped unicode string: {}", next_char),
1874 location: chars.location(),
1875 })?;
1876 result = result * 16 + digit;
1877 }
1878 char::from_u32(result).ok_or_else(|| TokenizerError {
1879 message: format!("Invalid unicode character: {:x}", result),
1880 location: chars.location(),
1881 })
1882}
1883
1884#[cfg(test)]
1885mod tests {
1886 use super::*;
1887 use crate::dialect::{
1888 BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect,
1889 };
1890 use core::fmt::Debug;
1891
    // TokenizerError formats as "<message> at Line: <l>, Column: <c>" and,
    // under the "std" feature, implements std::error::Error with no source.
    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
    }
1905
    // Keyword, whitespace and an integer literal are separate tokens.
    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }
1920
    // A leading-dot decimal (".1") is tokenized as a single Number.
    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }
1935
    // ClickHouse accepts `==` as a single DoubleEq token.
    #[test]
    fn tokenize_clickhouse_double_equal() {
        let sql = String::from("SELECT foo=='1'");
        let dialect = ClickHouseDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "foo".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::DoubleEq,
            Token::SingleQuotedString("1".to_string()),
        ];

        compare(expected, tokens);
    }
1957
    // Exponent notation: valid forms (1e10, 1e-10, 1e+10) are single
    // numbers; an invalid suffix splits into number + word, and a second
    // minus is a separate operator.
    #[test]
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }
1991
    // Function call: name, parens and argument are separate tokens.
    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }
2009
    // `||` is tokenized as the StringConcat operator.
    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }
    // Bitwise `|` and `^` tokenize as Pipe and Caret.
    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }
2049
    // XOR and the boolean literals tokenize as keywords.
    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }
2089
    // A full SELECT ... WHERE ... LIMIT statement round-trips to the
    // expected token sequence.
    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }
2120
    // EXPLAIN prefix tokenizes like any other keyword.
    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }
2149
    // EXPLAIN ANALYZE prefix tokenizes as two keywords.
    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }
2180
    // `!=` tokenizes as Neq; spaces inside the string literal are kept.
    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }
2207
    // A character outside the identifier set becomes a Char token;
    // subsequent Unicode identifier characters form a word.
    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }
2222
    // Newlines inside a quoted string stay part of the literal.
    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }
2232
    // An unterminated literal reports the error at the opening quote.
    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }
2247
    // Error columns are counted in characters, not bytes, for UTF-8 input.
    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }
2265
    // Mixed newlines/tabs with a non-identifier char after a statement.
    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }
2289
    // Tagged dollar quoting: only the matching `$tag$` closes the string;
    // inner `$`, `$tags...` and `$$` are literal content.
    #[test]
    fn tokenize_dollar_quoted_string_tagged() {
        let sql = String::from(
            "SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$",
        );
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
                tag: Some("tag".into()),
            }),
        ];
        compare(expected, tokens);
    }
2307
    // A non-matching closing tag leaves the string unterminated.
    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated() {
        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 91
                }
            })
        );
    }
2323
    // Untagged `$$...$$` quoting; tag is None in the resulting token.
    #[test]
    fn tokenize_dollar_quoted_string_untagged() {
        let sql =
            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }
2340
    // An untagged `$$` string with no closing `$$` is an error.
    #[test]
    fn tokenize_dollar_quoted_string_untagged_unterminated() {
        let sql = String::from(
            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
        );
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted string".into(),
                location: Location {
                    line: 1,
                    column: 86
                }
            })
        );
    }
2358
    // `=>` tokenizes as RArrow (named-argument syntax).
    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }
2374
    // IS and NULL tokenize as keywords.
    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }
2391
    // A `--` comment keeps its trailing newline in the comment text.
    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "this is a comment\n".to_string(),
            }),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }
2408
    // A `--` comment at end of input has no trailing newline.
    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }
2421
    // `/* ... */` content (including `* /`, which is not a terminator)
    // becomes a MultiLineComment whitespace token.
    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }
2437
    // Nested `/* ... /* ... */ ... */` comments balance before closing.
    #[test]
    fn tokenize_nested_multiline_comment() {
        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }
2453
    // `/** ... **/` closes at the first `*/`; extra asterisks are content.
    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }
2467
    // Unicode whitespace (EM SPACE) is tokenized as a plain Space.
    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }
2481
    // An unclosed quoted identifier reports its opening position.
    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                location: Location { line: 1, column: 1 },
            })
        );
    }
2496
    // `\n`, `\r` and `\r\n` each produce exactly one Newline token.
    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }
2515
    // MS SQL `[bracketed]` identifiers carry '[' as their quote style.
    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }
2536
    // Postgres regex operators: `~`, `~*`, `!~`, `!~*`.
    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }
2574
2575 #[test]
2576 fn tokenize_pg_like_match() {
2577 let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
2578 let dialect = GenericDialect {};
2579 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2580 let expected = vec![
2581 Token::make_keyword("SELECT"),
2582 Token::Whitespace(Whitespace::Space),
2583 Token::make_word("col", None),
2584 Token::Whitespace(Whitespace::Space),
2585 Token::DoubleTilde,
2586 Token::Whitespace(Whitespace::Space),
2587 Token::SingleQuotedString("_a%".into()),
2588 Token::Comma,
2589 Token::Whitespace(Whitespace::Space),
2590 Token::make_word("col", None),
2591 Token::Whitespace(Whitespace::Space),
2592 Token::DoubleTildeAsterisk,
2593 Token::Whitespace(Whitespace::Space),
2594 Token::SingleQuotedString("_a%".into()),
2595 Token::Comma,
2596 Token::Whitespace(Whitespace::Space),
2597 Token::make_word("col", None),
2598 Token::Whitespace(Whitespace::Space),
2599 Token::ExclamationMarkDoubleTilde,
2600 Token::Whitespace(Whitespace::Space),
2601 Token::SingleQuotedString("_a%".into()),
2602 Token::Comma,
2603 Token::Whitespace(Whitespace::Space),
2604 Token::make_word("col", None),
2605 Token::Whitespace(Whitespace::Space),
2606 Token::ExclamationMarkDoubleTildeAsterisk,
2607 Token::Whitespace(Whitespace::Space),
2608 Token::SingleQuotedString("_a%".into()),
2609 ];
2610 compare(expected, tokens);
2611 }
2612
    #[test]
    fn tokenize_quoted_identifier() {
        // Inside a double-quoted identifier a doubled `""` escapes a single
        // `"`; with unescaping on (the default) the pair is collapsed.
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }
2629
2630 #[test]
2631 fn tokenize_snowflake_div() {
2632 let sql = r#"field/1000"#;
2633 let dialect = SnowflakeDialect {};
2634 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2635 let expected = vec![
2636 Token::make_word(r#"field"#, None),
2637 Token::Div,
2638 Token::Number("1000".to_string(), false),
2639 ];
2640 compare(expected, tokens);
2641 }
2642
    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        // Same input as `tokenize_quoted_identifier`, but with
        // `with_unescape(false)` the doubled quotes inside the identifiers
        // are kept verbatim instead of being collapsed.
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }
2662
2663 #[test]
2664 fn tokenize_with_location() {
2665 let sql = "SELECT a,\n b";
2666 let dialect = GenericDialect {};
2667 let tokens = Tokenizer::new(&dialect, sql)
2668 .tokenize_with_location()
2669 .unwrap();
2670 let expected = vec![
2671 TokenWithLocation::new(Token::make_keyword("SELECT"), 1, 1),
2672 TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 1, 7),
2673 TokenWithLocation::new(Token::make_word("a", None), 1, 8),
2674 TokenWithLocation::new(Token::Comma, 1, 9),
2675 TokenWithLocation::new(Token::Whitespace(Whitespace::Newline), 1, 10),
2676 TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 2, 1),
2677 TokenWithLocation::new(Token::make_word("b", None), 2, 2),
2678 ];
2679 compare(expected, tokens);
2680 }
2681
2682 fn compare<T: PartialEq + std::fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
2683 assert_eq!(expected, actual);
2688 }
2689
2690 fn check_unescape(s: &str, expected: Option<&str>) {
2691 let s = format!("'{}'", s);
2692 let mut state = State {
2693 peekable: s.chars().peekable(),
2694 line: 0,
2695 col: 0,
2696 };
2697
2698 assert_eq!(
2699 unescape_single_quoted_string(&mut state),
2700 expected.map(|s| s.to_string())
2701 );
2702 }
2703
    #[test]
    fn test_unescape() {
        // Simple single-character C-style escapes.
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

        // \uXXXX (4 hex digits) and \UXXXXXXXX (8 hex digits) escapes.
        // Truncated digit runs, NUL (U+0000) and values beyond U+10FFFF
        // are rejected (None); extra digits become literal trailing text.
        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None);
        check_unescape(r"\u0000", None);
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
        check_unescape(r"\U00110000", None);
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

        // \x escapes take one or two hex digits; a non-hex follower ends the
        // run. Per these cases, decoding to NUL or to a non-ASCII value
        // (> 0x7F) is rejected, and a bare `\x` leaves a literal "x".
        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}"));
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x"));
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

        // Octal escapes of one to three digits; extra digits become literal
        // text. Decoding to NUL or overflowing a byte (e.g. \603 = 0o603)
        // is rejected.
        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}"));
        check_unescape(r"\1232", Some("\u{0053}2"));
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None);
        check_unescape(r"\080", None);

        // `9` is not an octal digit, so the backslash is simply dropped;
        // `''` is an escaped single quote.
        check_unescape(r"\9", Some("9"));
        check_unescape(r"''", Some("'"));
        // Mixed escape kinds inside one literal.
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        // A single invalid escape rejects the whole string.
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }
2759
2760 #[test]
2761 fn tokenize_numeric_prefix_trait() {
2762 #[derive(Debug)]
2763 struct NumericPrefixDialect;
2764
2765 impl Dialect for NumericPrefixDialect {
2766 fn is_identifier_start(&self, ch: char) -> bool {
2767 ch.is_ascii_lowercase()
2768 || ch.is_ascii_uppercase()
2769 || ch.is_ascii_digit()
2770 || ch == '$'
2771 }
2772
2773 fn is_identifier_part(&self, ch: char) -> bool {
2774 ch.is_ascii_lowercase()
2775 || ch.is_ascii_uppercase()
2776 || ch.is_ascii_digit()
2777 || ch == '_'
2778 || ch == '$'
2779 || ch == '{'
2780 || ch == '}'
2781 }
2782
2783 fn supports_numeric_prefix(&self) -> bool {
2784 true
2785 }
2786 }
2787
2788 tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
2789 tokenize_numeric_prefix_inner(&HiveDialect {});
2790 tokenize_numeric_prefix_inner(&MySqlDialect {});
2791 }
2792
2793 fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
2794 let sql = r#"SELECT * FROM 1"#;
2795 let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
2796 let expected = vec![
2797 Token::make_keyword("SELECT"),
2798 Token::Whitespace(Whitespace::Space),
2799 Token::Mul,
2800 Token::Whitespace(Whitespace::Space),
2801 Token::make_keyword("FROM"),
2802 Token::Whitespace(Whitespace::Space),
2803 Token::Number(String::from("1"), false),
2804 ];
2805 compare(expected, tokens);
2806 }
2807
    #[test]
    fn tokenize_quoted_string_escape() {
        // Snowflake supports backslash escapes inside single-quoted strings;
        // each case lists (input, raw token text, unescaped token text).
        let dialect = SnowflakeDialect {};
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
            (r#"'\\'"#, r#"\\"#, r#"\"#),
            (
                r#"'\0\a\b\f\n\r\t\Z'"#,
                r#"\0\a\b\f\n\r\t\Z"#,
                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
            ),
            (r#"'\"'"#, r#"\""#, "\""),
            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
        ] {
            // With unescaping disabled the backslash sequences are preserved.
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected.to_string())];
            compare(expected, tokens);

            // With unescaping enabled the sequences are resolved.
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

        // In Snowflake `\'` escapes the quote, so a trailing `\'` leaves the
        // literal unterminated.
        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }

        // GenericDialect treats the backslash literally, so the same inputs
        // tokenize as complete strings ending in `\`.
        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
            let dialect = GenericDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }
    }
2858
    #[test]
    fn tokenize_triple_quoted_string() {
        // `q` is the quote character under test, `r` is the *other* quote
        // character, and `quote_token` builds the expected triple-quoted
        // token variant for `q`.
        fn check<F>(
            q: char, r: char, quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            // Each case lists (input, raw token text, unescaped token text).
            for (sql, expected, expected_unescaped) in [
                // Empty triple-quoted string.
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                // Backslash-escaped quote inside; unescaping drops the backslash.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                // Plain contents, no escapes.
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                // The other quote character needs no escaping at all.
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                // Runs of one or two unescaped quote chars may appear inside.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                // `\'` escapes unescape the same way regardless of `q`.
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                // Control-character escapes inside triple quotes.
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                // Raw tokenization keeps the backslash sequences.
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                // Unescaping resolves them.
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            // Every input below is missing a closing triple quote.
            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        // Exercise both quote characters symmetrically.
        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        // Adjacent empty quoted strings must be two separate tokens,
        // not mistaken for the start of a triple-quoted string.
        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Same check with the quote kinds swapped.
        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Snowflake has no triple-quoted strings: `''''''` is one string
        // literal containing two escaped (doubled) quotes.
        let dialect = SnowflakeDialect {};
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }
2978}