#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::iter::Peekable;
use core::num::NonZeroU8;
use core::str::Chars;
use core::{cmp, fmt};

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use yachtsql_sqlparser_derive::{Visit, VisitMut};

use crate::dialect::Dialect;
use crate::dialect::{
    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
    SnowflakeDialect,
};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use crate::{ast::DollarQuotedString, dialect::HiveDialect};

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    EOF,
    Word(Word),
    Number(String, bool),
    Char(char),
    SingleQuotedString(String),
    DoubleQuotedString(String),
    TripleSingleQuotedString(String),
    TripleDoubleQuotedString(String),
    DollarQuotedString(DollarQuotedString),
    SingleQuotedByteStringLiteral(String),
    DoubleQuotedByteStringLiteral(String),
    TripleSingleQuotedByteStringLiteral(String),
    TripleDoubleQuotedByteStringLiteral(String),
    SingleQuotedRawStringLiteral(String),
    DoubleQuotedRawStringLiteral(String),
    TripleSingleQuotedRawStringLiteral(String),
    TripleDoubleQuotedRawStringLiteral(String),
    NationalStringLiteral(String),
    EscapedStringLiteral(String),
    UnicodeStringLiteral(String),
    HexStringLiteral(String),
    Comma,
    Whitespace(Whitespace),
    DoubleEq,
    Eq,
    Neq,
    Lt,
    Gt,
    LtEq,
    GtEq,
    Spaceship,
    Plus,
    Minus,
    Mul,
    Div,
    DuckIntDiv,
    Mod,
    StringConcat,
    LParen,
    RParen,
    Period,
    Colon,
    DoubleColon,
    Assignment,
    SemiColon,
    Backslash,
    LBracket,
    RBracket,
    Ampersand,
    Pipe,
    Caret,
    LBrace,
    RBrace,
    RArrow,
    Sharp,
    DoubleSharp,
    Tilde,
    TildeAsterisk,
    ExclamationMarkTilde,
    ExclamationMarkTildeAsterisk,
    DoubleTilde,
    DoubleTildeAsterisk,
    ExclamationMarkDoubleTilde,
    ExclamationMarkDoubleTildeAsterisk,
    ShiftLeft,
    ShiftRight,
    Overlap,
    ExclamationMark,
    DoubleExclamationMark,
    AtSign,
    CaretAt,
    PGSquareRoot,
    PGCubeRoot,
    Placeholder(String),
    Arrow,
    LongArrow,
    HashArrow,
    AtDashAt,
    QuestionMarkDash,
    AmpersandLeftAngleBracket,
    AmpersandRightAngleBracket,
    AmpersandLeftAngleBracketVerticalBar,
    VerticalBarAmpersandRightAngleBracket,
    TwoWayArrow,
    LeftAngleBracketCaret,
    RightAngleBracketCaret,
    QuestionMarkSharp,
    QuestionMarkDashVerticalBar,
    QuestionMarkDoubleVerticalBar,
    TildeEqual,
    ShiftLeftVerticalBar,
    VerticalBarShiftRight,
    VerticalBarRightAngleBracket,
    HashLongArrow,
    AtArrow,
    ArrowAt,
    HashMinus,
    AtQuestion,
    AtAt,
    Question,
    QuestionAnd,
    QuestionPipe,
    CustomBinaryOperator(String),
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::Assignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::DoubleSharp => f.write_str("##"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::AtDashAt => f.write_str("@-@"),
            Token::QuestionMarkDash => f.write_str("?-"),
            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
            Token::AmpersandRightAngleBracket => f.write_str("&>"),
            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
            Token::VerticalBarRightAngleBracket => f.write_str("|>"),
            Token::TwoWayArrow => f.write_str("<->"),
            Token::LeftAngleBracketCaret => f.write_str("<^"),
            Token::RightAngleBracketCaret => f.write_str(">^"),
            Token::QuestionMarkSharp => f.write_str("?#"),
            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
            Token::TildeEqual => f.write_str("~="),
            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
            Token::VerticalBarShiftRight => f.write_str("|>>"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
            Token::Question => write!(f, "?"),
            Token::QuestionAnd => write!(f, "?&"),
            Token::QuestionPipe => write!(f, "?|"),
            Token::CustomBinaryOperator(s) => f.write_str(s),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

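    /// Create a `Token::Word` from `word` and an optional quote style.
    /// An unquoted word is matched case-insensitively against the keyword
    /// list, so `Token::make_word("select", None)` carries `Keyword::SELECT`,
    /// while a quoted word such as `Token::make_word("select", Some('"'))`
    /// is always `Keyword::NoKeyword`.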
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    pub value: String,
    pub quote_style: Option<char>,
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"',
            '[' => ']',
            '`' => '`',
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}

#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    pub line: u64,
    pub column: u64,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        if self.line == 0 {
            return Ok(());
        }
        write!(f, " at Line: {}, Column: {}", self.line, self.column)
    }
}

impl fmt::Debug for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Location({},{})", self.line, self.column)
    }
}

impl Location {
    pub fn empty() -> Self {
        Self { line: 0, column: 0 }
    }

    pub fn new(line: u64, column: u64) -> Self {
        Self { line, column }
    }

    pub fn of(line: u64, column: u64) -> Self {
        Self::new(line, column)
    }

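    /// Combine this start [`Location`] with `end` into a [`Span`]. For
    /// example, `Location::of(1, 1).span_to(Location::of(1, 7))` covers the
    /// first six characters of the first line.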
    pub fn span_to(self, end: Self) -> Span {
        Span { start: self, end }
    }
}

impl From<(u64, u64)> for Location {
    fn from((line, column): (u64, u64)) -> Self {
        Self { line, column }
    }
}

#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    pub start: Location,
    pub end: Location,
}

impl fmt::Debug for Span {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Span({:?}..{:?})", self.start, self.end)
    }
}

impl Span {
    const EMPTY: Span = Self::empty();

    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }
    pub fn union(&self, other: &Span) -> Span {
        match (self, other) {
            (&Span::EMPTY, _) => *other,
            (_, &Span::EMPTY) => *self,
            _ => Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            },
        }
    }

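    /// Like [`Span::union`], but treats `None` the same as an empty span.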
    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        match other {
            Some(other) => self.union(other),
            None => *self,
        }
    }

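    /// Unions an iterator of spans, returning `Span::empty()` for an empty
    /// iterator. For example, the span of a whole token stream is
    /// `Span::union_iter(tokens.iter().map(|t| t.span))`.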
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        iter.into_iter()
            .reduce(|acc, item| acc.union(&item))
            .unwrap_or(Span::empty())
    }
}

#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;

#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    pub token: Token,
    pub span: Span,
}

impl TokenWithSpan {
    pub fn new(token: Token, span: Span) -> Self {
        Self { token, span }
    }

    pub fn wrap(token: Token) -> Self {
        Self::new(token, Span::empty())
    }

    pub fn at(token: Token, start: Location, end: Location) -> Self {
        Self::new(token, Span::new(start, end))
    }

    pub fn new_eof() -> Self {
        Self::wrap(Token::EOF)
    }
}

impl PartialEq<Token> for TokenWithSpan {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithSpan> for Token {
    fn eq(&self, other: &TokenWithSpan) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithSpan {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}{}", self.message, self.location)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}

impl State<'_> {
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}

#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    One,
    Many(NonZeroU8),
}

struct TokenizeQuotedStringSettings {
    quote_style: char,
    num_quote_chars: NumStringQuoteChars,
    num_opening_quotes_to_consume: u8,
    backslash_escape: bool,
}

pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    unescape: bool,
}

impl<'a> Tokenizer<'a> {
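    /// Create a new SQL tokenizer for the given dialect and SQL statement.
    ///
    /// A minimal usage sketch (the crate path is an assumption based on this
    /// crate's derive import):
    ///
    /// ```ignore
    /// use yachtsql_sqlparser::dialect::GenericDialect;
    /// use yachtsql_sqlparser::tokenizer::Tokenizer;
    ///
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 'foo'").tokenize().unwrap();
    /// ```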
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }

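    /// Set whether quoted strings are unescaped (the default) or kept
    /// verbatim. With unescaping disabled, a doubled quote such as `''`
    /// inside a single-quoted string is preserved as-is rather than being
    /// collapsed to a single quote, which is useful for consumers that need
    /// to reproduce the original source text.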
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }

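    /// Tokenize the statement and produce a vector of tokens.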
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;
        Ok(twl.into_iter().map(|t| t.token).collect())
    }

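    /// Tokenize the statement and produce a vector of tokens with locations.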
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
        let mut tokens: Vec<TokenWithSpan> = vec![];
        self.tokenize_with_location_into_buf(&mut tokens)
            .map(|_| tokens)
    }

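    /// Tokenize the statement and append tokens with locations into the
    /// provided buffer. Equivalent to [`Tokenizer::tokenize_with_location`],
    /// but lets the caller reuse an existing allocation.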
    pub fn tokenize_with_location_into_buf(
        &mut self,
        buf: &mut Vec<TokenWithSpan>,
    ) -> Result<(), TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let span = location.span_to(state.location());

            buf.push(TokenWithSpan { token, span });

            location = state.location();
        }
        Ok(())
    }

    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // A word made up entirely of digits and dots is really a number.
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }

    fn next_token(
        &self,
        chars: &mut State,
        prev_token: Option<&Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
                {
                    chars.next();
                    match chars.peek() {
                        Some('\'') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '\'',
                                        false,
                                        Token::SingleQuotedByteStringLiteral,
                                        Token::TripleSingleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '"',
                                        false,
                                        Token::DoubleQuotedByteStringLiteral,
                                        Token::TripleDoubleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next();
                    match chars.peek() {
                        Some('\'') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                false,
                                Token::SingleQuotedRawStringLiteral,
                                Token::TripleSingleQuotedRawStringLiteral,
                            ),
                        Some('\"') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                false,
                                Token::DoubleQuotedRawStringLiteral,
                                Token::TripleDoubleQuotedRawStringLiteral,
                            ),
                        _ => {
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                n @ 'N' | n @ 'n' => {
                    chars.next();
                    match chars.peek() {
                        Some('\'') => {
                            let backslash_escape =
                                self.dialect.supports_string_literal_backslash_escape();
                            let s =
                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
                    let starting_loc = chars.location();
                    chars.next();
                    match chars.peek() {
                        Some('\'') => {
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
                    chars.next();
                    if chars.peek() == Some(&'&') {
                        let mut chars_clone = chars.peekable.clone();
                        chars_clone.next();
                        if chars_clone.peek() == Some(&'\'') {
                            chars.next();
                            let s = unescape_unicode_single_quoted_string(chars)?;
                            return Ok(Some(Token::UnicodeStringLiteral(s)));
                        }
                    }
                    let s = self.tokenize_word(x, chars);
                    Ok(Some(Token::make_word(&s, None)))
                }
                x @ 'x' | x @ 'X' => {
                    chars.next();
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                '\'' => {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::SingleQuotedString,
                                Token::TripleSingleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '\'',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::DoubleQuotedString,
                                Token::TripleDoubleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '"',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                quote_start if self.dialect.is_delimited_identifier_start(ch) => {
                    let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                    Ok(Some(Token::make_word(&word, Some(quote_start))))
                }
                quote_start
                    if self
                        .dialect
                        .is_nested_delimited_identifier_start(quote_start)
                        && self
                            .dialect
                            .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                            .is_some() =>
                {
                    let Some((quote_start, nested_quote_start)) = self
                        .dialect
                        .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                    else {
                        return self.tokenizer_error(
                            chars.location(),
                            format!("Expected nested delimiter '{quote_start}' before EOF."),
                        );
                    };

                    let Some(nested_quote_start) = nested_quote_start else {
                        let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                        return Ok(Some(Token::make_word(&word, Some(quote_start))));
                    };

                    let mut word = vec![];
                    let quote_end = Word::matching_end_quote(quote_start);
                    let nested_quote_end = Word::matching_end_quote(nested_quote_start);
                    let error_loc = chars.location();

                    chars.next(); // consume the opening quote
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&nested_quote_start) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
                        );
                    }
                    word.push(nested_quote_start.into());
                    word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
                    word.push(nested_quote_end.into());
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&quote_end) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        );
                    }
                    chars.next(); // consume the closing quote
                    Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
                }
                '0'..='9' | '.' => {
                    // Special case: `.` followed by `_` after a word is a
                    // period (a qualified-name separator), not a number.
                    if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
                        if let Some(Token::Word(_)) = prev_token {
                            chars.next();
                            return Ok(Some(Token::Period));
                        }

                        return self.tokenizer_error(
                            chars.location(),
                            "Unexpected character '_'".to_string(),
                        );
                    }

                    // Underscore acts as a digit separator when the dialect
                    // supports it.
                    let is_number_separator = |ch: char, next_char: Option<char>| {
                        self.dialect.supports_numeric_literal_underscores()
                            && ch == '_'
                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
                    };

                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // Match hexadecimal literals of the form 0xaaff
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
                        });
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // Match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }

                    if s == "." && self.dialect.supports_numeric_prefix() {
                        if let Some(Token::Word(_)) = prev_token {
                            return Ok(Some(Token::Period));
                        }
                    }

                    s += &peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // No number -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // Parse an optional exponent such as e10, E-10 or e+10.
                    let mut exponent_part = String::new();
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        // Optional sign
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // Definitely an exponent: commit the lookahead.
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            _ => (),
                        }
                    }

                    if self.dialect.supports_numeric_prefix() {
                        if exponent_part.is_empty() {
                            // A number immediately followed by identifier
                            // characters forms a single word.
                            let word =
                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                            if !word.is_empty() {
                                s += word.as_str();
                                return Ok(Some(Token::make_word(s.as_str(), None)));
                            }
                        } else if prev_token == Some(&Token::Period) {
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }

                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                '-' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => {
                            let mut is_comment = true;
                            if self.dialect.requires_single_line_comment_whitespace() {
                                is_comment = Some(' ') == chars.peekable.clone().nth(1);
                            }

                            if is_comment {
                                chars.next();
                                let comment = self.tokenize_single_line_comment(chars);
                                return Ok(Some(Token::Whitespace(
                                    Whitespace::SingleLineComment {
                                        prefix: "--".to_owned(),
                                        comment,
                                    },
                                )));
                            }

                            self.start_binop(chars, "-", Token::Minus)
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
                                _ => self.start_binop(chars, "->", Token::Arrow),
                            }
                        }
                        _ => self.start_binop(chars, "-", Token::Minus),
                    }
                }
                '/' => {
                    chars.next();
                    match chars.peek() {
                        Some('*') => {
                            chars.next();
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next();
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => {
                    chars.next();
                    match chars.peek() {
                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "%", Token::Mod),
                    }
                }
                '|' => {
                    chars.next();
                    match chars.peek() {
                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
                        Some('|') => {
                            chars.next();
                            match chars.peek() {
                                Some('/') => {
                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
                                }
                                _ => self.start_binop(chars, "||", Token::StringConcat),
                            }
                        }
                        Some('&') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|&>",
                                    Token::VerticalBarAmpersandRightAngleBracket,
                                ),
                                _ => self.start_binop_opt(chars, "|&", None),
                            }
                        }
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|>>",
                                    Token::VerticalBarShiftRight,
                                ),
                                _ => self.start_binop_opt(chars, "|>", None),
                            }
                        }
                        Some('>') if self.dialect.supports_pipe_operator() => {
                            self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
                        }
                        _ => self.start_binop(chars, "|", Token::Pipe),
                    }
                }
                '=' => {
                    chars.next();
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next();
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                Some('~') => {
                                    chars.next();
                                    match chars.peek() {
                                        Some('*') => self.consume_and_return(
                                            chars,
                                            Token::ExclamationMarkDoubleTildeAsterisk,
                                        ),
                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
                                    }
                                }
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next();
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
                                _ => self.start_binop(chars, "<=", Token::LtEq),
                            }
                        }
                        Some('|') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
                        }
                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            match chars.peek() {
                                Some('|') => self.consume_for_binop(
                                    chars,
                                    "<<|",
                                    Token::ShiftLeftVerticalBar,
                                ),
                                _ => self.start_binop(chars, "<<", Token::ShiftLeft),
                            }
                        }
                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
                                }
                                _ => self.start_binop_opt(chars, "<-", None),
                            }
                        }
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
                        }
                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
                        _ => self.start_binop(chars, "<", Token::Lt),
                    }
                }
                '>' => {
                    chars.next();
                    match chars.peek() {
                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
                        }
                        _ => self.start_binop(chars, ">", Token::Gt),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::Assignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => {
                    chars.next();
                    match chars.peek() {
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
                        }
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::AmpersandLeftAngleBracketVerticalBar,
                                ),
                                _ => {
                                    self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
                                }
                            }
                        }
                        Some('&') => {
                            chars.next();
                            self.start_binop(chars, "&&", Token::Overlap)
                        }
                        _ => self.start_binop(chars, "&", Token::Ampersand),
                    }
                }
                '^' => {
                    chars.next();
                    match chars.peek() {
                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
                        _ => Ok(Some(Token::Caret)),
                    }
                }
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
                {
                    chars.next();
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next();
                    match chars.peek() {
                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
                        Some('=') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "~=", Token::TildeEqual)
                        }
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => {
                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
                                }
                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
                            }
                        }
                        _ => self.start_binop(chars, "~", Token::Tilde),
                    }
                }
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
                                }
                                _ => self.start_binop(chars, "#>", Token::HashArrow),
                            }
                        }
                        Some(' ') => Ok(Some(Token::Sharp)),
                        Some('#') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "##", Token::DoubleSharp)
                        }
                        Some(sch) if self.dialect.is_identifier_start('#') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "#", Token::Sharp),
                    }
                }
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('@') if self.dialect.supports_geometric_types() => {
                            self.consume_and_return(chars, Token::AtAt)
                        }
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            match chars.peek() {
                                Some('@') => self.consume_and_return(chars, Token::AtDashAt),
                                _ => self.start_binop_opt(chars, "@-", None),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => {
                            chars.next();
                            match chars.peek() {
                                Some(' ') => Ok(Some(Token::AtAt)),
                                Some(tch) if self.dialect.is_identifier_start('@') => {
                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
                                }
                                _ => Ok(Some(Token::AtAt)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::AtSign)),
                        Some('\'') => Ok(Some(Token::AtSign)),
                        Some('\"') => Ok(Some(Token::AtSign)),
                        Some('`') => Ok(Some(Token::AtSign)),
                        Some(sch) if self.dialect.is_identifier_start('@') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                '?' if self.dialect.supports_geometric_types() => {
                    chars.next();
                    match chars.peek() {
                        Some('|') => {
                            chars.next();
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::QuestionMarkDoubleVerticalBar,
                                ),
                                _ => Ok(Some(Token::QuestionPipe)),
                            }
                        }

                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
                        Some('-') => {
                            chars.next();
                            match chars.peek() {
                                Some('|') => self
                                    .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
                                _ => Ok(Some(Token::QuestionMarkDash)),
                            }
                        }
                        Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
                        _ => self.consume_and_return(chars, Token::Question),
                    }
                }
                '?' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
                }

                ch if self.dialect.is_identifier_start(ch) => {
                    self.tokenize_identifier_or_keyword([ch], chars)
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

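    /// Consume the character that completed the operator `prefix`, then
    /// continue scanning for a dialect-specific custom operator, falling
    /// back to `default`.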
    fn consume_for_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        self.start_binop_opt(chars, prefix, Some(default))
    }

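    /// Continue scanning for a dialect-specific custom operator that starts
    /// with `prefix`, returning `default` if none is found.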
    fn start_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        self.start_binop_opt(chars, prefix, Some(default))
    }

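    /// Continue scanning for a custom operator that starts with `prefix`.
    /// If none is found and `default` is `None`, the prefix alone is not a
    /// valid operator and an error is raised.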
    fn start_binop_opt(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Option<Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut custom = None;
        while let Some(&ch) = chars.peek() {
            if !self.dialect.is_custom_operator_part(ch) {
                break;
            }

            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
            chars.next();
        }
        match (custom, default) {
            (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
            (None, Some(tok)) => Ok(Some(tok)),
            (None, None) => self.tokenizer_error(
                chars.location(),
                format!("Expected a valid binary operator after '{prefix}'"),
            ),
        }
    }

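    /// Tokenize a dollar-preceded value (i.e. a dollar-quoted string or a
    /// placeholder such as `$1`).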
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        chars.next();

        // If the dialect does not support dollar placeholders, a second `$`
        // starts an untagged dollar-quoted string ($$...$$).
        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric()
                    || ch == '_'
                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
            }));

            // A tagged dollar-quoted string, e.g. $tag$...$tag$.
            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
                chars.next();

                let mut temp = String::new();
                let end_delimiter = format!("${value}$");

                loop {
                    match chars.next() {
                        Some(ch) => {
                            temp.push(ch);

                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }
                        }
                        None => {
                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }

                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted, expected $",
                            );
                        }
                    }
                }
            } else {
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }

    fn tokenizer_error<R>(
        &self,
        loc: Location,
        message: impl Into<String>,
    ) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            location: loc,
        })
    }

    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
        let mut comment = peeking_take_while(chars, |ch| match ch {
            '\n' => false,                                           // Always stop at \n
            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r on Postgres
            _ => true,                                               // Keep consuming
        });

        if let Some(ch) = chars.next() {
            assert!(ch == '\n' || ch == '\r');
            comment.push(ch);
        }

        comment
    }

    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
        let mut s = first_chars.into();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

    fn tokenize_quoted_identifier(
        &self,
        quote_start: char,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        let error_loc = chars.location();
        chars.next(); // consume the opening quote
        let quote_end = Word::matching_end_quote(quote_start);
        let (s, last_char) = self.parse_quoted_ident(chars, quote_end);

        if last_char == Some(quote_end) {
            Ok(s)
        } else {
            self.tokenizer_error(
                error_loc,
                format!("Expected close delimiter '{quote_end}' before EOF."),
            )
        }
    }

    fn tokenize_escaped_single_quoted_string(
        &self,
        starting_loc: Location,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        if let Some(s) = unescape_single_quoted_string(chars) {
            return Ok(s);
        }

        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
    }

    fn tokenize_single_or_triple_quoted_string<F>(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
        single_quote_token: F,
        triple_quote_token: F,
    ) -> Result<Option<Token>, TokenizerError>
    where
        F: Fn(String) -> Token,
    {
        let error_loc = chars.location();

        let mut num_opening_quotes = 0u8;
        for _ in 0..3 {
            if Some(&quote_style) == chars.peek() {
                chars.next(); // consume the quote
                num_opening_quotes += 1;
            } else {
                break;
            }
        }

        let (token_fn, num_quote_chars) = match num_opening_quotes {
            1 => (single_quote_token, NumStringQuoteChars::One),
            2 => {
                // Two consecutive quotes form an empty string.
                return Ok(Some(single_quote_token("".into())));
            }
            3 => {
                let Some(num_quote_chars) = NonZeroU8::new(3) else {
                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
                };
                (
                    triple_quote_token,
                    NumStringQuoteChars::Many(num_quote_chars),
                )
            }
            _ => {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        };

        let settings = TokenizeQuotedStringSettings {
            quote_style,
            num_quote_chars,
            num_opening_quotes_to_consume: 0,
            backslash_escape,
        };

        self.tokenize_quoted_string(chars, settings)
            .map(token_fn)
            .map(Some)
    }

    fn tokenize_single_quoted_string(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
    ) -> Result<String, TokenizerError> {
        self.tokenize_quoted_string(
            chars,
            TokenizeQuotedStringSettings {
                quote_style,
                num_quote_chars: NumStringQuoteChars::One,
                num_opening_quotes_to_consume: 1,
                backslash_escape,
            },
        )
    }

    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume the closing-quote candidate
                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // For a triple-quoted string the buffer still holds
                        // the quotes that preceded this final one; strip them
                        // before returning.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        // Escaped quote
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, keep the doubled quote.
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    // consume the backslash
                    chars.next();

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape
                            || (self.dialect.ignores_wildcard_escapes()
                                && (*next == '%' || *next == '_'))
                        {
                            // In no-escape mode, keep the sequence verbatim.
                            s.push(ch);
                            s.push(*next);
                            chars.next(); // consume next
                        } else if *next == 'u' {
                            chars.next(); // consume u
                            let hex: String = (0..4).filter_map(|_| chars.next()).collect();
                            if hex.len() == 4 {
                                if let Ok(code_point) = u32::from_str_radix(&hex, 16) {
                                    if let Some(unicode_char) = char::from_u32(code_point) {
                                        s.push(unicode_char);
                                    } else {
                                        s.push_str("\\u");
                                        s.push_str(&hex);
                                    }
                                } else {
                                    s.push_str("\\u");
                                    s.push_str(&hex);
                                }
                            } else {
                                s.push_str("\\u");
                                s.push_str(&hex);
                            }
                        } else if *next == 'U' {
                            chars.next(); // consume U
                            let hex: String = (0..8).filter_map(|_| chars.next()).collect();
                            if hex.len() == 8 {
                                if let Ok(code_point) = u32::from_str_radix(&hex, 16) {
                                    if let Some(unicode_char) = char::from_u32(code_point) {
                                        s.push(unicode_char);
                                    } else {
                                        s.push_str("\\U");
                                        s.push_str(&hex);
                                    }
                                } else {
                                    s.push_str("\\U");
                                    s.push_str(&hex);
                                }
                            } else {
                                s.push_str("\\U");
                                s.push_str(&hex);
                            }
                        } else {
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next(); // consume next
                        }
                    }
                }
                ch => {
                    chars.next(); // consume
                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }

    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
        let supports_nested_comments = self.dialect.supports_nested_comments();

        loop {
            match chars.next() {
                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
                    chars.next();
                    s.push('/');
                    s.push('*');
                    nested += 1;
                }
                Some('*') if matches!(chars.peek(), Some('/')) => {
                    chars.next();
                    nested -= 1;
                    if nested == 0 {
                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                    }
                    s.push('*');
                    s.push('/');
                }
                Some(ch) => {
                    s.push(ch);
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    );
                }
            }
        }
    }

    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
        let mut last_char = None;
        let mut s = String::new();
        while let Some(ch) = chars.next() {
            if ch == quote_end {
                if chars.peek() == Some(&quote_end) {
                    chars.next();
                    s.push(ch);
                    if !self.unescape {
                        s.push(ch);
                    }
                } else {
                    last_char = Some(quote_end);
                    break;
                }
            } else {
                s.push(ch);
            }
        }
        (s, last_char)
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut State,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

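/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Returns the characters read as a String, leaving the first non-matching
/// char available for the next `chars.next()` call.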
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next();
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

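/// Same as [`peeking_take_while`], but the predicate also receives the
/// character after the current one, providing one character of lookahead.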
fn peeking_next_take_while(
    chars: &mut State,
    mut predicate: impl FnMut(char, Option<char>) -> bool,
) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        let next_char = chars.peekable.clone().nth(1);
        if predicate(ch, next_char) {
            chars.next();
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
    Unescape::new(chars).unescape()
}

struct Unescape<'a: 'b, 'b> {
    chars: &'b mut State<'a>,
}

impl<'a: 'b, 'b> Unescape<'a, 'b> {
    fn new(chars: &'b mut State<'a>) -> Self {
        Self { chars }
    }

    fn unescape(mut self) -> Option<String> {
        let mut unescaped = String::new();

        self.chars.next();

        while let Some(c) = self.chars.next() {
            if c == '\'' {
                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                    self.chars.next();
                    unescaped.push('\'');
                    continue;
                }
                return Some(unescaped);
            }

            if c != '\\' {
                unescaped.push(c);
                continue;
            }

            let c = match self.chars.next()? {
                'b' => '\u{0008}',
                'f' => '\u{000C}',
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                'u' => self.unescape_unicode_16()?,
                'U' => self.unescape_unicode_32()?,
                'x' => self.unescape_hex()?,
                c if c.is_digit(8) => self.unescape_octal(c)?,
                c => c,
            };

            unescaped.push(Self::check_null(c)?);
        }

        None
    }

    #[inline]
    fn check_null(c: char) -> Option<char> {
        if c == '\0' {
            None
        } else {
            Some(c)
        }
    }

    #[inline]
    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
        match u32::from_str_radix(s, RADIX) {
            Err(_) => None,
            Ok(n) => {
                let n = n & 0xFF;
                if n <= 127 {
                    char::from_u32(n)
                } else {
                    None
                }
            }
        }
    }

    fn unescape_hex(&mut self) -> Option<char> {
        let mut s = String::new();

        for _ in 0..2 {
            match self.next_hex_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        if s.is_empty() {
            return Some('x');
        }

        Self::byte_to_char::<16>(&s)
    }

    #[inline]
    fn next_hex_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
            _ => None,
        }
    }

    fn unescape_octal(&mut self, c: char) -> Option<char> {
        let mut s = String::new();

        s.push(c);
        for _ in 0..2 {
            match self.next_octal_digest() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        Self::byte_to_char::<8>(&s)
    }

    #[inline]
    fn next_octal_digest(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_digit(8) => self.chars.next(),
            _ => None,
        }
    }

    fn unescape_unicode_16(&mut self) -> Option<char> {
        self.unescape_unicode::<4>()
    }

    fn unescape_unicode_32(&mut self) -> Option<char> {
        self.unescape_unicode::<8>()
    }

    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
        let mut s = String::new();
        for _ in 0..NUM {
            s.push(self.chars.next()?);
        }
        match u32::from_str_radix(&s, 16) {
            Err(_) => None,
            Ok(n) => char::from_u32(n),
        }
    }
}

fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
    let mut unescaped = String::new();
    chars.next(); // consume the opening quote
    while let Some(c) = chars.next() {
        match c {
            '\'' => {
                if chars.peek() == Some(&'\'') {
                    chars.next();
                    unescaped.push('\'');
                } else {
                    return Ok(unescaped);
                }
            }
            '\\' => match chars.peek() {
                Some('\\') => {
                    chars.next();
                    unescaped.push('\\');
                }
                Some('+') => {
                    chars.next();
                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
                }
                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
            },
            _ => {
                unescaped.push(c);
            }
        }
    }
    Err(TokenizerError {
        message: "Unterminated unicode encoded string literal".to_string(),
        location: chars.location(),
    })
}

fn take_char_from_hex_digits(
    chars: &mut State<'_>,
    max_digits: usize,
) -> Result<char, TokenizerError> {
    let mut result = 0u32;
    for _ in 0..max_digits {
        let next_char = chars.next().ok_or_else(|| TokenizerError {
            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
                .to_string(),
            location: chars.location(),
        })?;
        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
            message: format!("Invalid hex digit in escaped unicode string: {next_char}"),
            location: chars.location(),
        })?;
        result = result * 16 + digit;
    }
    char::from_u32(result).ok_or_else(|| TokenizerError {
        message: format!("Invalid unicode character: {result:x}"),
        location: chars.location(),
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dialect::{
        BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
    };
    use crate::test_utils::all_dialects_where;
    use core::fmt::Debug;

    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_clickhouse_double_equal() {
        let sql = String::from("SELECT foo=='1'");
        let dialect = ClickHouseDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "foo".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::DoubleEq,
            Token::SingleQuotedString("1".to_string()),
        ];

        compare(expected, tokens);
    }

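    // Without underscore support the generic dialect splits `10_000` into a
    // number and a trailing identifier; dialects that support numeric literal
    // underscores keep the whole literal as a single number token.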
    #[test]
    fn tokenize_numeric_literal_underscore() {
        let dialect = GenericDialect {};
        let sql = String::from("SELECT 10_000");
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("10".to_string(), false),
            Token::make_word("_000", None),
        ];
        compare(expected, tokens);

        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
            "SELECT 10_000, _10_000, 10_00_, 10___0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_000".to_string(), false),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::make_word("_10_000", None),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_00".to_string(), false),
                Token::make_word("_", None),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10".to_string(), false),
                Token::make_word("___0", None),
            ],
        );
    }

    #[test]
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }

    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

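    // A tagged dollar-quoted string only terminates at a matching `$tag$`, so
    // `$` characters and differently-tagged quotes may appear inside the body.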
    #[test]
    fn tokenize_dollar_quoted_string_tagged() {
        let test_cases = vec![
            (
                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
                        tag: Some("tag".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$x$ab$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "x$ab".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("0$abc$$abc$1"),
                vec![
                    Token::Number("0".into(), false),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    }),
                    Token::Number("1".into(), false),
                ]
            ),
            (
                String::from("$function$abc$q$data$q$$function$"),
                vec![
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "abc$q$data$q$".into(),
                        tag: Some("function".into()),
                    }),
                ]
            ),
        ];

        let dialect = GenericDialect {};
        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated() {
        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 91
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
        let sql = String::from("SELECT $abc$abc$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 17
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_placeholder() {
        let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
        let dialect = SQLiteDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$ABC$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC".into()),
            ]
        );
    }

    #[test]
    fn tokenize_nested_dollar_quoted_strings() {
        let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "dollar $nested$ string".into(),
                tag: Some("tag".into()),
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_empty() {
        let sql = String::from("SELECT $$$$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged() {
        let sql =
            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_unterminated() {
        let sql = String::from(
            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
        );
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted string".into(),
                location: Location {
                    line: 1,
                    column: 86
                }
            })
        );
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let test_cases = vec![
            (
                String::from("0--this is a comment\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                String::from("0--this is a comment\r1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r1".to_string(),
                    }),
                ],
            ),
            (
                String::from("0--this is a comment\r\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
        ];

        let dialect = GenericDialect {};

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_comment_postgres() {
        let sql = String::from("1--\r0");

        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "\r".to_string(),
            }),
            Token::Number("0".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

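    // Dialects that support nested block comments must track `/*` ... `*/`
    // depth; the comment only ends once every opening marker has been matched.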
    #[test]
    fn tokenize_nested_multiline_comment() {
        let dialect = GenericDialect {};
        let test_cases = vec![
            (
                "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::MultiLineComment(
                        "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
                    )),
                    Token::Whitespace(Whitespace::Space),
                    Token::Div,
                    Token::Word(Word {
                        value: "comment".to_string(),
                        quote_style: None,
                        keyword: Keyword::COMMENT,
                    }),
                    Token::Mul,
                    Token::Div,
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::MultiLineComment(
                        "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
                    )),
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                "SELECT 1/* a /* b */ c */0",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Number("1".to_string(), false),
                    Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
                    Token::Number("0".to_string(), false),
                ],
            ),
        ];

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_nested_multiline_comment_empty() {
        let sql = "select 1/*/**/*/0";

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("select"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
            Token::Number("0".to_string(), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_nested_comments_if_not_supported() {
        let dialect = SQLiteDialect {};
        let sql = "SELECT 1/*/* nested comment */*/0";
        let tokens = Tokenizer::new(&dialect, sql).tokenize();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "/* nested comment ".to_string(),
            )),
            Token::Mul,
            Token::Div,
            Token::Number("0".to_string(), false),
        ];

        compare(expected, tokens.unwrap());
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                location: Location { line: 1, column: 1 },
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_like_match() {
        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_snowflake_div() {
        let sql = r#"field/1000"#;
        let dialect = SnowflakeDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word(r#"field"#, None),
            Token::Div,
            Token::Number("1000".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

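    // Token spans are (line, column) pairs, 1-based, with the end location
    // pointing one column past the token's final character.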
    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (1, 7).into(),
                (1, 8).into(),
            ),
            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Newline),
                (1, 10).into(),
                (2, 1).into(),
            ),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (2, 1).into(),
                (2, 2).into(),
            ),
            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
        ];
        compare(expected, tokens);
    }

    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        assert_eq!(expected, actual);
    }

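    // Wraps `s` in single quotes and runs the tokenizer's unescape routine on
    // it; `expected == None` means the input is rejected as invalid.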
    fn check_unescape(s: &str, expected: Option<&str>) {
        let s = format!("'{s}'");
        let mut state = State {
            peekable: s.chars().peekable(),
            line: 0,
            col: 0,
        };

        assert_eq!(
            unescape_single_quoted_string(&mut state),
            expected.map(|s| s.to_string())
        );
    }

    #[test]
    fn test_unescape() {
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None);
        check_unescape(r"\u0000", None);
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
        check_unescape(r"\U00110000", None);
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}"));
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x"));
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}"));
        check_unescape(r"\1232", Some("\u{0053}2"));
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None);
        check_unescape(r"\080", None);

        check_unescape(r"\9", Some("9"));
        check_unescape(r"''", Some("'"));
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }

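    // Exercises `supports_numeric_prefix` both through a custom dialect and
    // through the built-in Hive and MySQL dialects, which allow identifiers
    // that begin with digits.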
    #[test]
    fn tokenize_numeric_prefix_trait() {
        #[derive(Debug)]
        struct NumericPrefixDialect;

        impl Dialect for NumericPrefixDialect {
            fn is_identifier_start(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '$'
            }

            fn is_identifier_part(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '_'
                    || ch == '$'
                    || ch == '{'
                    || ch == '}'
            }

            fn supports_numeric_prefix(&self) -> bool {
                true
            }
        }

        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
        tokenize_numeric_prefix_inner(&HiveDialect {});
        tokenize_numeric_prefix_inner(&MySqlDialect {});
    }

    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
        let sql = r#"SELECT * FROM 1"#;
        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];
        compare(expected, tokens);
    }

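    // Each case lists the raw SQL, the literal kept verbatim when
    // `with_unescape(false)`, and the resolved text when `with_unescape(true)`.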
    #[test]
    fn tokenize_quoted_string_escape() {
        let dialect = SnowflakeDialect {};
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
            (r#"'\\'"#, r#"\\"#, r#"\"#),
            (
                r#"'\0\a\b\f\n\r\t\Z'"#,
                r#"\0\a\b\f\n\r\t\Z"#,
                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
            ),
            (r#"'\"'"#, r#"\""#, "\""),
            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
            (r#"'\q'"#, r#"\q"#, r#"q"#),
            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
        ] {
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected.to_string())];
            compare(expected, tokens);

            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }

        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
            let dialect = GenericDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }

        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
            let dialect = MySqlDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }
    }

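    // `check` runs the same matrix for both quote characters: `q` is the
    // quote under test and `r` is the other quote, which needs no escaping
    // inside the literal.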
    #[test]
    fn tokenize_triple_quoted_string() {
        fn check<F>(
            q: char,
            r: char,
            quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            for (sql, expected, expected_unescaped) in [
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let dialect = SnowflakeDialect {};
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn test_mysql_users_grantees() {
        let dialect = MySqlDialect {};

        let sql = "CREATE USER `root`@`%`";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("CREATE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("USER"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("root", Some('`')),
            Token::AtSign,
            Token::make_word("%", Some('`')),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_string_literal() {
        let dialect = MySqlDialect {};

        let sql = "SELECT @'1'";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::SingleQuotedString("1".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_quoted_column() {
        let dialect = MySqlDialect {};

        let sql = r#"SELECT @"bar" FROM foo"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::DoubleQuotedString("bar".to_string()),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_national_strings_backslash_escape_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\'",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("''\\".to_string()),
                ],
            );
    }

    #[test]
    fn test_national_strings_backslash_escape_supported() {
        all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\''",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("'''".to_string()),
                ],
            );
    }

    #[test]
    fn test_string_escape_constant_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("e", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );

        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("E", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );
    }

    #[test]
    fn test_string_escape_constant_supported() {
        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );
    }

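    // Dialects that require whitespace after `--` treat `--'abc'` as two
    // minus tokens rather than the start of a single-line comment.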
    #[test]
    fn test_whitespace_required_after_single_line_comment() {
        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                    Token::SingleQuotedString("abc".to_string()),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                ],
            );
    }

    #[test]
    fn test_whitespace_not_required_after_single_line_comment() {
        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "".to_string(),
                    }),
                ],
            );
    }

    #[test]
    fn test_tokenize_identifiers_numeric_prefix() {
        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        );
    }

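    // `table._col` tokenizes as word-period-word, while a bare `._123` or
    // `._abc` after `SELECT` is expected to fail tokenization.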
    #[test]
    fn tokenize_period_underscore() {
        let sql = String::from("SELECT table._col");
        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "table".to_string(),
                quote_style: None,
                keyword: Keyword::TABLE,
            }),
            Token::Period,
            Token::Word(Word {
                value: "_col".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
        ];

        compare(expected, tokens);

        let sql = String::from("SELECT ._123");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }

        let sql = String::from("SELECT ._abc");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }
    }
}