#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::iter::Peekable;
use core::num::NonZeroU8;
use core::str::Chars;
use core::{cmp, fmt};

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};

use crate::dialect::Dialect;
use crate::dialect::{
    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
    SnowflakeDialect,
};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use crate::{ast::DollarQuotedString, dialect::HiveDialect};

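/// A token produced by the SQL [`Tokenizer`].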
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    EOF,
    Word(Word),
    Number(String, bool),
    Char(char),
    SingleQuotedString(String),
    DoubleQuotedString(String),
    TripleSingleQuotedString(String),
    TripleDoubleQuotedString(String),
    DollarQuotedString(DollarQuotedString),
    SingleQuotedByteStringLiteral(String),
    DoubleQuotedByteStringLiteral(String),
    TripleSingleQuotedByteStringLiteral(String),
    TripleDoubleQuotedByteStringLiteral(String),
    SingleQuotedRawStringLiteral(String),
    DoubleQuotedRawStringLiteral(String),
    TripleSingleQuotedRawStringLiteral(String),
    TripleDoubleQuotedRawStringLiteral(String),
    NationalStringLiteral(String),
    EscapedStringLiteral(String),
    UnicodeStringLiteral(String),
    HexStringLiteral(String),
    Comma,
    Whitespace(Whitespace),
    DoubleEq,
    Eq,
    Neq,
    Lt,
    Gt,
    LtEq,
    GtEq,
    Spaceship,
    Plus,
    Minus,
    Mul,
    Div,
    DuckIntDiv,
    Mod,
    StringConcat,
    LParen,
    RParen,
    Period,
    Colon,
    DoubleColon,
    Assignment,
    SemiColon,
    Backslash,
    LBracket,
    RBracket,
    Ampersand,
    Pipe,
    Caret,
    LBrace,
    RBrace,
    RArrow,
    Sharp,
    DoubleSharp,
    Tilde,
    TildeAsterisk,
    ExclamationMarkTilde,
    ExclamationMarkTildeAsterisk,
    DoubleTilde,
    DoubleTildeAsterisk,
    ExclamationMarkDoubleTilde,
    ExclamationMarkDoubleTildeAsterisk,
    ShiftLeft,
    ShiftRight,
    Overlap,
    ExclamationMark,
    DoubleExclamationMark,
    AtSign,
    CaretAt,
    PGSquareRoot,
    PGCubeRoot,
    Placeholder(String),
    Arrow,
    LongArrow,
    HashArrow,
    AtDashAt,
    QuestionMarkDash,
    AmpersandLeftAngleBracket,
    AmpersandRightAngleBracket,
    AmpersandLeftAngleBracketVerticalBar,
    VerticalBarAmpersandRightAngleBracket,
    TwoWayArrow,
    LeftAngleBracketCaret,
    RightAngleBracketCaret,
    QuestionMarkSharp,
    QuestionMarkDashVerticalBar,
    QuestionMarkDoubleVerticalBar,
    TildeEqual,
    ShiftLeftVerticalBar,
    VerticalBarShiftRight,
    VerticalBarRightAngleBracket,
    HashLongArrow,
    AtArrow,
    ArrowAt,
    HashMinus,
    AtQuestion,
    AtAt,
    Question,
    QuestionAnd,
    QuestionPipe,
    CustomBinaryOperator(String),
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::Assignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::DoubleSharp => f.write_str("##"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::AtDashAt => f.write_str("@-@"),
            Token::QuestionMarkDash => f.write_str("?-"),
            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
            Token::AmpersandRightAngleBracket => f.write_str("&>"),
            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
            Token::VerticalBarRightAngleBracket => f.write_str("|>"),
            Token::TwoWayArrow => f.write_str("<->"),
            Token::LeftAngleBracketCaret => f.write_str("<^"),
            Token::RightAngleBracketCaret => f.write_str(">^"),
            Token::QuestionMarkSharp => f.write_str("?#"),
            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
            Token::TildeEqual => f.write_str("~="),
            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
            Token::VerticalBarShiftRight => f.write_str("|>>"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
            Token::Question => write!(f, "?"),
            Token::QuestionAnd => write!(f, "?&"),
            Token::QuestionPipe => write!(f, "?|"),
            Token::CustomBinaryOperator(s) => f.write_str(s),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

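    /// Makes a [`Token::Word`] from `word` and an optional `quote_style`.
    ///
    /// Only unquoted words are matched against the keyword list; a quoted
    /// word always gets [`Keyword::NoKeyword`]. An illustrative sketch of
    /// the expected behavior (not from the original docs):
    ///
    /// ```text
    /// Token::make_word("select", None)      // keyword: Keyword::SELECT
    /// Token::make_word("select", Some('"')) // keyword: Keyword::NoKeyword
    /// ```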
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The word's value, without the enclosing quotes and with escape
    /// sequences (if any) processed
    pub value: String,
    /// The quote character, if the identifier was quoted
    pub quote_style: Option<char>,
    /// The matched keyword, or [`Keyword::NoKeyword`] if the word was quoted
    /// or did not match any known keyword
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
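    /// Returns the closing quote character matching an opening quote:
    /// `[` closes with `]`, while `"` and `` ` `` close with themselves.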
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"',
            '[' => ']',
            '`' => '`',
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}

#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1
    pub line: u64,
    /// Line column, starting from 1
    pub column: u64,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        if self.line == 0 {
            return Ok(());
        }
        write!(f, " at Line: {}, Column: {}", self.line, self.column)
    }
}

impl fmt::Debug for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Location({},{})", self.line, self.column)
    }
}

impl Location {
    /// Return an "empty" / unknown location
    pub fn empty() -> Self {
        Self { line: 0, column: 0 }
    }

    /// Create a new `Location` for a given line and column
    pub fn new(line: u64, column: u64) -> Self {
        Self { line, column }
    }

    /// Create a new location for a given line and column.
    /// Alias for [`Self::new`]
    pub fn of(line: u64, column: u64) -> Self {
        Self::new(line, column)
    }

    /// Combine self and `end` into a new `Span`
    pub fn span_to(self, end: Self) -> Span {
        Span { start: self, end }
    }
}

impl From<(u64, u64)> for Location {
    fn from((line, column): (u64, u64)) -> Self {
        Self { line, column }
    }
}

#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    pub start: Location,
    pub end: Location,
}

impl fmt::Debug for Span {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Span({:?}..{:?})", self.start, self.end)
    }
}

impl Span {
    // An empty span (0, 0) -> (0, 0). A constant is needed (rather than the
    // `empty()` function) so it can be used in `match` patterns.
    const EMPTY: Span = Self::empty();

    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    /// Returns an empty span `(0, 0) -> (0, 0)`, representing e.g. elements
    /// added during parsing that have no true location in the source.
    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }

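    /// Returns the smallest [`Span`] that contains both `self` and `other`.
    /// If either span is empty, the other one is returned unchanged. An
    /// illustrative sketch:
    ///
    /// ```text
    /// // (1,1)..(1,5) union (2,3)..(2,9)  =>  (1,1)..(2,9)
    /// ```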
    pub fn union(&self, other: &Span) -> Span {
        match (self, other) {
            (&Span::EMPTY, _) => *other,
            (_, &Span::EMPTY) => *self,
            _ => Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            },
        }
    }

    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        match other {
            Some(other) => self.union(other),
            None => *self,
        }
    }

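    /// Returns the smallest [`Span`] containing every span yielded by the
    /// iterator, or [`Span::empty()`] if the iterator yields nothing.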
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        iter.into_iter()
            .reduce(|acc, item| acc.union(&item))
            .unwrap_or(Span::empty())
    }
}

#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;

/// A [`Token`] together with the [`Span`] it was found at in the source text
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    pub token: Token,
    pub span: Span,
}

impl TokenWithSpan {
    /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`]
    pub fn new(token: Token, span: Span) -> Self {
        Self { token, span }
    }

    /// Wrap a [`Token`] with an empty [`Span`]
    pub fn wrap(token: Token) -> Self {
        Self::new(token, Span::empty())
    }

    /// Wrap a [`Token`] with a [`Span`] from `start` to `end`
    pub fn at(token: Token, start: Location, end: Location) -> Self {
        Self::new(token, Span::new(start, end))
    }

    /// Return an EOF token with an empty [`Span`]
    pub fn new_eof() -> Self {
        Self::wrap(Token::EOF)
    }
}

impl PartialEq<Token> for TokenWithSpan {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithSpan> for Token {
    fn eq(&self, other: &TokenWithSpan) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithSpan {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

/// Tokenizer error
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}{}", self.message, self.location)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}

impl State<'_> {
    /// return the next character and advance the stream
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    /// return the next character but do not advance the stream
    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}

/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// e.g. `'abc'`, delimited by a single quote character
    One,
    /// e.g. `'''abc'''`, delimited by several quote characters
    Many(NonZeroU8),
}

/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
    /// The character used to quote the string
    quote_style: char,
    /// How many quote characters delimit the string
    num_quote_chars: NumStringQuoteChars,
    /// How many opening quote characters remain to be consumed before the
    /// string body starts (some may already have been consumed by the caller)
    num_opening_quotes_to_consume: u8,
    /// Whether backslash escape sequences are recognized inside the string
    backslash_escape: bool,
}

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    /// If true (the default), the tokenizer unescapes literal values
    /// (see [`Tokenizer::with_unescape`])
    unescape: bool,
}

impl<'a> Tokenizer<'a> {
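    /// Creates a new SQL tokenizer for the given dialect and statement.
    ///
    /// A minimal usage sketch (module paths assumed from this crate's
    /// public API):
    ///
    /// ```text
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 1").tokenize().unwrap();
    /// assert_eq!(tokens[0], Token::make_keyword("SELECT"));
    /// ```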
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }

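    /// Sets whether the tokenizer unescapes quoted values (true by default).
    /// When disabled, escape sequences and doubled quotes are preserved
    /// verbatim in the token value. An illustrative sketch:
    ///
    /// ```text
    /// // unescape = true (default): 'a''b'  =>  SingleQuotedString("a'b")
    /// // unescape = false:          'a''b'  =>  SingleQuotedString("a''b")
    /// ```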
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }

    /// Tokenize the statement and produce a vector of tokens
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;
        Ok(twl.into_iter().map(|t| t.token).collect())
    }

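    /// Tokenizes the statement and produces a vector of tokens with their
    /// source locations. Each token carries the [`Span`] it occupies, with
    /// 1-based line and column numbers.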
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
        let mut tokens: Vec<TokenWithSpan> = vec![];
        self.tokenize_with_location_into_buf(&mut tokens)
            .map(|_| tokens)
    }

    /// Tokenize the statement and produce tokens with locations, appending
    /// into the provided buffer instead of allocating a new one.
    pub fn tokenize_with_location_into_buf(
        &mut self,
        buf: &mut Vec<TokenWithSpan>,
    ) -> Result<(), TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let span = location.span_to(state.location());

            buf.push(TokenWithSpan { token, span });

            location = state.location();
        }
        Ok(())
    }

    // Tokenize the identifier or keyword in `ch`
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }

    /// Get the next token or return None
    fn next_token(
        &self,
        chars: &mut State,
        prev_token: Option<&Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Treat \r\n as a single newline token
                    chars.next(); // consume the '\r'
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
                {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '\'',
                                        false,
                                        Token::SingleQuotedByteStringLiteral,
                                        Token::TripleSingleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '"',
                                        false,
                                        Token::DoubleQuotedByteStringLiteral,
                                        Token::TripleDoubleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "b" or "B"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                false,
                                Token::SingleQuotedRawStringLiteral,
                                Token::TripleSingleQuotedRawStringLiteral,
                            ),
                        Some('\"') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                false,
                                Token::DoubleQuotedRawStringLiteral,
                                Token::TripleDoubleQuotedRawStringLiteral,
                            ),
                        _ => {
                            // regular identifier starting with "r" or "R"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a national character string literal
                            let backslash_escape =
                                self.dialect.supports_string_literal_backslash_escape();
                            let s =
                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "N"
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
                    let starting_loc = chars.location();
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // E'...' - a string literal with escapes
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "e" or "E"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
                    chars.next(); // consume, to check the next char
                    if chars.peek() == Some(&'&') {
                        // We cannot advance the main iterator here, as the '&' must
                        // only be consumed if a quote follows it.
                        let mut chars_clone = chars.peekable.clone();
                        chars_clone.next(); // consume the '&' in the clone
                        if chars_clone.peek() == Some(&'\'') {
                            chars.next(); // consume the '&' in the original iterator
                            let s = unescape_unicode_single_quoted_string(chars)?;
                            return Ok(Some(Token::UnicodeStringLiteral(s)));
                        }
                    }
                    let s = self.tokenize_word(x, chars);
                    Ok(Some(Token::make_word(&s, None)))
                }
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a hex string literal
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "x" or "X"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                '\'' => {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::SingleQuotedString,
                                Token::TripleSingleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '\'',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::DoubleQuotedString,
                                Token::TripleDoubleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '"',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if self.dialect.is_delimited_identifier_start(ch) => {
                    let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                    Ok(Some(Token::make_word(&word, Some(quote_start))))
                }
                // potentially nested delimited (quoted) identifier
                quote_start
                    if self
                        .dialect
                        .is_nested_delimited_identifier_start(quote_start)
                        && self
                            .dialect
                            .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                            .is_some() =>
                {
                    let Some((quote_start, nested_quote_start)) = self
                        .dialect
                        .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                    else {
                        return self.tokenizer_error(
                            chars.location(),
                            format!("Expected nested delimiter '{quote_start}' before EOF."),
                        );
                    };

                    let Some(nested_quote_start) = nested_quote_start else {
                        let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                        return Ok(Some(Token::make_word(&word, Some(quote_start))));
                    };

                    let mut word = vec![];
                    let quote_end = Word::matching_end_quote(quote_start);
                    let nested_quote_end = Word::matching_end_quote(nested_quote_start);
                    let error_loc = chars.location();

                    chars.next(); // skip the first delimiter
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&nested_quote_start) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
                        );
                    }
                    word.push(nested_quote_start.into());
                    word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
                    word.push(nested_quote_end.into());
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&quote_end) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        );
                    }
                    chars.next(); // skip the close delimiter
                    Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
                }
                // numbers and period
                '0'..='9' | '.' => {
                    // Special case: if `._` is encountered after a word, the word is a
                    // table name and `_` starts the column name.
                    if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
                        if let Some(Token::Word(_)) = prev_token {
                            chars.next();
                            return Ok(Some(Token::Period));
                        }

                        return self.tokenizer_error(
                            chars.location(),
                            "Unexpected character '_'".to_string(),
                        );
                    }

                    // Some dialects support underscores as numeric separators; each
                    // underscore must be followed by another digit.
                    let is_number_separator = |ch: char, next_char: Option<char>| {
                        self.dialect.supports_numeric_literal_underscores()
                            && ch == '_'
                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
                    };

                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // match binary literal that starts with 0x
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
                        });
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }

                    // If the dialect supports identifiers with a numeric prefix and the
                    // previous token was a Word, a lone '.' is a compound-identifier
                    // separator rather than part of a decimal number.
                    if s == "." && self.dialect.supports_numeric_prefix() {
                        if let Some(Token::Word(_)) = prev_token {
                            return Ok(Some(Token::Period));
                        }
                    }

                    s += &peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // No number -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // Parse exponent as number
                    let mut exponent_part = String::new();
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        // Optional sign
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // Definitely an exponent: bring the original iterator
                            // up to speed and use it.
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            // Not an exponent: discard the work done on the clone.
                            _ => (),
                        }
                    }

                    // If the dialect supports identifiers with a numeric prefix,
                    // check whether this value is in fact an identifier and must
                    // be tokenized as a word.
                    if self.dialect.supports_numeric_prefix() {
                        if exponent_part.is_empty() {
                            // Not a number with an exponent; it may be an
                            // identifier starting with digits.
                            let word =
                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                            if !word.is_empty() {
                                s += word.as_str();
                                return Ok(Some(Token::make_word(s.as_str(), None)));
                            }
                        } else if prev_token == Some(&Token::Period) {
                            // A preceding period means this value is part of a
                            // compound identifier, not a number.
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }

                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'
                    match chars.peek() {
                        Some('-') => {
                            let mut is_comment = true;
                            if self.dialect.requires_single_line_comment_whitespace() {
                                is_comment = Some(' ') == chars.peekable.clone().nth(1);
                            }

                            if is_comment {
                                chars.next(); // consume the second '-'
                                let comment = self.tokenize_single_line_comment(chars);
                                return Ok(Some(Token::Whitespace(
                                    Whitespace::SingleLineComment {
                                        prefix: "--".to_owned(),
                                        comment,
                                    },
                                )));
                            }

                            self.start_binop(chars, "-", Token::Minus)
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
                                _ => self.start_binop(chars, "->", Token::Arrow),
                            }
                        }
                        // a regular '-' operator
                        _ => self.start_binop(chars, "-", Token::Minus),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => {
                    chars.next(); // consume the '%'
                    match chars.peek() {
                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "%", Token::Mod),
                    }
                }
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => {
                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
                                }
                                _ => self.start_binop(chars, "||", Token::StringConcat),
                            }
                        }
                        Some('&') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '&'
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|&>",
                                    Token::VerticalBarAmpersandRightAngleBracket,
                                ),
                                _ => self.start_binop_opt(chars, "|&", None),
                            }
                        }
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '>'
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|>>",
                                    Token::VerticalBarShiftRight,
                                ),
                                _ => self.start_binop_opt(chars, "|>", None),
                            }
                        }
                        Some('>') if self.dialect.supports_pipe_operator() => {
                            self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
                        }
                        // a regular '|' operator
                        _ => self.start_binop(chars, "|", Token::Pipe),
                    }
                }
                '=' => {
                    chars.next(); // consume the '='
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume the '!'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                Some('~') => {
                                    chars.next();
                                    match chars.peek() {
                                        Some('*') => self.consume_and_return(
                                            chars,
                                            Token::ExclamationMarkDoubleTildeAsterisk,
                                        ),
                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
                                    }
                                }
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
                                _ => self.start_binop(chars, "<=", Token::LtEq),
                            }
                        }
                        Some('|') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
                        }
                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('|') => self.consume_for_binop(
                                    chars,
                                    "<<|",
                                    Token::ShiftLeftVerticalBar,
                                ),
                                _ => self.start_binop(chars, "<<", Token::ShiftLeft),
                            }
                        }
                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
                                }
                                _ => self.start_binop_opt(chars, "<-", None),
                            }
                        }
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
                        }
                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
                        _ => self.start_binop(chars, "<", Token::Lt),
                    }
                }
                '>' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
                        }
                        _ => self.start_binop(chars, ">", Token::Gt),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::Assignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => {
                    chars.next(); // consume the '&'
                    match chars.peek() {
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
                        }
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::AmpersandLeftAngleBracketVerticalBar,
                                ),
                                _ => {
                                    self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
                                }
                            }
                        }
                        Some('&') => {
                            chars.next(); // consume the second '&'
                            self.start_binop(chars, "&&", Token::Overlap)
                        }
                        _ => self.start_binop(chars, "&", Token::Ampersand),
                    }
                }
                '^' => {
                    chars.next(); // consume the '^'
                    match chars.peek() {
                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
                        _ => Ok(Some(Token::Caret)),
                    }
                }
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
                {
                    chars.next(); // consume the '#', starting a single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
                        Some('=') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "~=", Token::TildeEqual)
                        }
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => {
                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
                                }
                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
                            }
                        }
                        _ => self.start_binop(chars, "~", Token::Tilde),
                    }
                }
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
                                }
                                _ => self.start_binop(chars, "#>", Token::HashArrow),
                            }
                        }
                        Some(' ') => Ok(Some(Token::Sharp)),
                        Some('#') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "##", Token::DoubleSharp)
                        }
                        Some(sch) if self.dialect.is_identifier_start('#') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "#", Token::Sharp),
                    }
                }
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('@') if self.dialect.supports_geometric_types() => {
                            self.consume_and_return(chars, Token::AtAt)
                        }
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            match chars.peek() {
                                Some('@') => self.consume_and_return(chars, Token::AtDashAt),
                                _ => self.start_binop_opt(chars, "@-", None),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => {
                            chars.next();
                            match chars.peek() {
                                Some(' ') => Ok(Some(Token::AtAt)),
                                Some(tch) if self.dialect.is_identifier_start('@') => {
                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
                                }
                                _ => Ok(Some(Token::AtAt)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::AtSign)),
                        // A quote right after '@' is not part of an identifier:
                        // emit a bare AtSign and let the quoted value be
                        // tokenized on its own.
                        Some('\'') => Ok(Some(Token::AtSign)),
                        Some('\"') => Ok(Some(Token::AtSign)),
                        Some('`') => Ok(Some(Token::AtSign)),
                        Some(sch) if self.dialect.is_identifier_start('@') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                '?' if self.dialect.supports_geometric_types() => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('|') => {
                            chars.next();
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::QuestionMarkDoubleVerticalBar,
                                ),
                                _ => Ok(Some(Token::QuestionPipe)),
                            }
                        }
                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
                        Some('-') => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('|') => self
                                    .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
                                _ => Ok(Some(Token::QuestionMarkDash)),
                            }
                        }
                        Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
                        _ => self.consume_and_return(chars, Token::Question),
                    }
                }
                '?' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
                }

                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    self.tokenize_identifier_or_keyword([ch], chars)
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                // whitespace check (including unicode chars) should be last as it covers some of the chars above
                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    fn consume_for_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        self.start_binop_opt(chars, prefix, Some(default))
    }

    fn start_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        self.start_binop_opt(chars, prefix, Some(default))
    }

    fn start_binop_opt(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Option<Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut custom = None;
        while let Some(&ch) = chars.peek() {
            if !self.dialect.is_custom_operator_part(ch) {
                break;
            }

            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
            chars.next();
        }
        match (custom, default) {
            (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
            (None, Some(tok)) => Ok(Some(tok)),
            (None, None) => self.tokenizer_error(
                chars.location(),
                format!("Expected a valid binary operator after '{prefix}'"),
            ),
        }
    }

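    /// Tokenizes a value that starts with `$`: either a dollar-quoted string
    /// or a placeholder, depending on the dialect. An illustrative sketch of
    /// the inputs this handles:
    ///
    /// ```text
    /// $$hello$$     =>  DollarQuotedString { value: "hello", tag: None }
    /// $fn$body$fn$  =>  DollarQuotedString { value: "body", tag: Some("fn") }
    /// $1            =>  Placeholder("$1")
    /// ```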
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        chars.next();

        // `$$` starts an untagged dollar-quoted string, unless the dialect
        // treats `$` as a placeholder character.
        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric()
                    || ch == '_'
                    // Allow '$' as a placeholder character if the dialect supports it
                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
            }));

            // A following '$' closes the tag of a dollar-quoted string, unless
            // the dialect treats '$' as a placeholder character.
            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
                chars.next();

                let mut temp = String::new();
                let end_delimiter = format!("${value}$");

                loop {
                    match chars.next() {
                        Some(ch) => {
                            temp.push(ch);

                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }
                        }
                        None => {
                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }

                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted, expected $",
                            );
                        }
                    }
                }
            } else {
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }

    fn tokenizer_error<R>(
        &self,
        loc: Location,
        message: impl Into<String>,
    ) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            location: loc,
        })
    }

    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
        let mut comment = peeking_take_while(chars, |ch| match ch {
            '\n' => false, // Always stop at \n
            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Also stop at \r for Postgres
            _ => true, // Keep consuming for other characters
        });

        if let Some(ch) = chars.next() {
            assert!(ch == '\n' || ch == '\r');
            comment.push(ch);
        }

        comment
    }

    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
        let mut s = first_chars.into();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

    fn tokenize_quoted_identifier(
        &self,
        quote_start: char,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        let error_loc = chars.location();
        chars.next(); // consume the opening quote
        let quote_end = Word::matching_end_quote(quote_start);
        let (s, last_char) = self.parse_quoted_ident(chars, quote_end);

        if last_char == Some(quote_end) {
            Ok(s)
        } else {
            self.tokenizer_error(
                error_loc,
                format!("Expected close delimiter '{quote_end}' before EOF."),
            )
        }
    }

    fn tokenize_escaped_single_quoted_string(
        &self,
        starting_loc: Location,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        if let Some(s) = unescape_single_quoted_string(chars) {
            return Ok(s);
        }

        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
    }

    fn tokenize_single_or_triple_quoted_string<F>(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
        single_quote_token: F,
        triple_quote_token: F,
    ) -> Result<Option<Token>, TokenizerError>
    where
        F: Fn(String) -> Token,
    {
        let error_loc = chars.location();

        let mut num_opening_quotes = 0u8;
        for _ in 0..3 {
            if Some(&quote_style) == chars.peek() {
                chars.next(); // consume the quote
                num_opening_quotes += 1;
            } else {
                break;
            }
        }

        let (token_fn, num_quote_chars) = match num_opening_quotes {
            1 => (single_quote_token, NumStringQuoteChars::One),
            2 => {
                // Exactly two quotes is an empty string, e.g. '' or ""
                return Ok(Some(single_quote_token("".into())));
            }
            3 => {
                let Some(num_quote_chars) = NonZeroU8::new(3) else {
                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
                };
                (
                    triple_quote_token,
                    NumStringQuoteChars::Many(num_quote_chars),
                )
            }
            _ => {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        };

        let settings = TokenizeQuotedStringSettings {
            quote_style,
            num_quote_chars,
            num_opening_quotes_to_consume: 0,
            backslash_escape,
        };

        self.tokenize_quoted_string(chars, settings)
            .map(token_fn)
            .map(Some)
    }

    fn tokenize_single_quoted_string(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
    ) -> Result<String, TokenizerError> {
        self.tokenize_quoted_string(
            chars,
            TokenizeQuotedStringSettings {
                quote_style,
                num_quote_chars: NumStringQuoteChars::One,
                num_opening_quotes_to_consume: 1,
                backslash_escape,
            },
        )
    }

    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume
                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // For a multi-quoted string, drop the previously
                        // accumulated quote characters from the string body.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    // consume the backslash
                    chars.next();

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape
                            || (self.dialect.ignores_wildcard_escapes()
                                && (*next == '%' || *next == '_'))
                        {
                            // In no-escape mode, the query is preserved verbatim,
                            // including the backslash.
                            s.push(ch);
                            s.push(*next);
                            chars.next(); // consume the escaped char
                        } else {
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next(); // consume the escaped char
                        }
                    }
                }
                ch => {
                    chars.next(); // consume
                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }

    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
        let supports_nested_comments = self.dialect.supports_nested_comments();

        loop {
            match chars.next() {
                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
                    chars.next(); // consume the '*'
                    s.push('/');
                    s.push('*');
                    nested += 1;
                }
                Some('*') if matches!(chars.peek(), Some('/')) => {
                    chars.next(); // consume the '/'
                    nested -= 1;
                    if nested == 0 {
                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                    }
                    s.push('*');
                    s.push('/');
                }
                Some(ch) => {
                    s.push(ch);
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    );
                }
            }
        }
    }

    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
        let mut last_char = None;
        let mut s = String::new();
        while let Some(ch) = chars.next() {
            if ch == quote_end {
                if chars.peek() == Some(&quote_end) {
                    chars.next();
                    s.push(ch);
                    if !self.unescape {
                        // In no-escape mode, the given query has to be saved completely
                        s.push(ch);
                    }
                } else {
                    last_char = Some(quote_end);
                    break;
                }
            } else {
                s.push(ch);
            }
        }
        (s, last_char)
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut State,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

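/// Reads characters from `chars` for as long as `predicate` returns true,
/// collecting them into a `String`. The first rejected character is left
/// in the stream. An illustrative sketch:
///
/// ```text
/// // input "abc123" with predicate |ch| ch.is_alphabetic()
/// // returns "abc" and leaves the stream at '1'
/// ```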
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

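/// Same as [`peeking_take_while`], but the predicate also receives a single
/// character of lookahead, which enables rules such as "an underscore in a
/// numeric literal must be followed by another digit".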
fn peeking_next_take_while(
    chars: &mut State,
    mut predicate: impl FnMut(char, Option<char>) -> bool,
) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        let next_char = chars.peekable.clone().nth(1);
        if predicate(ch, next_char) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
    Unescape::new(chars).unescape()
}

struct Unescape<'a: 'b, 'b> {
    chars: &'b mut State<'a>,
}

impl<'a: 'b, 'b> Unescape<'a, 'b> {
    fn new(chars: &'b mut State<'a>) -> Self {
        Self { chars }
    }

    fn unescape(mut self) -> Option<String> {
        let mut unescaped = String::new();

        self.chars.next(); // consume the opening quote

        while let Some(c) = self.chars.next() {
            if c == '\'' {
                // a doubled single quote is an escaped quote
                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                    self.chars.next();
                    unescaped.push('\'');
                    continue;
                }
                return Some(unescaped);
            }

            if c != '\\' {
                unescaped.push(c);
                continue;
            }

            let c = match self.chars.next()? {
                'b' => '\u{0008}',
                'f' => '\u{000C}',
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                'u' => self.unescape_unicode_16()?,
                'U' => self.unescape_unicode_32()?,
                'x' => self.unescape_hex()?,
                c if c.is_digit(8) => self.unescape_octal(c)?,
                c => c,
            };

            unescaped.push(Self::check_null(c)?);
        }

        None
    }

    #[inline]
    fn check_null(c: char) -> Option<char> {
        if c == '\0' {
            None
        } else {
            Some(c)
        }
    }

    #[inline]
    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
        match u32::from_str_radix(s, RADIX) {
            Err(_) => None,
            Ok(n) => {
                let n = n & 0xFF;
                if n <= 127 {
                    char::from_u32(n)
                } else {
                    None
                }
            }
        }
    }

    // Hexadecimal escape: up to 2 hex digits following "\x"
    fn unescape_hex(&mut self) -> Option<char> {
        let mut s = String::new();

        for _ in 0..2 {
            match self.next_hex_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        if s.is_empty() {
            // A bare "\x" with no hex digits is kept as a literal 'x'
            return Some('x');
        }

        Self::byte_to_char::<16>(&s)
    }

    #[inline]
    fn next_hex_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
            _ => None,
        }
    }

    // Octal escape: 1 to 3 octal digits following the backslash
    fn unescape_octal(&mut self, c: char) -> Option<char> {
        let mut s = String::new();

        s.push(c);
        for _ in 0..2 {
            match self.next_octal_digest() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        Self::byte_to_char::<8>(&s)
    }

    #[inline]
    fn next_octal_digest(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_digit(8) => self.chars.next(),
            _ => None,
        }
    }

    // Unicode escape: \u followed by exactly 4 hex digits
    fn unescape_unicode_16(&mut self) -> Option<char> {
        self.unescape_unicode::<4>()
    }

    // Unicode escape: \U followed by exactly 8 hex digits
    fn unescape_unicode_32(&mut self) -> Option<char> {
        self.unescape_unicode::<8>()
    }

    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
        let mut s = String::new();
        for _ in 0..NUM {
            s.push(self.chars.next()?);
        }
        match u32::from_str_radix(&s, 16) {
            Err(_) => None,
            Ok(n) => char::from_u32(n),
        }
    }
}

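/// Unescapes the body of a Unicode string literal (`U&'...'`), handling
/// doubled quotes, `\\`, 4-digit `\xxxx` and 6-digit `\+xxxxxx` escapes.
/// An illustrative sketch (PostgreSQL-style input):
///
/// ```text
/// U&'d\0061t\+000061'  =>  "data"
/// ```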
fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
    let mut unescaped = String::new();
    chars.next(); // consume the opening quote
    while let Some(c) = chars.next() {
        match c {
            '\'' => {
                if chars.peek() == Some(&'\'') {
                    chars.next();
                    unescaped.push('\'');
                } else {
                    return Ok(unescaped);
                }
            }
            '\\' => match chars.peek() {
                Some('\\') => {
                    chars.next();
                    unescaped.push('\\');
                }
                Some('+') => {
                    chars.next();
                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
                }
                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
            },
            _ => {
                unescaped.push(c);
            }
        }
    }
    Err(TokenizerError {
        message: "Unterminated unicode encoded string literal".to_string(),
        location: chars.location(),
    })
}

fn take_char_from_hex_digits(
    chars: &mut State<'_>,
    max_digits: usize,
) -> Result<char, TokenizerError> {
    let mut result = 0u32;
    for _ in 0..max_digits {
        let next_char = chars.next().ok_or_else(|| TokenizerError {
            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
                .to_string(),
            location: chars.location(),
        })?;
        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
            message: format!("Invalid hex digit in escaped unicode string: {next_char}"),
            location: chars.location(),
        })?;
        result = result * 16 + digit;
    }
    char::from_u32(result).ok_or_else(|| TokenizerError {
        message: format!("Invalid unicode character: {result:x}"),
        location: chars.location(),
    })
}

2416#[cfg(test)]
2417mod tests {
2418 use super::*;
2419 use crate::dialect::{
2420 BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
2421 };
2422 use crate::test_utils::{all_dialects_except, all_dialects_where};
2423 use core::fmt::Debug;
2424
    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_clickhouse_double_equal() {
        let sql = String::from("SELECT foo=='1'");
        let dialect = ClickHouseDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "foo".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::DoubleEq,
            Token::SingleQuotedString("1".to_string()),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_numeric_literal_underscore() {
        let dialect = GenericDialect {};
        let sql = String::from("SELECT 10_000");
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("10".to_string(), false),
            Token::make_word("_000", None),
        ];
        compare(expected, tokens);

        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
            "SELECT 10_000, _10_000, 10_00_, 10___0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_000".to_string(), false),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::make_word("_10_000", None),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_00".to_string(), false),
                Token::make_word("_", None),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10".to_string(), false),
                Token::make_word("___0", None),
            ],
        );
    }

    #[test]
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }

    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

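    // PostgreSQL-style dollar quoting: a literal is delimited by matching
    // `$tag$ ... $tag$` markers (or bare `$$ ... $$`), and everything between
    // them, including quotes and other `$` sequences, is kept verbatim. For
    // example, `$tag$it's $$ fine$tag$` yields the value `it's $$ fine`.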
    #[test]
    fn tokenize_dollar_quoted_string_tagged() {
        let test_cases = vec![
            (
                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
                        tag: Some("tag".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$x$ab$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "x$ab".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("0$abc$$abc$1"),
                vec![
                    Token::Number("0".into(), false),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    }),
                    Token::Number("1".into(), false),
                ]
            ),
            (
                String::from("$function$abc$q$data$q$$function$"),
                vec![
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "abc$q$data$q$".into(),
                        tag: Some("function".into()),
                    }),
                ]
            ),
        ];

        let dialect = GenericDialect {};
        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated() {
        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 91
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
        let sql = String::from("SELECT $abc$abc$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 17
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_placeholder() {
        let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
        let dialect = SQLiteDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$ABC$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC".into()),
            ]
        );
    }

    #[test]
    fn tokenize_nested_dollar_quoted_strings() {
        let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "dollar $nested$ string".into(),
                tag: Some("tag".into()),
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_empty() {
        let sql = String::from("SELECT $$$$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged() {
        let sql =
            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_unterminated() {
        let sql = String::from(
            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
        );
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted string".into(),
                location: Location {
                    line: 1,
                    column: 86
                }
            })
        );
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let test_cases = vec![
            (
                String::from("0--this is a comment\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                String::from("0--this is a comment\r1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r1".to_string(),
                    }),
                ],
            ),
            (
                String::from("0--this is a comment\r\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
        ];

        let dialect = GenericDialect {};

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_comment_postgres() {
        let sql = String::from("1--\r0");

        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "\r".to_string(),
            }),
            Token::Number("0".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

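    // Whether `/* ... /* ... */ ... */` nests is dialect-dependent; the cases
    // below run only against dialects whose `supports_nested_comments` flag
    // matches the behavior under test.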
    #[test]
    fn tokenize_nested_multiline_comment() {
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
                )),
                Token::Whitespace(Whitespace::Space),
                Token::Div,
                Token::Word(Word {
                    value: "comment".to_string(),
                    quote_style: None,
                    keyword: Keyword::COMMENT,
                }),
                Token::Mul,
                Token::Div,
                Token::Number("1".to_string(), false),
            ],
        );

        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
                )),
                Token::Number("1".to_string(), false),
            ],
        );

        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "SELECT 1/* a /* b */ c */0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
                Token::Number("0".to_string(), false),
            ],
        );
    }

    #[test]
    fn tokenize_nested_multiline_comment_empty() {
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "select 1/*/**/*/0",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
                Token::Number("0".to_string(), false),
            ],
        );
    }

    #[test]
    fn tokenize_nested_comments_if_not_supported() {
        all_dialects_except(|d| d.supports_nested_comments()).tokenizes_to(
            "SELECT 1/*/* nested comment */*/0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "/* nested comment ".to_string(),
                )),
                Token::Mul,
                Token::Div,
                Token::Number("0".to_string(), false),
            ],
        );
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                location: Location { line: 1, column: 1 },
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_like_match() {
        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_snowflake_div() {
        let sql = r#"field/1000"#;
        let dialect = SnowflakeDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word(r#"field"#, None),
            Token::Div,
            Token::Number("1000".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (1, 7).into(),
                (1, 8).into(),
            ),
            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Newline),
                (1, 10).into(),
                (2, 1).into(),
            ),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (2, 1).into(),
                (2, 2).into(),
            ),
            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
        ];
        compare(expected, tokens);
    }

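    /// Asserts that `actual` matches `expected` exactly, token by token
    /// (whitespace tokens included).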
    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        assert_eq!(expected, actual);
    }

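    /// Wraps `s` in single quotes and feeds it to
    /// `unescape_single_quoted_string`; `expected` is `None` when the escape
    /// sequence should be rejected, and `Some` with the unescaped text otherwise.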
    fn check_unescape(s: &str, expected: Option<&str>) {
        let s = format!("'{s}'");
        let mut state = State {
            peekable: s.chars().peekable(),
            line: 0,
            col: 0,
        };

        assert_eq!(
            unescape_single_quoted_string(&mut state),
            expected.map(|s| s.to_string())
        );
    }

    #[test]
    fn test_unescape() {
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None);
        check_unescape(r"\u0000", None);
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
        check_unescape(r"\U00110000", None);
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}"));
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x"));
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}"));
        check_unescape(r"\1232", Some("\u{0053}2"));
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None);
        check_unescape(r"\080", None);

        check_unescape(r"\9", Some("9"));
        check_unescape(r"''", Some("'"));
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }

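    // Exercises dialects where identifiers may begin with a digit: the ad-hoc
    // dialect below opts into `supports_numeric_prefix` explicitly, and the
    // same checks also run against HiveDialect and MySqlDialect, which report
    // it natively.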
    #[test]
    fn tokenize_numeric_prefix_trait() {
        #[derive(Debug)]
        struct NumericPrefixDialect;

        impl Dialect for NumericPrefixDialect {
            fn is_identifier_start(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '$'
            }

            fn is_identifier_part(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '_'
                    || ch == '$'
                    || ch == '{'
                    || ch == '}'
            }

            fn supports_numeric_prefix(&self) -> bool {
                true
            }
        }

        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
        tokenize_numeric_prefix_inner(&HiveDialect {});
        tokenize_numeric_prefix_inner(&MySqlDialect {});
    }

    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
        let sql = r#"SELECT * FROM 1"#;
        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];
        compare(expected, tokens);
    }

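    // Backslash escapes in single-quoted strings (Snowflake dialect here):
    // with `with_unescape(false)` the raw text between the quotes is kept,
    // while `with_unescape(true)` resolves each escape sequence.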
    #[test]
    fn tokenize_quoted_string_escape() {
        let dialect = SnowflakeDialect {};
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
            (r#"'\\'"#, r#"\\"#, r#"\"#),
            (
                r#"'\0\a\b\f\n\r\t\Z'"#,
                r#"\0\a\b\f\n\r\t\Z"#,
                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
            ),
            (r#"'\"'"#, r#"\""#, "\""),
            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
            (r#"'\q'"#, r#"\q"#, r#"q"#),
            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
        ] {
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected.to_string())];
            compare(expected, tokens);

            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }

        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
            let dialect = GenericDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }

        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
            let dialect = MySqlDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }
    }

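    // Triple-quoted strings (BigQuery dialect): `check` is invoked once per
    // quote character, with `q` as the active quote and `r` as the opposite
    // one, so `'''...'''` and `"""..."""` share a single table of cases.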
    #[test]
    fn tokenize_triple_quoted_string() {
        fn check<F>(
            q: char,
            r: char,
            quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            for (sql, expected, expected_unescaped) in [
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let dialect = SnowflakeDialect {};
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn test_mysql_users_grantees() {
        let dialect = MySqlDialect {};

        let sql = "CREATE USER `root`@`%`";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("CREATE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("USER"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("root", Some('`')),
            Token::AtSign,
            Token::make_word("%", Some('`')),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_string_literal() {
        let dialect = MySqlDialect {};

        let sql = "SELECT @'1'";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::SingleQuotedString("1".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_quoted_column() {
        let dialect = MySqlDialect {};

        let sql = r#"SELECT @"bar" FROM foo"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::DoubleQuotedString("bar".to_string()),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_national_strings_backslash_escape_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\'",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("''\\".to_string()),
                ],
            );
    }

    #[test]
    fn test_national_strings_backslash_escape_supported() {
        all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\''",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("'''".to_string()),
                ],
            );
    }

    #[test]
    fn test_string_escape_constant_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("e", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );

        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("E", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );
    }

    #[test]
    fn test_string_escape_constant_supported() {
        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );
    }

    #[test]
    fn test_whitespace_required_after_single_line_comment() {
        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                    Token::SingleQuotedString("abc".to_string()),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                ],
            );
    }

    #[test]
    fn test_whitespace_not_required_after_single_line_comment() {
        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "".to_string(),
                    }),
                ],
            );
    }

    #[test]
    fn test_tokenize_identifiers_numeric_prefix() {
        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        );
    }

    #[test]
    fn tokenize_period_underscore() {
        let sql = String::from("SELECT table._col");
        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "table".to_string(),
                quote_style: None,
                keyword: Keyword::TABLE,
            }),
            Token::Period,
            Token::Word(Word {
                value: "_col".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
        ];

        compare(expected, tokens);

        let sql = String::from("SELECT ._123");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }

        let sql = String::from("SELECT ._abc");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }
    }
}