1#[cfg(not(feature = "std"))]
25use alloc::{
26 borrow::ToOwned,
27 format,
28 string::{String, ToString},
29 vec,
30 vec::Vec,
31};
32use core::iter::Peekable;
33use core::num::NonZeroU8;
34use core::str::Chars;
35use core::{cmp, fmt};
36
37#[cfg(feature = "serde")]
38use serde::{Deserialize, Serialize};
39
40#[cfg(feature = "visitor")]
41use sqltk_parser_derive::{Visit, VisitMut};
42
43use crate::dialect::Dialect;
44use crate::dialect::{
45 BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
46 SnowflakeDialect,
47};
48use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
49use crate::{ast::DollarQuotedString, dialect::HiveDialect};
50
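/// A lexical token produced by the [`Tokenizer`]: a word (keyword or
/// identifier), a literal, an operator, punctuation, or whitespace.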
51#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
53#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
54#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
55pub enum Token {
56 EOF,
58 Word(Word),
60 Number(String, bool),
62 Char(char),
64 SingleQuotedString(String),
66 DoubleQuotedString(String),
68 TripleSingleQuotedString(String),
71 TripleDoubleQuotedString(String),
74 DollarQuotedString(DollarQuotedString),
76 SingleQuotedByteStringLiteral(String),
79 DoubleQuotedByteStringLiteral(String),
81 TripleSingleQuotedByteStringLiteral(String),
84 TripleDoubleQuotedByteStringLiteral(String),
87 SingleQuotedRawStringLiteral(String),
90 DoubleQuotedRawStringLiteral(String),
93 TripleSingleQuotedRawStringLiteral(String),
96 TripleDoubleQuotedRawStringLiteral(String),
99 NationalStringLiteral(String),
101 EscapedStringLiteral(String),
103 UnicodeStringLiteral(String),
105 HexStringLiteral(String),
107 Comma,
109 Whitespace(Whitespace),
111 DoubleEq,
113 Eq,
115 Neq,
117 Lt,
119 Gt,
121 LtEq,
123 GtEq,
125 Spaceship,
127 Plus,
129 Minus,
131 Mul,
133 Div,
135 DuckIntDiv,
137 Mod,
139 StringConcat,
141 LParen,
143 RParen,
145 Period,
147 Colon,
149 DoubleColon,
151 Assignment,
153 SemiColon,
155 Backslash,
157 LBracket,
159 RBracket,
161 Ampersand,
163 Pipe,
165 Caret,
167 LBrace,
169 RBrace,
171 RArrow,
173 Sharp,
175 DoubleSharp,
177 Tilde,
179 TildeAsterisk,
181 ExclamationMarkTilde,
183 ExclamationMarkTildeAsterisk,
185 DoubleTilde,
187 DoubleTildeAsterisk,
189 ExclamationMarkDoubleTilde,
191 ExclamationMarkDoubleTildeAsterisk,
193 ShiftLeft,
195 ShiftRight,
197 Overlap,
199 ExclamationMark,
201 DoubleExclamationMark,
203 AtSign,
205 CaretAt,
207 PGSquareRoot,
209 PGCubeRoot,
211 Placeholder(String),
213 Arrow,
215 LongArrow,
217 HashArrow,
219 AtDashAt,
221 QuestionMarkDash,
223 AmpersandLeftAngleBracket,
225 AmpersandRightAngleBracket,
227 AmpersandLeftAngleBracketVerticalBar,
229 VerticalBarAmpersandRightAngleBracket,
231 TwoWayArrow,
233 LeftAngleBracketCaret,
235 RightAngleBracketCaret,
237 QuestionMarkSharp,
239 QuestionMarkDashVerticalBar,
241 QuestionMarkDoubleVerticalBar,
243 TildeEqual,
245 ShiftLeftVerticalBar,
247 VerticalBarShiftRight,
249 VerticalBarRightAngleBracket,
251 HashLongArrow,
253 AtArrow,
255 ArrowAt,
257 HashMinus,
260 AtQuestion,
263 AtAt,
267 Question,
270 QuestionAnd,
273 QuestionPipe,
276 CustomBinaryOperator(String),
280}
281
282impl fmt::Display for Token {
283 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
284 match self {
285 Token::EOF => f.write_str("EOF"),
286 Token::Word(ref w) => write!(f, "{w}"),
287 Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
288 Token::Char(ref c) => write!(f, "{c}"),
289 Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
290 Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
291 Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
292 Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
293 Token::DollarQuotedString(ref s) => write!(f, "{s}"),
294 Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
295 Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
296 Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
297 Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
298 Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
299 Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
300 Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
301 Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
302 Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
303 Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
304 Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
305 Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
306 Token::Comma => f.write_str(","),
307 Token::Whitespace(ws) => write!(f, "{ws}"),
308 Token::DoubleEq => f.write_str("=="),
309 Token::Spaceship => f.write_str("<=>"),
310 Token::Eq => f.write_str("="),
311 Token::Neq => f.write_str("<>"),
312 Token::Lt => f.write_str("<"),
313 Token::Gt => f.write_str(">"),
314 Token::LtEq => f.write_str("<="),
315 Token::GtEq => f.write_str(">="),
316 Token::Plus => f.write_str("+"),
317 Token::Minus => f.write_str("-"),
318 Token::Mul => f.write_str("*"),
319 Token::Div => f.write_str("/"),
320 Token::DuckIntDiv => f.write_str("//"),
321 Token::StringConcat => f.write_str("||"),
322 Token::Mod => f.write_str("%"),
323 Token::LParen => f.write_str("("),
324 Token::RParen => f.write_str(")"),
325 Token::Period => f.write_str("."),
326 Token::Colon => f.write_str(":"),
327 Token::DoubleColon => f.write_str("::"),
328 Token::Assignment => f.write_str(":="),
329 Token::SemiColon => f.write_str(";"),
330 Token::Backslash => f.write_str("\\"),
331 Token::LBracket => f.write_str("["),
332 Token::RBracket => f.write_str("]"),
333 Token::Ampersand => f.write_str("&"),
334 Token::Caret => f.write_str("^"),
335 Token::Pipe => f.write_str("|"),
336 Token::LBrace => f.write_str("{"),
337 Token::RBrace => f.write_str("}"),
338 Token::RArrow => f.write_str("=>"),
339 Token::Sharp => f.write_str("#"),
340 Token::DoubleSharp => f.write_str("##"),
341 Token::ExclamationMark => f.write_str("!"),
342 Token::DoubleExclamationMark => f.write_str("!!"),
343 Token::Tilde => f.write_str("~"),
344 Token::TildeAsterisk => f.write_str("~*"),
345 Token::ExclamationMarkTilde => f.write_str("!~"),
346 Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
347 Token::DoubleTilde => f.write_str("~~"),
348 Token::DoubleTildeAsterisk => f.write_str("~~*"),
349 Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
350 Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
351 Token::AtSign => f.write_str("@"),
352 Token::CaretAt => f.write_str("^@"),
353 Token::ShiftLeft => f.write_str("<<"),
354 Token::ShiftRight => f.write_str(">>"),
355 Token::Overlap => f.write_str("&&"),
356 Token::PGSquareRoot => f.write_str("|/"),
357 Token::PGCubeRoot => f.write_str("||/"),
358 Token::AtDashAt => f.write_str("@-@"),
359 Token::QuestionMarkDash => f.write_str("?-"),
360 Token::AmpersandLeftAngleBracket => f.write_str("&<"),
361 Token::AmpersandRightAngleBracket => f.write_str("&>"),
362 Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
363 Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
364 Token::VerticalBarRightAngleBracket => f.write_str("|>"),
365 Token::TwoWayArrow => f.write_str("<->"),
366 Token::LeftAngleBracketCaret => f.write_str("<^"),
367 Token::RightAngleBracketCaret => f.write_str(">^"),
368 Token::QuestionMarkSharp => f.write_str("?#"),
369 Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
370 Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
371 Token::TildeEqual => f.write_str("~="),
372 Token::ShiftLeftVerticalBar => f.write_str("<<|"),
373 Token::VerticalBarShiftRight => f.write_str("|>>"),
374 Token::Placeholder(ref s) => write!(f, "{s}"),
375 Token::Arrow => write!(f, "->"),
376 Token::LongArrow => write!(f, "->>"),
377 Token::HashArrow => write!(f, "#>"),
378 Token::HashLongArrow => write!(f, "#>>"),
379 Token::AtArrow => write!(f, "@>"),
380 Token::ArrowAt => write!(f, "<@"),
381 Token::HashMinus => write!(f, "#-"),
382 Token::AtQuestion => write!(f, "@?"),
383 Token::AtAt => write!(f, "@@"),
384 Token::Question => write!(f, "?"),
385 Token::QuestionAnd => write!(f, "?&"),
386 Token::QuestionPipe => write!(f, "?|"),
387 Token::CustomBinaryOperator(s) => f.write_str(s),
388 }
389 }
390}
391
392impl Token {
393 pub fn make_keyword(keyword: &str) -> Self {
394 Token::make_word(keyword, None)
395 }
396
397 pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
398 let word_uppercase = word.to_uppercase();
399 Token::Word(Word {
400 value: word.to_string(),
401 quote_style,
402 keyword: if quote_style.is_none() {
403 let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
404 keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
405 } else {
406 Keyword::NoKeyword
407 },
408 })
409 }
410}
411
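/// A keyword (like SELECT) or an optionally quoted SQL identifier.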
412#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
414#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
415#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
416pub struct Word {
417 pub value: String,
420 pub quote_style: Option<char>,
424 pub keyword: Keyword,
427}
428
429impl fmt::Display for Word {
430 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
431 match self.quote_style {
432 Some(s) if s == '"' || s == '[' || s == '`' => {
433 write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
434 }
435 None => f.write_str(&self.value),
436 _ => panic!("Unexpected quote_style!"),
437 }
438 }
439}
440
441impl Word {
442 fn matching_end_quote(ch: char) -> char {
443 match ch {
            '"' => '"',
            '[' => ']',
            '`' => '`',
            _ => panic!("unexpected quoting style!"),
448 }
449 }
450}
451
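/// Whitespace and comments preserved as tokens by the tokenizer.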
452#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
453#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
454#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
455pub enum Whitespace {
456 Space,
457 Newline,
458 Tab,
459 SingleLineComment { comment: String, prefix: String },
460 MultiLineComment(String),
461}
462
463impl fmt::Display for Whitespace {
464 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
465 match self {
466 Whitespace::Space => f.write_str(" "),
467 Whitespace::Newline => f.write_str("\n"),
468 Whitespace::Tab => f.write_str("\t"),
469 Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
470 Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
471 }
472 }
473}
474
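/// A 1-based line and column position in the source text. A value of
/// `line: 0, column: 0` represents an unknown ("empty") location.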
475#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
495#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
496#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
497pub struct Location {
498 pub line: u64,
502 pub column: u64,
506}
507
508impl fmt::Display for Location {
509 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
510 if self.line == 0 {
511 return Ok(());
512 }
513 write!(f, " at Line: {}, Column: {}", self.line, self.column)
514 }
515}
516
517impl fmt::Debug for Location {
518 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
519 write!(f, "Location({},{})", self.line, self.column)
520 }
521}
522
523impl Location {
524 pub fn empty() -> Self {
526 Self { line: 0, column: 0 }
527 }
528
529 pub fn new(line: u64, column: u64) -> Self {
531 Self { line, column }
532 }
533
534 pub fn of(line: u64, column: u64) -> Self {
539 Self::new(line, column)
540 }
541
542 pub fn span_to(self, end: Self) -> Span {
544 Span { start: self, end }
545 }
546}
547
548impl From<(u64, u64)> for Location {
549 fn from((line, column): (u64, u64)) -> Self {
550 Self { line, column }
551 }
552}
553
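/// A region of source text bounded by a start and end [`Location`].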
554#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
558#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
559#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
560pub struct Span {
561 pub start: Location,
562 pub end: Location,
563}
564
565impl fmt::Debug for Span {
566 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
567 write!(f, "Span({:?}..{:?})", self.start, self.end)
568 }
569}
570
571impl Span {
572 const EMPTY: Span = Self::empty();
575
576 pub fn new(start: Location, end: Location) -> Span {
578 Span { start, end }
579 }
580
581 pub const fn empty() -> Span {
586 Span {
587 start: Location { line: 0, column: 0 },
588 end: Location { line: 0, column: 0 },
589 }
590 }
591
592 pub fn union(&self, other: &Span) -> Span {
608 match (self, other) {
611 (&Span::EMPTY, _) => *other,
612 (_, &Span::EMPTY) => *self,
613 _ => Span {
614 start: cmp::min(self.start, other.start),
615 end: cmp::max(self.end, other.end),
616 },
617 }
618 }
619
620 pub fn union_opt(&self, other: &Option<Span>) -> Span {
624 match other {
625 Some(other) => self.union(other),
626 None => *self,
627 }
628 }
629
630 pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
648 iter.into_iter()
649 .reduce(|acc, item| acc.union(&item))
650 .unwrap_or(Span::empty())
651 }
652}
653
654#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
656pub type TokenWithLocation = TokenWithSpan;
657
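/// A [`Token`] together with the [`Span`] of source text it was read from.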
658#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
681#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
682#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
683pub struct TokenWithSpan {
684 pub token: Token,
685 pub span: Span,
686}
687
688impl TokenWithSpan {
689 pub fn new(token: Token, span: Span) -> Self {
691 Self { token, span }
692 }
693
694 pub fn wrap(token: Token) -> Self {
696 Self::new(token, Span::empty())
697 }
698
699 pub fn at(token: Token, start: Location, end: Location) -> Self {
701 Self::new(token, Span::new(start, end))
702 }
703
704 pub fn new_eof() -> Self {
706 Self::wrap(Token::EOF)
707 }
708}
709
710impl PartialEq<Token> for TokenWithSpan {
711 fn eq(&self, other: &Token) -> bool {
712 &self.token == other
713 }
714}
715
716impl PartialEq<TokenWithSpan> for Token {
717 fn eq(&self, other: &TokenWithSpan) -> bool {
718 self == &other.token
719 }
720}
721
722impl fmt::Display for TokenWithSpan {
723 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
724 self.token.fmt(f)
725 }
726}
727
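/// An error produced during tokenization, carrying a message and the
/// [`Location`] at which it occurred.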
728#[derive(Debug, PartialEq, Eq)]
730pub struct TokenizerError {
731 pub message: String,
732 pub location: Location,
733}
734
735impl fmt::Display for TokenizerError {
736 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
737 write!(f, "{}{}", self.message, self.location,)
738 }
739}
740
741#[cfg(feature = "std")]
742impl std::error::Error for TokenizerError {}
743
744struct State<'a> {
745 peekable: Peekable<Chars<'a>>,
746 pub line: u64,
747 pub col: u64,
748}
749
750impl State<'_> {
751 pub fn next(&mut self) -> Option<char> {
753 match self.peekable.next() {
754 None => None,
755 Some(s) => {
756 if s == '\n' {
757 self.line += 1;
758 self.col = 1;
759 } else {
760 self.col += 1;
761 }
762 Some(s)
763 }
764 }
765 }
766
767 pub fn peek(&mut self) -> Option<&char> {
769 self.peekable.peek()
770 }
771
772 pub fn location(&self) -> Location {
773 Location {
774 line: self.line,
775 column: self.col,
776 }
777 }
778}
779
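/// How many consecutive quote characters delimit a quoted string literal:
/// one (e.g. `'...'`) or a fixed run of them (e.g. three for `'''...'''`).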
780#[derive(Copy, Clone)]
782enum NumStringQuoteChars {
783 One,
785 Many(NonZeroU8),
787}
788
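/// Settings controlling how a quoted string literal is tokenized: the quote
/// character, how many of them delimit the literal, how many opening quotes
/// are still left to consume, and whether backslash escapes are honored.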
789struct TokenizeQuotedStringSettings {
791 quote_style: char,
793 num_quote_chars: NumStringQuoteChars,
795 num_opening_quotes_to_consume: u8,
801 backslash_escape: bool,
804}
805
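/// SQL Tokenizer: splits a SQL string into a sequence of [`Token`]s
/// according to the rules of the given [`Dialect`].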
806pub struct Tokenizer<'a> {
808 dialect: &'a dyn Dialect,
809 query: &'a str,
810 unescape: bool,
813}
814
815impl<'a> Tokenizer<'a> {
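    /// Create a new SQL tokenizer for the given SQL statement. A minimal
    /// usage sketch (all names below are defined in this crate):
    ///
    /// ```ignore
    /// let dialect = GenericDialect {};
    /// let sql = "SELECT 1";
    /// let tokens = Tokenizer::new(&dialect, sql).tokenize()?;
    /// ```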
816 pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
833 Self {
834 dialect,
835 query,
836 unescape: true,
837 }
838 }
839
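    /// Set whether to unescape string literal contents (the default). When
    /// set to `false`, escape sequences and doubled quotes are returned
    /// verbatim rather than being decoded.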
840 pub fn with_unescape(mut self, unescape: bool) -> Self {
871 self.unescape = unescape;
872 self
873 }
874
875 pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
877 let twl = self.tokenize_with_location()?;
878 Ok(twl.into_iter().map(|t| t.token).collect())
879 }
880
881 pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
883 let mut tokens: Vec<TokenWithSpan> = vec![];
884 self.tokenize_with_location_into_buf(&mut tokens)
885 .map(|_| tokens)
886 }
887
888 pub fn tokenize_with_location_into_buf(
891 &mut self,
892 buf: &mut Vec<TokenWithSpan>,
893 ) -> Result<(), TokenizerError> {
894 let mut state = State {
895 peekable: self.query.chars().peekable(),
896 line: 1,
897 col: 1,
898 };
899
900 let mut location = state.location();
901 while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
902 let span = location.span_to(state.location());
903
904 buf.push(TokenWithSpan { token, span });
905
906 location = state.location();
907 }
908 Ok(())
909 }
910
911 fn tokenize_identifier_or_keyword(
913 &self,
914 ch: impl IntoIterator<Item = char>,
915 chars: &mut State,
916 ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
919 let word = self.tokenize_word(ch, chars);
920
921 if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
923 let mut inner_state = State {
924 peekable: word.chars().peekable(),
925 line: 0,
926 col: 0,
927 };
928 let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
929 let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
930 s += s2.as_str();
931 return Ok(Some(Token::Number(s, false)));
932 }
933
934 Ok(Some(Token::make_word(&word, None)))
935 }
936
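    /// Get the next token, or `None` when the input is exhausted.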
937 fn next_token(
939 &self,
940 chars: &mut State,
941 prev_token: Option<&Token>,
942 ) -> Result<Option<Token>, TokenizerError> {
943 match chars.peek() {
944 Some(&ch) => match ch {
945 ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
946 '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
947 '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
948 '\r' => {
949 chars.next();
951 if let Some('\n') = chars.peek() {
952 chars.next();
953 }
954 Ok(Some(Token::Whitespace(Whitespace::Newline)))
955 }
956 b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
958 {
                    chars.next(); // consume the 'B' or 'b'
                    match chars.peek() {
961 Some('\'') => {
962 if self.dialect.supports_triple_quoted_string() {
963 return self
964 .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
965 chars,
966 '\'',
967 false,
968 Token::SingleQuotedByteStringLiteral,
969 Token::TripleSingleQuotedByteStringLiteral,
970 );
971 }
972 let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
973 Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
974 }
975 Some('\"') => {
976 if self.dialect.supports_triple_quoted_string() {
977 return self
978 .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
979 chars,
980 '"',
981 false,
982 Token::DoubleQuotedByteStringLiteral,
983 Token::TripleDoubleQuotedByteStringLiteral,
984 );
985 }
986 let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
987 Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
988 }
989 _ => {
990 let s = self.tokenize_word(b, chars);
992 Ok(Some(Token::make_word(&s, None)))
993 }
994 }
995 }
996 b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume the 'R' or 'r'
                    match chars.peek() {
1000 Some('\'') => self
1001 .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1002 chars,
1003 '\'',
1004 false,
1005 Token::SingleQuotedRawStringLiteral,
1006 Token::TripleSingleQuotedRawStringLiteral,
1007 ),
1008 Some('\"') => self
1009 .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1010 chars,
1011 '"',
1012 false,
1013 Token::DoubleQuotedRawStringLiteral,
1014 Token::TripleDoubleQuotedRawStringLiteral,
1015 ),
1016 _ => {
1017 let s = self.tokenize_word(b, chars);
1019 Ok(Some(Token::make_word(&s, None)))
1020 }
1021 }
1022 }
1023 n @ 'N' | n @ 'n' => {
                    chars.next(); // consume the 'N' or 'n'
                    match chars.peek() {
1027 Some('\'') => {
1028 let backslash_escape =
1030 self.dialect.supports_string_literal_backslash_escape();
1031 let s =
1032 self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
1033 Ok(Some(Token::NationalStringLiteral(s)))
1034 }
1035 _ => {
1036 let s = self.tokenize_word(n, chars);
1038 Ok(Some(Token::make_word(&s, None)))
1039 }
1040 }
1041 }
1042 x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
1044 let starting_loc = chars.location();
                    chars.next(); // consume the 'E' or 'e'
                    match chars.peek() {
1047 Some('\'') => {
1048 let s =
1049 self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
1050 Ok(Some(Token::EscapedStringLiteral(s)))
1051 }
1052 _ => {
1053 let s = self.tokenize_word(x, chars);
1055 Ok(Some(Token::make_word(&s, None)))
1056 }
1057 }
1058 }
1059 x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
                    chars.next(); // consume the 'U' or 'u'
                    if chars.peek() == Some(&'&') {
                        // Don't advance the main iterator until we know the '&'
                        // is followed by a quote; inspect a clone instead.
                        let mut chars_clone = chars.peekable.clone();
                        chars_clone.next(); // consume the '&' in the clone
                        if chars_clone.peek() == Some(&'\'') {
                            chars.next(); // consume the '&' in the original iterator
                            let s = unescape_unicode_single_quoted_string(chars)?;
1069 return Ok(Some(Token::UnicodeStringLiteral(s)));
1070 }
1071 }
1072 let s = self.tokenize_word(x, chars);
1074 Ok(Some(Token::make_word(&s, None)))
1075 }
1076 x @ 'x' | x @ 'X' => {
                    chars.next(); // consume the 'X' or 'x'
                    match chars.peek() {
1081 Some('\'') => {
1082 let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
1084 Ok(Some(Token::HexStringLiteral(s)))
1085 }
1086 _ => {
1087 let s = self.tokenize_word(x, chars);
1089 Ok(Some(Token::make_word(&s, None)))
1090 }
1091 }
1092 }
1093 '\'' => {
1095 if self.dialect.supports_triple_quoted_string() {
1096 return self
1097 .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1098 chars,
1099 '\'',
1100 self.dialect.supports_string_literal_backslash_escape(),
1101 Token::SingleQuotedString,
1102 Token::TripleSingleQuotedString,
1103 );
1104 }
1105 let s = self.tokenize_single_quoted_string(
1106 chars,
1107 '\'',
1108 self.dialect.supports_string_literal_backslash_escape(),
1109 )?;
1110
1111 Ok(Some(Token::SingleQuotedString(s)))
1112 }
1113 '\"' if !self.dialect.is_delimited_identifier_start(ch)
1115 && !self.dialect.is_identifier_start(ch) =>
1116 {
1117 if self.dialect.supports_triple_quoted_string() {
1118 return self
1119 .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1120 chars,
1121 '"',
1122 self.dialect.supports_string_literal_backslash_escape(),
1123 Token::DoubleQuotedString,
1124 Token::TripleDoubleQuotedString,
1125 );
1126 }
1127 let s = self.tokenize_single_quoted_string(
1128 chars,
1129 '"',
1130 self.dialect.supports_string_literal_backslash_escape(),
1131 )?;
1132
1133 Ok(Some(Token::DoubleQuotedString(s)))
1134 }
1135 quote_start if self.dialect.is_delimited_identifier_start(ch) => {
1137 let word = self.tokenize_quoted_identifier(quote_start, chars)?;
1138 Ok(Some(Token::make_word(&word, Some(quote_start))))
1139 }
1140 quote_start
1142 if self
1143 .dialect
1144 .is_nested_delimited_identifier_start(quote_start)
1145 && self
1146 .dialect
1147 .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
1148 .is_some() =>
1149 {
1150 let Some((quote_start, nested_quote_start)) = self
1151 .dialect
1152 .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
1153 else {
1154 return self.tokenizer_error(
1155 chars.location(),
1156 format!("Expected nested delimiter '{quote_start}' before EOF."),
1157 );
1158 };
1159
1160 let Some(nested_quote_start) = nested_quote_start else {
1161 let word = self.tokenize_quoted_identifier(quote_start, chars)?;
1162 return Ok(Some(Token::make_word(&word, Some(quote_start))));
1163 };
1164
1165 let mut word = vec![];
1166 let quote_end = Word::matching_end_quote(quote_start);
1167 let nested_quote_end = Word::matching_end_quote(nested_quote_start);
1168 let error_loc = chars.location();
1169
                    chars.next(); // consume the outer opening quote
                    // Skip any whitespace between the outer and nested delimiters
                    peeking_take_while(chars, |ch| ch.is_whitespace());
1172 if chars.peek() != Some(&nested_quote_start) {
1173 return self.tokenizer_error(
1174 error_loc,
1175 format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
1176 );
1177 }
1178 word.push(nested_quote_start.into());
1179 word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
1180 word.push(nested_quote_end.into());
1181 peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&quote_end) {
1183 return self.tokenizer_error(
1184 error_loc,
1185 format!("Expected close delimiter '{quote_end}' before EOF."),
1186 );
1187 }
                    chars.next(); // consume the outer closing quote
                    Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
1191 }
1192 '0'..='9' | '.' => {
1194 let is_number_separator = |ch: char, next_char: Option<char>| {
1197 self.dialect.supports_numeric_literal_underscores()
1198 && ch == '_'
1199 && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
1200 };
1201
1202 let mut s = peeking_next_take_while(chars, |ch, next_ch| {
1203 ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1204 });
1205
1206 if s == "0" && chars.peek() == Some(&'x') {
1208 chars.next();
1209 let s2 = peeking_next_take_while(chars, |ch, next_ch| {
1210 ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
1211 });
1212 return Ok(Some(Token::HexStringLiteral(s2)));
1213 }
1214
1215 if let Some('.') = chars.peek() {
1217 s.push('.');
1218 chars.next();
1219 }
1220
1221 if s == "." && self.dialect.supports_numeric_prefix() {
1227 if let Some(Token::Word(_)) = prev_token {
1228 return Ok(Some(Token::Period));
1229 }
1230 }
1231
1232 s += &peeking_next_take_while(chars, |ch, next_ch| {
1234 ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1235 });
1236
1237 if s == "." {
1239 return Ok(Some(Token::Period));
1240 }
1241
1242 let mut exponent_part = String::new();
1244 if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
1245 let mut char_clone = chars.peekable.clone();
1246 exponent_part.push(char_clone.next().unwrap());
1247
1248 match char_clone.peek() {
1250 Some(&c) if matches!(c, '+' | '-') => {
1251 exponent_part.push(c);
1252 char_clone.next();
1253 }
1254 _ => (),
1255 }
1256
1257 match char_clone.peek() {
1258 Some(&c) if c.is_ascii_digit() => {
1260 for _ in 0..exponent_part.len() {
1261 chars.next();
1262 }
1263 exponent_part +=
1264 &peeking_take_while(chars, |ch| ch.is_ascii_digit());
1265 s += exponent_part.as_str();
1266 }
1267 _ => (),
1269 }
1270 }
1271
1272 if self.dialect.supports_numeric_prefix() {
1276 if exponent_part.is_empty() {
1277 let word =
1280 peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
1281
1282 if !word.is_empty() {
1283 s += word.as_str();
1284 return Ok(Some(Token::make_word(s.as_str(), None)));
1285 }
1286 } else if prev_token == Some(&Token::Period) {
1287 return Ok(Some(Token::make_word(s.as_str(), None)));
1290 }
1291 }
1292
1293 let long = if chars.peek() == Some(&'L') {
1294 chars.next();
1295 true
1296 } else {
1297 false
1298 };
1299 Ok(Some(Token::Number(s, long)))
1300 }
1301 '(' => self.consume_and_return(chars, Token::LParen),
1303 ')' => self.consume_and_return(chars, Token::RParen),
1304 ',' => self.consume_and_return(chars, Token::Comma),
1305 '-' => {
                    chars.next(); // consume the '-'
                    match chars.peek() {
1310 Some('-') => {
1311 let mut is_comment = true;
1312 if self.dialect.requires_single_line_comment_whitespace() {
1313 is_comment = Some(' ') == chars.peekable.clone().nth(1);
1314 }
1315
1316 if is_comment {
                                chars.next(); // consume the second '-'
                                let comment = self.tokenize_single_line_comment(chars);
1319 return Ok(Some(Token::Whitespace(
1320 Whitespace::SingleLineComment {
1321 prefix: "--".to_owned(),
1322 comment,
1323 },
1324 )));
1325 }
1326
1327 self.start_binop(chars, "-", Token::Minus)
1328 }
1329 Some('>') => {
1330 chars.next();
1331 match chars.peek() {
1332 Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
1333 _ => self.start_binop(chars, "->", Token::Arrow),
1334 }
1335 }
1336 _ => self.start_binop(chars, "-", Token::Minus),
1338 }
1339 }
1340 '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
1343 Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
1346 }
1347 Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
1350 Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1351 prefix: "//".to_owned(),
1352 comment,
1353 })))
1354 }
1355 Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
1356 self.consume_and_return(chars, Token::DuckIntDiv)
1357 }
1358 _ => Ok(Some(Token::Div)),
1360 }
1361 }
1362 '+' => self.consume_and_return(chars, Token::Plus),
1363 '*' => self.consume_and_return(chars, Token::Mul),
1364 '%' => {
                    chars.next(); // consume the '%'
                    match chars.peek() {
1367 Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
1368 Some(sch) if self.dialect.is_identifier_start('%') => {
1369 self.tokenize_identifier_or_keyword([ch, *sch], chars)
1370 }
1371 _ => self.start_binop(chars, "%", Token::Mod),
1372 }
1373 }
1374 '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
1377 Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
1378 Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
1381 Some('/') => {
1382 self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
1383 }
1384 _ => self.start_binop(chars, "||", Token::StringConcat),
1385 }
1386 }
1387 Some('&') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '&'
                            match chars.peek() {
1390 Some('>') => self.consume_for_binop(
1391 chars,
1392 "|&>",
1393 Token::VerticalBarAmpersandRightAngleBracket,
1394 ),
1395 _ => self.start_binop_opt(chars, "|&", None),
1396 }
1397 }
1398 Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '>'
                            match chars.peek() {
1401 Some('>') => self.consume_for_binop(
1402 chars,
1403 "|>>",
1404 Token::VerticalBarShiftRight,
1405 ),
1406 _ => self.start_binop_opt(chars, "|>", None),
1407 }
1408 }
1409 Some('>') if self.dialect.supports_pipe_operator() => {
1410 self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
1411 }
1412 _ => self.start_binop(chars, "|", Token::Pipe),
1414 }
1415 }
1416 '=' => {
                    chars.next(); // consume the '='
                    match chars.peek() {
1419 Some('>') => self.consume_and_return(chars, Token::RArrow),
1420 Some('=') => self.consume_and_return(chars, Token::DoubleEq),
1421 _ => Ok(Some(Token::Eq)),
1422 }
1423 }
1424 '!' => {
                    chars.next(); // consume the '!'
                    match chars.peek() {
1427 Some('=') => self.consume_and_return(chars, Token::Neq),
1428 Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
1429 Some('~') => {
1430 chars.next();
1431 match chars.peek() {
1432 Some('*') => self
1433 .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
1434 Some('~') => {
1435 chars.next();
1436 match chars.peek() {
1437 Some('*') => self.consume_and_return(
1438 chars,
1439 Token::ExclamationMarkDoubleTildeAsterisk,
1440 ),
1441 _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
1442 }
1443 }
1444 _ => Ok(Some(Token::ExclamationMarkTilde)),
1445 }
1446 }
1447 _ => Ok(Some(Token::ExclamationMark)),
1448 }
1449 }
1450 '<' => {
                    chars.next(); // consume the '<'
                    match chars.peek() {
1453 Some('=') => {
1454 chars.next();
1455 match chars.peek() {
1456 Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
1457 _ => self.start_binop(chars, "<=", Token::LtEq),
1458 }
1459 }
1460 Some('|') if self.dialect.supports_geometric_types() => {
1461 self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
1462 }
1463 Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
1464 Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the second '<'
                            match chars.peek() {
1467 Some('|') => self.consume_for_binop(
1468 chars,
1469 "<<|",
1470 Token::ShiftLeftVerticalBar,
1471 ),
1472 _ => self.start_binop(chars, "<<", Token::ShiftLeft),
1473 }
1474 }
1475 Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
1476 Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '-'
                            match chars.peek() {
1479 Some('>') => {
1480 self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
1481 }
1482 _ => self.start_binop_opt(chars, "<-", None),
1483 }
1484 }
1485 Some('^') if self.dialect.supports_geometric_types() => {
1486 self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
1487 }
1488 Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
1489 _ => self.start_binop(chars, "<", Token::Lt),
1490 }
1491 }
1492 '>' => {
                    chars.next(); // consume the '>'
                    match chars.peek() {
1495 Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
1496 Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
1497 Some('^') if self.dialect.supports_geometric_types() => {
1498 self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
1499 }
1500 _ => self.start_binop(chars, ">", Token::Gt),
1501 }
1502 }
1503 ':' => {
1504 chars.next();
1505 match chars.peek() {
1506 Some(':') => self.consume_and_return(chars, Token::DoubleColon),
1507 Some('=') => self.consume_and_return(chars, Token::Assignment),
1508 _ => Ok(Some(Token::Colon)),
1509 }
1510 }
1511 ';' => self.consume_and_return(chars, Token::SemiColon),
1512 '\\' => self.consume_and_return(chars, Token::Backslash),
1513 '[' => self.consume_and_return(chars, Token::LBracket),
1514 ']' => self.consume_and_return(chars, Token::RBracket),
1515 '&' => {
                    chars.next(); // consume the '&'
                    match chars.peek() {
1518 Some('>') if self.dialect.supports_geometric_types() => {
1519 chars.next();
1520 self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
1521 }
1522 Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '<'
                            match chars.peek() {
1525 Some('|') => self.consume_and_return(
1526 chars,
1527 Token::AmpersandLeftAngleBracketVerticalBar,
1528 ),
1529 _ => {
1530 self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
1531 }
1532 }
1533 }
1534 Some('&') => {
                            chars.next(); // consume the second '&'
                            self.start_binop(chars, "&&", Token::Overlap)
1537 }
1538 _ => self.start_binop(chars, "&", Token::Ampersand),
1540 }
1541 }
1542 '^' => {
                    chars.next(); // consume the '^'
                    match chars.peek() {
1545 Some('@') => self.consume_and_return(chars, Token::CaretAt),
1546 _ => Ok(Some(Token::Caret)),
1547 }
1548 }
1549 '{' => self.consume_and_return(chars, Token::LBrace),
1550 '}' => self.consume_and_return(chars, Token::RBrace),
1551 '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
1552 {
                    chars.next(); // consume the '#', starting a single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
1555 Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1556 prefix: "#".to_owned(),
1557 comment,
1558 })))
1559 }
1560 '~' => {
                    chars.next(); // consume the '~'
                    match chars.peek() {
1563 Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
1564 Some('=') if self.dialect.supports_geometric_types() => {
1565 self.consume_for_binop(chars, "~=", Token::TildeEqual)
1566 }
1567 Some('~') => {
1568 chars.next();
1569 match chars.peek() {
1570 Some('*') => {
1571 self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
1572 }
1573 _ => self.start_binop(chars, "~~", Token::DoubleTilde),
1574 }
1575 }
1576 _ => self.start_binop(chars, "~", Token::Tilde),
1577 }
1578 }
1579 '#' => {
1580 chars.next();
1581 match chars.peek() {
1582 Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
1583 Some('>') => {
1584 chars.next();
1585 match chars.peek() {
1586 Some('>') => {
1587 self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
1588 }
1589 _ => self.start_binop(chars, "#>", Token::HashArrow),
1590 }
1591 }
1592 Some(' ') => Ok(Some(Token::Sharp)),
1593 Some('#') if self.dialect.supports_geometric_types() => {
1594 self.consume_for_binop(chars, "##", Token::DoubleSharp)
1595 }
1596 Some(sch) if self.dialect.is_identifier_start('#') => {
1597 self.tokenize_identifier_or_keyword([ch, *sch], chars)
1598 }
1599 _ => self.start_binop(chars, "#", Token::Sharp),
1600 }
1601 }
1602 '@' => {
1603 chars.next();
1604 match chars.peek() {
1605 Some('@') if self.dialect.supports_geometric_types() => {
1606 self.consume_and_return(chars, Token::AtAt)
1607 }
1608 Some('-') if self.dialect.supports_geometric_types() => {
1609 chars.next();
1610 match chars.peek() {
1611 Some('@') => self.consume_and_return(chars, Token::AtDashAt),
1612 _ => self.start_binop_opt(chars, "@-", None),
1613 }
1614 }
1615 Some('>') => self.consume_and_return(chars, Token::AtArrow),
1616 Some('?') => self.consume_and_return(chars, Token::AtQuestion),
1617 Some('@') => {
1618 chars.next();
1619 match chars.peek() {
1620 Some(' ') => Ok(Some(Token::AtAt)),
1621 Some(tch) if self.dialect.is_identifier_start('@') => {
1622 self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
1623 }
1624 _ => Ok(Some(Token::AtAt)),
1625 }
1626 }
1627 Some(' ') => Ok(Some(Token::AtSign)),
1628 Some('\'') => Ok(Some(Token::AtSign)),
1638 Some('\"') => Ok(Some(Token::AtSign)),
1639 Some('`') => Ok(Some(Token::AtSign)),
1640 Some(sch) if self.dialect.is_identifier_start('@') => {
1641 self.tokenize_identifier_or_keyword([ch, *sch], chars)
1642 }
1643 _ => Ok(Some(Token::AtSign)),
1644 }
1645 }
1646 '?' if self.dialect.supports_geometric_types() => {
                    chars.next(); // consume the '?'
                    match chars.peek() {
1650 Some('|') => {
1651 chars.next();
1652 match chars.peek() {
1653 Some('|') => self.consume_and_return(
1654 chars,
1655 Token::QuestionMarkDoubleVerticalBar,
1656 ),
1657 _ => Ok(Some(Token::QuestionPipe)),
1658 }
1659 }
1660
1661 Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
1662 Some('-') => {
                            chars.next(); // consume the '-'
                            match chars.peek() {
1665 Some('|') => self
1666 .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
1667 _ => Ok(Some(Token::QuestionMarkDash)),
1668 }
1669 }
1670 Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
1671 _ => self.consume_and_return(chars, Token::Question),
1672 }
1673 }
1674 '?' => {
1675 chars.next();
1676 let s = peeking_take_while(chars, |ch| ch.is_numeric());
1677 Ok(Some(Token::Placeholder(String::from("?") + &s)))
1678 }
1679
1680 ch if self.dialect.is_identifier_start(ch) => {
1682 self.tokenize_identifier_or_keyword([ch], chars)
1683 }
1684 '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
1685
1686 ch if ch.is_whitespace() => {
1688 self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
1689 }
1690 other => self.consume_and_return(chars, Token::Char(other)),
1691 },
1692 None => Ok(None),
1693 }
1694 }
1695
1696 fn consume_for_binop(
1698 &self,
1699 chars: &mut State,
1700 prefix: &str,
1701 default: Token,
1702 ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the peeked char belonging to this operator
        self.start_binop_opt(chars, prefix, Some(default))
1705 }
1706
1707 fn start_binop(
1709 &self,
1710 chars: &mut State,
1711 prefix: &str,
1712 default: Token,
1713 ) -> Result<Option<Token>, TokenizerError> {
1714 self.start_binop_opt(chars, prefix, Some(default))
1715 }
1716
1717 fn start_binop_opt(
1719 &self,
1720 chars: &mut State,
1721 prefix: &str,
1722 default: Option<Token>,
1723 ) -> Result<Option<Token>, TokenizerError> {
1724 let mut custom = None;
1725 while let Some(&ch) = chars.peek() {
1726 if !self.dialect.is_custom_operator_part(ch) {
1727 break;
1728 }
1729
1730 custom.get_or_insert_with(|| prefix.to_string()).push(ch);
1731 chars.next();
1732 }
1733 match (custom, default) {
1734 (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
1735 (None, Some(tok)) => Ok(Some(tok)),
1736 (None, None) => self.tokenizer_error(
1737 chars.location(),
1738 format!("Expected a valid binary operator after '{}'", prefix),
1739 ),
1740 }
1741 }
1742
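    /// Tokenize a value that starts with `$`: either a dollar-quoted string
    /// (`$tag$ ... $tag$`) or a placeholder such as `$1`, depending on the
    /// dialect.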
1743 fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
1745 let mut s = String::new();
1746 let mut value = String::new();
1747
1748 chars.next();
1749
1750 if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
1752 chars.next();
1753
1754 let mut is_terminated = false;
1755 let mut prev: Option<char> = None;
1756
1757 while let Some(&ch) = chars.peek() {
1758 if prev == Some('$') {
1759 if ch == '$' {
1760 chars.next();
1761 is_terminated = true;
1762 break;
1763 } else {
1764 s.push('$');
1765 s.push(ch);
1766 }
1767 } else if ch != '$' {
1768 s.push(ch);
1769 }
1770
1771 prev = Some(ch);
1772 chars.next();
1773 }
1774
1775 return if chars.peek().is_none() && !is_terminated {
1776 self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
1777 } else {
1778 Ok(Token::DollarQuotedString(DollarQuotedString {
1779 value: s,
1780 tag: None,
1781 }))
1782 };
1783 } else {
1784 value.push_str(&peeking_take_while(chars, |ch| {
1785 ch.is_alphanumeric()
1786 || ch == '_'
1787 || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
1789 }));
1790
1791 if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
1793 chars.next();
1794
1795 let mut temp = String::new();
1796 let end_delimiter = format!("${}$", value);
1797
1798 loop {
1799 match chars.next() {
1800 Some(ch) => {
1801 temp.push(ch);
1802
1803 if temp.ends_with(&end_delimiter) {
1804 if let Some(temp) = temp.strip_suffix(&end_delimiter) {
1805 s.push_str(temp);
1806 }
1807 break;
1808 }
1809 }
1810 None => {
1811 if temp.ends_with(&end_delimiter) {
1812 if let Some(temp) = temp.strip_suffix(&end_delimiter) {
1813 s.push_str(temp);
1814 }
1815 break;
1816 }
1817
1818 return self.tokenizer_error(
1819 chars.location(),
1820 "Unterminated dollar-quoted, expected $",
1821 );
1822 }
1823 }
1824 }
1825 } else {
1826 return Ok(Token::Placeholder(String::from("$") + &value));
1827 }
1828 }
1829
1830 Ok(Token::DollarQuotedString(DollarQuotedString {
1831 value: s,
1832 tag: if value.is_empty() { None } else { Some(value) },
1833 }))
1834 }
1835
1836 fn tokenizer_error<R>(
1837 &self,
1838 loc: Location,
1839 message: impl Into<String>,
1840 ) -> Result<R, TokenizerError> {
1841 Err(TokenizerError {
1842 message: message.into(),
1843 location: loc,
1844 })
1845 }
1846
1847 fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
1849 let mut comment = peeking_take_while(chars, |ch| match ch {
            '\n' => false,                                           // Always stop at \n
            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Postgres also stops at \r
            _ => true,
        });
1854
1855 if let Some(ch) = chars.next() {
1856 assert!(ch == '\n' || ch == '\r');
1857 comment.push(ch);
1858 }
1859
1860 comment
1861 }
1862
1863 fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
1865 let mut s = first_chars.into();
1866 s.push_str(&peeking_take_while(chars, |ch| {
1867 self.dialect.is_identifier_part(ch)
1868 }));
1869 s
1870 }
1871
1872 fn tokenize_quoted_identifier(
1874 &self,
1875 quote_start: char,
1876 chars: &mut State,
1877 ) -> Result<String, TokenizerError> {
1878 let error_loc = chars.location();
        chars.next(); // consume the opening quote
        let quote_end = Word::matching_end_quote(quote_start);
1881 let (s, last_char) = self.parse_quoted_ident(chars, quote_end);
1882
1883 if last_char == Some(quote_end) {
1884 Ok(s)
1885 } else {
1886 self.tokenizer_error(
1887 error_loc,
1888 format!("Expected close delimiter '{quote_end}' before EOF."),
1889 )
1890 }
1891 }
1892
1893 fn tokenize_escaped_single_quoted_string(
1895 &self,
1896 starting_loc: Location,
1897 chars: &mut State,
1898 ) -> Result<String, TokenizerError> {
1899 if let Some(s) = unescape_single_quoted_string(chars) {
1900 return Ok(s);
1901 }
1902
1903 self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
1904 }
1905
1906 fn tokenize_single_or_triple_quoted_string<F>(
1909 &self,
1910 chars: &mut State,
1911 quote_style: char,
1912 backslash_escape: bool,
1913 single_quote_token: F,
1914 triple_quote_token: F,
1915 ) -> Result<Option<Token>, TokenizerError>
1916 where
1917 F: Fn(String) -> Token,
1918 {
1919 let error_loc = chars.location();
1920
1921 let mut num_opening_quotes = 0u8;
1922 for _ in 0..3 {
            if Some(&quote_style) == chars.peek() {
                chars.next(); // consume the opening quote
                num_opening_quotes += 1;
1926 } else {
1927 break;
1928 }
1929 }
1930
1931 let (token_fn, num_quote_chars) = match num_opening_quotes {
1932 1 => (single_quote_token, NumStringQuoteChars::One),
1933 2 => {
1934 return Ok(Some(single_quote_token("".into())));
1936 }
1937 3 => {
1938 let Some(num_quote_chars) = NonZeroU8::new(3) else {
1939 return self.tokenizer_error(error_loc, "invalid number of opening quotes");
1940 };
1941 (
1942 triple_quote_token,
1943 NumStringQuoteChars::Many(num_quote_chars),
1944 )
1945 }
1946 _ => {
1947 return self.tokenizer_error(error_loc, "invalid string literal opening");
1948 }
1949 };
1950
1951 let settings = TokenizeQuotedStringSettings {
1952 quote_style,
1953 num_quote_chars,
1954 num_opening_quotes_to_consume: 0,
1955 backslash_escape,
1956 };
1957
1958 self.tokenize_quoted_string(chars, settings)
1959 .map(token_fn)
1960 .map(Some)
1961 }
1962
1963 fn tokenize_single_quoted_string(
1965 &self,
1966 chars: &mut State,
1967 quote_style: char,
1968 backslash_escape: bool,
1969 ) -> Result<String, TokenizerError> {
1970 self.tokenize_quoted_string(
1971 chars,
1972 TokenizeQuotedStringSettings {
1973 quote_style,
1974 num_quote_chars: NumStringQuoteChars::One,
1975 num_opening_quotes_to_consume: 1,
1976 backslash_escape,
1977 },
1978 )
1979 }
1980
1981 fn tokenize_quoted_string(
1983 &self,
1984 chars: &mut State,
1985 settings: TokenizeQuotedStringSettings,
1986 ) -> Result<String, TokenizerError> {
1987 let mut s = String::new();
1988 let error_loc = chars.location();
1989
1990 for _ in 0..settings.num_opening_quotes_to_consume {
1992 if Some(settings.quote_style) != chars.next() {
1993 return self.tokenizer_error(error_loc, "invalid string literal opening");
1994 }
1995 }
1996
1997 let mut num_consecutive_quotes = 0;
1998 while let Some(&ch) = chars.peek() {
1999 let pending_final_quote = match settings.num_quote_chars {
2000 NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
2001 n @ NumStringQuoteChars::Many(count)
2002 if num_consecutive_quotes + 1 == count.get() =>
2003 {
2004 Some(n)
2005 }
2006 NumStringQuoteChars::Many(_) => None,
2007 };
2008
2009 match ch {
2010 char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume the quote character
                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
2014 let mut buf = s.chars();
2019 for _ in 1..count.get() {
2020 buf.next_back();
2021 }
2022 return Ok(buf.as_str().to_string());
2023 } else if chars
2024 .peek()
2025 .map(|c| *c == settings.quote_style)
2026 .unwrap_or(false)
2027 {
2028 s.push(ch);
2029 if !self.unescape {
2030 s.push(ch);
2032 }
2033 chars.next();
2034 } else {
2035 return Ok(s);
2036 }
2037 }
2038 '\\' if settings.backslash_escape => {
2039 chars.next();
2041
2042 num_consecutive_quotes = 0;
2043
2044 if let Some(next) = chars.peek() {
2045 if !self.unescape
2046 || (self.dialect.ignores_wildcard_escapes()
2047 && (*next == '%' || *next == '_'))
2048 {
2049 s.push(ch);
2053 s.push(*next);
                                chars.next(); // consume the escaped character
                            } else {
2056 let n = match next {
2057 '0' => '\0',
2058 'a' => '\u{7}',
2059 'b' => '\u{8}',
2060 'f' => '\u{c}',
2061 'n' => '\n',
2062 'r' => '\r',
2063 't' => '\t',
2064 'Z' => '\u{1a}',
2065 _ => *next,
2066 };
2067 s.push(n);
                                chars.next(); // consume the escaped character
                            }
2070 }
2071 }
2072 ch => {
                    chars.next(); // consume the character
                    if ch == settings.quote_style {
2076 num_consecutive_quotes += 1;
2077 } else {
2078 num_consecutive_quotes = 0;
2079 }
2080
2081 s.push(ch);
2082 }
2083 }
2084 }
2085 self.tokenizer_error(error_loc, "Unterminated string literal")
2086 }
2087
2088 fn tokenize_multiline_comment(
2089 &self,
2090 chars: &mut State,
2091 ) -> Result<Option<Token>, TokenizerError> {
2092 let mut s = String::new();
2093 let mut nested = 1;
2094 let supports_nested_comments = self.dialect.supports_nested_comments();
2095
2096 loop {
2097 match chars.next() {
2098 Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
                    chars.next(); // consume the '*', beginning a nested comment
                    s.push('/');
2101 s.push('*');
2102 nested += 1;
2103 }
2104 Some('*') if matches!(chars.peek(), Some('/')) => {
                    chars.next(); // consume the '/', ending a comment
                    nested -= 1;
2107 if nested == 0 {
2108 break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
2109 }
2110 s.push('*');
2111 s.push('/');
2112 }
2113 Some(ch) => {
2114 s.push(ch);
2115 }
2116 None => {
2117 break self.tokenizer_error(
2118 chars.location(),
2119 "Unexpected EOF while in a multi-line comment",
2120 );
2121 }
2122 }
2123 }
2124 }
2125
2126 fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
2127 let mut last_char = None;
2128 let mut s = String::new();
2129 while let Some(ch) = chars.next() {
2130 if ch == quote_end {
                if chars.peek() == Some(&quote_end) {
2132 chars.next();
2133 s.push(ch);
2134 if !self.unescape {
2135 s.push(ch);
2137 }
2138 } else {
2139 last_char = Some(quote_end);
2140 break;
2141 }
2142 } else {
2143 s.push(ch);
2144 }
2145 }
2146 (s, last_char)
2147 }
2148
2149 #[allow(clippy::unnecessary_wraps)]
2150 fn consume_and_return(
2151 &self,
2152 chars: &mut State,
2153 t: Token,
2154 ) -> Result<Option<Token>, TokenizerError> {
2155 chars.next();
2156 Ok(Some(t))
2157 }
2158}
2159
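/// Read characters from `chars` as long as `predicate` returns true,
/// consuming only the matching characters and leaving the first
/// non-matching character in place.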
2160fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
2164 let mut s = String::new();
2165 while let Some(&ch) = chars.peek() {
2166 if predicate(ch) {
            chars.next(); // consume the matched char
            s.push(ch);
2169 } else {
2170 break;
2171 }
2172 }
2173 s
2174}
2175
2176fn peeking_next_take_while(
2178 chars: &mut State,
2179 mut predicate: impl FnMut(char, Option<char>) -> bool,
2180) -> String {
2181 let mut s = String::new();
2182 while let Some(&ch) = chars.peek() {
2183 let next_char = chars.peekable.clone().nth(1);
2184 if predicate(ch, next_char) {
            chars.next(); // consume the matched char
            s.push(ch);
2187 } else {
2188 break;
2189 }
2190 }
2191 s
2192}
2193
2194fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
2195 Unescape::new(chars).unescape()
2196}
2197
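/// Helper for decoding the body of a Postgres-style `E'...'` escaped string
/// literal: handles doubled single quotes plus backslash escapes (control
/// characters, octal, hex, and unicode), rejecting embedded NUL characters.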
2198struct Unescape<'a: 'b, 'b> {
2199 chars: &'b mut State<'a>,
2200}
2201
2202impl<'a: 'b, 'b> Unescape<'a, 'b> {
2203 fn new(chars: &'b mut State<'a>) -> Self {
2204 Self { chars }
2205 }
2206 fn unescape(mut self) -> Option<String> {
2207 let mut unescaped = String::new();
2208
2209 self.chars.next();
2210
2211 while let Some(c) = self.chars.next() {
2212 if c == '\'' {
2213 if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
2215 self.chars.next();
2216 unescaped.push('\'');
2217 continue;
2218 }
2219 return Some(unescaped);
2220 }
2221
2222 if c != '\\' {
2223 unescaped.push(c);
2224 continue;
2225 }
2226
2227 let c = match self.chars.next()? {
2228 'b' => '\u{0008}',
2229 'f' => '\u{000C}',
2230 'n' => '\n',
2231 'r' => '\r',
2232 't' => '\t',
2233 'u' => self.unescape_unicode_16()?,
2234 'U' => self.unescape_unicode_32()?,
2235 'x' => self.unescape_hex()?,
2236 c if c.is_digit(8) => self.unescape_octal(c)?,
2237 c => c,
2238 };
2239
2240 unescaped.push(Self::check_null(c)?);
2241 }
2242
2243 None
2244 }
2245
2246 #[inline]
2247 fn check_null(c: char) -> Option<char> {
2248 if c == '\0' {
2249 None
2250 } else {
2251 Some(c)
2252 }
2253 }
2254
2255 #[inline]
2256 fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
2257 match u32::from_str_radix(s, RADIX) {
2259 Err(_) => None,
2260 Ok(n) => {
2261 let n = n & 0xFF;
2262 if n <= 127 {
2263 char::from_u32(n)
2264 } else {
2265 None
2266 }
2267 }
2268 }
2269 }
2270
2271 fn unescape_hex(&mut self) -> Option<char> {
2273 let mut s = String::new();
2274
2275 for _ in 0..2 {
2276 match self.next_hex_digit() {
2277 Some(c) => s.push(c),
2278 None => break,
2279 }
2280 }
2281
2282 if s.is_empty() {
2283 return Some('x');
2284 }
2285
2286 Self::byte_to_char::<16>(&s)
2287 }
2288
2289 #[inline]
2290 fn next_hex_digit(&mut self) -> Option<char> {
2291 match self.chars.peek() {
2292 Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
2293 _ => None,
2294 }
2295 }
2296
2297 fn unescape_octal(&mut self, c: char) -> Option<char> {
2299 let mut s = String::new();
2300
2301 s.push(c);
2302 for _ in 0..2 {
2303 match self.next_octal_digest() {
2304 Some(c) => s.push(c),
2305 None => break,
2306 }
2307 }
2308
2309 Self::byte_to_char::<8>(&s)
2310 }
2311
2312 #[inline]
2313 fn next_octal_digest(&mut self) -> Option<char> {
2314 match self.chars.peek() {
2315 Some(c) if c.is_digit(8) => self.chars.next(),
2316 _ => None,
2317 }
2318 }
2319
2320 fn unescape_unicode_16(&mut self) -> Option<char> {
2322 self.unescape_unicode::<4>()
2323 }
2324
2325 fn unescape_unicode_32(&mut self) -> Option<char> {
2327 self.unescape_unicode::<8>()
2328 }
2329
2330 fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
2331 let mut s = String::new();
2332 for _ in 0..NUM {
2333 s.push(self.chars.next()?);
2334 }
2335 match u32::from_str_radix(&s, 16) {
2336 Err(_) => None,
2337 Ok(n) => char::from_u32(n),
2338 }
2339 }
2340}
2341
2342fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
2343 let mut unescaped = String::new();
    chars.next(); // consume the opening quote
    while let Some(c) = chars.next() {
2346 match c {
2347 '\'' => {
2348 if chars.peek() == Some(&'\'') {
2349 chars.next();
2350 unescaped.push('\'');
2351 } else {
2352 return Ok(unescaped);
2353 }
2354 }
2355 '\\' => match chars.peek() {
2356 Some('\\') => {
2357 chars.next();
2358 unescaped.push('\\');
2359 }
2360 Some('+') => {
2361 chars.next();
2362 unescaped.push(take_char_from_hex_digits(chars, 6)?);
2363 }
2364 _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
2365 },
2366 _ => {
2367 unescaped.push(c);
2368 }
2369 }
2370 }
2371 Err(TokenizerError {
2372 message: "Unterminated unicode encoded string literal".to_string(),
2373 location: chars.location(),
2374 })
2375}
2376
2377fn take_char_from_hex_digits(
2378 chars: &mut State<'_>,
2379 max_digits: usize,
2380) -> Result<char, TokenizerError> {
2381 let mut result = 0u32;
2382 for _ in 0..max_digits {
2383 let next_char = chars.next().ok_or_else(|| TokenizerError {
2384 message: "Unexpected EOF while parsing hex digit in escaped unicode string."
2385 .to_string(),
2386 location: chars.location(),
2387 })?;
2388 let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
2389 message: format!("Invalid hex digit in escaped unicode string: {}", next_char),
2390 location: chars.location(),
2391 })?;
2392 result = result * 16 + digit;
2393 }
2394 char::from_u32(result).ok_or_else(|| TokenizerError {
2395 message: format!("Invalid unicode character: {:x}", result),
2396 location: chars.location(),
2397 })
2398}
2399
2400#[cfg(test)]
2401mod tests {
2402 use super::*;
2403 use crate::dialect::{
2404 BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
2405 };
2406 use crate::test_utils::all_dialects_where;
2407 use core::fmt::Debug;
2408
2409 #[test]
2410 fn tokenizer_error_impl() {
2411 let err = TokenizerError {
2412 message: "test".into(),
2413 location: Location { line: 1, column: 1 },
2414 };
2415 #[cfg(feature = "std")]
2416 {
2417 use std::error::Error;
2418 assert!(err.source().is_none());
2419 }
2420 assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
2421 }
2422
2423 #[test]
2424 fn tokenize_select_1() {
2425 let sql = String::from("SELECT 1");
2426 let dialect = GenericDialect {};
2427 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2428
2429 let expected = vec![
2430 Token::make_keyword("SELECT"),
2431 Token::Whitespace(Whitespace::Space),
2432 Token::Number(String::from("1"), false),
2433 ];
2434
2435 compare(expected, tokens);
2436 }
2437
2438 #[test]
2439 fn tokenize_select_float() {
2440 let sql = String::from("SELECT .1");
2441 let dialect = GenericDialect {};
2442 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2443
2444 let expected = vec![
2445 Token::make_keyword("SELECT"),
2446 Token::Whitespace(Whitespace::Space),
2447 Token::Number(String::from(".1"), false),
2448 ];
2449
2450 compare(expected, tokens);
2451 }
2452
2453 #[test]
2454 fn tokenize_clickhouse_double_equal() {
2455 let sql = String::from("SELECT foo=='1'");
2456 let dialect = ClickHouseDialect {};
2457 let mut tokenizer = Tokenizer::new(&dialect, &sql);
2458 let tokens = tokenizer.tokenize().unwrap();
2459
2460 let expected = vec![
2461 Token::make_keyword("SELECT"),
2462 Token::Whitespace(Whitespace::Space),
2463 Token::Word(Word {
2464 value: "foo".to_string(),
2465 quote_style: None,
2466 keyword: Keyword::NoKeyword,
2467 }),
2468 Token::DoubleEq,
2469 Token::SingleQuotedString("1".to_string()),
2470 ];
2471
2472 compare(expected, tokens);
2473 }
2474
2475 #[test]
2476 fn tokenize_numeric_literal_underscore() {
2477 let dialect = GenericDialect {};
2478 let sql = String::from("SELECT 10_000");
2479 let mut tokenizer = Tokenizer::new(&dialect, &sql);
2480 let tokens = tokenizer.tokenize().unwrap();
2481 let expected = vec![
2482 Token::make_keyword("SELECT"),
2483 Token::Whitespace(Whitespace::Space),
2484 Token::Number("10".to_string(), false),
2485 Token::make_word("_000", None),
2486 ];
2487 compare(expected, tokens);
2488
2489 all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
2490 "SELECT 10_000, _10_000, 10_00_, 10___0",
2491 vec![
2492 Token::make_keyword("SELECT"),
2493 Token::Whitespace(Whitespace::Space),
2494 Token::Number("10_000".to_string(), false),
2495 Token::Comma,
2496 Token::Whitespace(Whitespace::Space),
2497 Token::make_word("_10_000", None), Token::Comma,
2499 Token::Whitespace(Whitespace::Space),
2500 Token::Number("10_00".to_string(), false),
2501 Token::make_word("_", None), Token::Comma,
2503 Token::Whitespace(Whitespace::Space),
2504 Token::Number("10".to_string(), false),
2505 Token::make_word("___0", None), ],
2507 );
2508 }
2509
2510 #[test]
2511 fn tokenize_select_exponent() {
2512 let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
2513 let dialect = GenericDialect {};
2514 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2515
2516 let expected = vec![
2517 Token::make_keyword("SELECT"),
2518 Token::Whitespace(Whitespace::Space),
2519 Token::Number(String::from("1e10"), false),
2520 Token::Comma,
2521 Token::Whitespace(Whitespace::Space),
2522 Token::Number(String::from("1e-10"), false),
2523 Token::Comma,
2524 Token::Whitespace(Whitespace::Space),
2525 Token::Number(String::from("1e+10"), false),
2526 Token::Comma,
2527 Token::Whitespace(Whitespace::Space),
2528 Token::Number(String::from("1"), false),
2529 Token::make_word("ea", None),
2530 Token::Comma,
2531 Token::Whitespace(Whitespace::Space),
2532 Token::Number(String::from("1e-10"), false),
2533 Token::make_word("a", None),
2534 Token::Comma,
2535 Token::Whitespace(Whitespace::Space),
2536 Token::Number(String::from("1e-10"), false),
2537 Token::Minus,
2538 Token::Number(String::from("10"), false),
2539 ];
2540
2541 compare(expected, tokens);
2542 }
2543
2544 #[test]
2545 fn tokenize_scalar_function() {
2546 let sql = String::from("SELECT sqrt(1)");
2547 let dialect = GenericDialect {};
2548 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2549
2550 let expected = vec![
2551 Token::make_keyword("SELECT"),
2552 Token::Whitespace(Whitespace::Space),
2553 Token::make_word("sqrt", None),
2554 Token::LParen,
2555 Token::Number(String::from("1"), false),
2556 Token::RParen,
2557 ];
2558
2559 compare(expected, tokens);
2560 }
2561
2562 #[test]
2563 fn tokenize_string_string_concat() {
2564 let sql = String::from("SELECT 'a' || 'b'");
2565 let dialect = GenericDialect {};
2566 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2567
2568 let expected = vec![
2569 Token::make_keyword("SELECT"),
2570 Token::Whitespace(Whitespace::Space),
2571 Token::SingleQuotedString(String::from("a")),
2572 Token::Whitespace(Whitespace::Space),
2573 Token::StringConcat,
2574 Token::Whitespace(Whitespace::Space),
2575 Token::SingleQuotedString(String::from("b")),
2576 ];
2577
2578 compare(expected, tokens);
2579 }
2580 #[test]
2581 fn tokenize_bitwise_op() {
2582 let sql = String::from("SELECT one | two ^ three");
2583 let dialect = GenericDialect {};
2584 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2585
2586 let expected = vec![
2587 Token::make_keyword("SELECT"),
2588 Token::Whitespace(Whitespace::Space),
2589 Token::make_word("one", None),
2590 Token::Whitespace(Whitespace::Space),
2591 Token::Pipe,
2592 Token::Whitespace(Whitespace::Space),
2593 Token::make_word("two", None),
2594 Token::Whitespace(Whitespace::Space),
2595 Token::Caret,
2596 Token::Whitespace(Whitespace::Space),
2597 Token::make_word("three", None),
2598 ];
2599 compare(expected, tokens);
2600 }
2601
2602 #[test]
2603 fn tokenize_logical_xor() {
2604 let sql =
2605 String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
2606 let dialect = GenericDialect {};
2607 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2608
2609 let expected = vec![
2610 Token::make_keyword("SELECT"),
2611 Token::Whitespace(Whitespace::Space),
2612 Token::make_keyword("true"),
2613 Token::Whitespace(Whitespace::Space),
2614 Token::make_keyword("XOR"),
2615 Token::Whitespace(Whitespace::Space),
2616 Token::make_keyword("true"),
2617 Token::Comma,
2618 Token::Whitespace(Whitespace::Space),
2619 Token::make_keyword("false"),
2620 Token::Whitespace(Whitespace::Space),
2621 Token::make_keyword("XOR"),
2622 Token::Whitespace(Whitespace::Space),
2623 Token::make_keyword("false"),
2624 Token::Comma,
2625 Token::Whitespace(Whitespace::Space),
2626 Token::make_keyword("true"),
2627 Token::Whitespace(Whitespace::Space),
2628 Token::make_keyword("XOR"),
2629 Token::Whitespace(Whitespace::Space),
2630 Token::make_keyword("false"),
2631 Token::Comma,
2632 Token::Whitespace(Whitespace::Space),
2633 Token::make_keyword("false"),
2634 Token::Whitespace(Whitespace::Space),
2635 Token::make_keyword("XOR"),
2636 Token::Whitespace(Whitespace::Space),
2637 Token::make_keyword("true"),
2638 ];
2639 compare(expected, tokens);
2640 }
2641
2642 #[test]
2643 fn tokenize_simple_select() {
2644 let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
2645 let dialect = GenericDialect {};
2646 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2647
2648 let expected = vec![
2649 Token::make_keyword("SELECT"),
2650 Token::Whitespace(Whitespace::Space),
2651 Token::Mul,
2652 Token::Whitespace(Whitespace::Space),
2653 Token::make_keyword("FROM"),
2654 Token::Whitespace(Whitespace::Space),
2655 Token::make_word("customer", None),
2656 Token::Whitespace(Whitespace::Space),
2657 Token::make_keyword("WHERE"),
2658 Token::Whitespace(Whitespace::Space),
2659 Token::make_word("id", None),
2660 Token::Whitespace(Whitespace::Space),
2661 Token::Eq,
2662 Token::Whitespace(Whitespace::Space),
2663 Token::Number(String::from("1"), false),
2664 Token::Whitespace(Whitespace::Space),
2665 Token::make_keyword("LIMIT"),
2666 Token::Whitespace(Whitespace::Space),
2667 Token::Number(String::from("5"), false),
2668 ];
2669
2670 compare(expected, tokens);
2671 }
2672
2673 #[test]
2674 fn tokenize_explain_select() {
2675 let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
2676 let dialect = GenericDialect {};
2677 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2678
2679 let expected = vec![
2680 Token::make_keyword("EXPLAIN"),
2681 Token::Whitespace(Whitespace::Space),
2682 Token::make_keyword("SELECT"),
2683 Token::Whitespace(Whitespace::Space),
2684 Token::Mul,
2685 Token::Whitespace(Whitespace::Space),
2686 Token::make_keyword("FROM"),
2687 Token::Whitespace(Whitespace::Space),
2688 Token::make_word("customer", None),
2689 Token::Whitespace(Whitespace::Space),
2690 Token::make_keyword("WHERE"),
2691 Token::Whitespace(Whitespace::Space),
2692 Token::make_word("id", None),
2693 Token::Whitespace(Whitespace::Space),
2694 Token::Eq,
2695 Token::Whitespace(Whitespace::Space),
2696 Token::Number(String::from("1"), false),
2697 ];
2698
2699 compare(expected, tokens);
2700 }
2701
2702 #[test]
2703 fn tokenize_explain_analyze_select() {
2704 let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
2705 let dialect = GenericDialect {};
2706 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2707
2708 let expected = vec![
2709 Token::make_keyword("EXPLAIN"),
2710 Token::Whitespace(Whitespace::Space),
2711 Token::make_keyword("ANALYZE"),
2712 Token::Whitespace(Whitespace::Space),
2713 Token::make_keyword("SELECT"),
2714 Token::Whitespace(Whitespace::Space),
2715 Token::Mul,
2716 Token::Whitespace(Whitespace::Space),
2717 Token::make_keyword("FROM"),
2718 Token::Whitespace(Whitespace::Space),
2719 Token::make_word("customer", None),
2720 Token::Whitespace(Whitespace::Space),
2721 Token::make_keyword("WHERE"),
2722 Token::Whitespace(Whitespace::Space),
2723 Token::make_word("id", None),
2724 Token::Whitespace(Whitespace::Space),
2725 Token::Eq,
2726 Token::Whitespace(Whitespace::Space),
2727 Token::Number(String::from("1"), false),
2728 ];
2729
2730 compare(expected, tokens);
2731 }
2732
2733 #[test]
2734 fn tokenize_string_predicate() {
2735 let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
2736 let dialect = GenericDialect {};
2737 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2738
2739 let expected = vec![
2740 Token::make_keyword("SELECT"),
2741 Token::Whitespace(Whitespace::Space),
2742 Token::Mul,
2743 Token::Whitespace(Whitespace::Space),
2744 Token::make_keyword("FROM"),
2745 Token::Whitespace(Whitespace::Space),
2746 Token::make_word("customer", None),
2747 Token::Whitespace(Whitespace::Space),
2748 Token::make_keyword("WHERE"),
2749 Token::Whitespace(Whitespace::Space),
2750 Token::make_word("salary", None),
2751 Token::Whitespace(Whitespace::Space),
2752 Token::Neq,
2753 Token::Whitespace(Whitespace::Space),
2754 Token::SingleQuotedString(String::from("Not Provided")),
2755 ];
2756
2757 compare(expected, tokens);
2758 }
2759
2760 #[test]
2761 fn tokenize_invalid_string() {
2762 let sql = String::from("\n💝مصطفىh");
2763
2764 let dialect = GenericDialect {};
2765 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2766 let expected = vec![
2768 Token::Whitespace(Whitespace::Newline),
2769 Token::Char('💝'),
2770 Token::make_word("مصطفىh", None),
2771 ];
2772 compare(expected, tokens);
2773 }
2774
2775 #[test]
2776 fn tokenize_newline_in_string_literal() {
2777 let sql = String::from("'foo\r\nbar\nbaz'");
2778
2779 let dialect = GenericDialect {};
2780 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2781 let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
2782 compare(expected, tokens);
2783 }
2784
2785 #[test]
2786 fn tokenize_unterminated_string_literal() {
2787 let sql = String::from("select 'foo");
2788
2789 let dialect = GenericDialect {};
2790 let mut tokenizer = Tokenizer::new(&dialect, &sql);
2791 assert_eq!(
2792 tokenizer.tokenize(),
2793 Err(TokenizerError {
2794 message: "Unterminated string literal".to_string(),
2795 location: Location { line: 1, column: 8 },
2796 })
2797 );
2798 }
2799
2800 #[test]
2801 fn tokenize_unterminated_string_literal_utf8() {
2802 let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");
2803
2804 let dialect = GenericDialect {};
2805 let mut tokenizer = Tokenizer::new(&dialect, &sql);
2806 assert_eq!(
2807 tokenizer.tokenize(),
2808 Err(TokenizerError {
2809 message: "Unterminated string literal".to_string(),
2810 location: Location {
2811 line: 1,
2812 column: 35
2813 }
2814 })
2815 );
2816 }
2817
2818 #[test]
2819 fn tokenize_invalid_string_cols() {
2820 let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");
2821
2822 let dialect = GenericDialect {};
2823 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2824 let expected = vec![
2826 Token::Whitespace(Whitespace::Newline),
2827 Token::Whitespace(Whitespace::Newline),
2828 Token::make_keyword("SELECT"),
2829 Token::Whitespace(Whitespace::Space),
2830 Token::Mul,
2831 Token::Whitespace(Whitespace::Space),
2832 Token::make_keyword("FROM"),
2833 Token::Whitespace(Whitespace::Space),
2834 Token::make_keyword("table"),
2835 Token::Whitespace(Whitespace::Tab),
2836 Token::Char('💝'),
2837 Token::make_word("مصطفىh", None),
2838 ];
2839 compare(expected, tokens);
2840 }
2841
2842 #[test]
2843 fn tokenize_dollar_quoted_string_tagged() {
2844 let test_cases = vec![
2845 (
2846 String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
2847 vec![
2848 Token::make_keyword("SELECT"),
2849 Token::Whitespace(Whitespace::Space),
2850 Token::DollarQuotedString(DollarQuotedString {
2851 value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
2852 tag: Some("tag".into()),
2853 })
2854 ]
2855 ),
2856 (
2857 String::from("SELECT $abc$x$ab$abc$"),
2858 vec![
2859 Token::make_keyword("SELECT"),
2860 Token::Whitespace(Whitespace::Space),
2861 Token::DollarQuotedString(DollarQuotedString {
2862 value: "x$ab".into(),
2863 tag: Some("abc".into()),
2864 })
2865 ]
2866 ),
2867 (
2868 String::from("SELECT $abc$$abc$"),
2869 vec![
2870 Token::make_keyword("SELECT"),
2871 Token::Whitespace(Whitespace::Space),
2872 Token::DollarQuotedString(DollarQuotedString {
2873 value: "".into(),
2874 tag: Some("abc".into()),
2875 })
2876 ]
2877 ),
2878 (
2879 String::from("0$abc$$abc$1"),
2880 vec![
2881 Token::Number("0".into(), false),
2882 Token::DollarQuotedString(DollarQuotedString {
2883 value: "".into(),
2884 tag: Some("abc".into()),
2885 }),
2886 Token::Number("1".into(), false),
2887 ]
2888 ),
2889 (
2890 String::from("$function$abc$q$data$q$$function$"),
2891 vec![
2892 Token::DollarQuotedString(DollarQuotedString {
2893 value: "abc$q$data$q$".into(),
2894 tag: Some("function".into()),
2895 }),
2896 ]
2897 ),
2898 ];
2899
2900 let dialect = GenericDialect {};
2901 for (sql, expected) in test_cases {
2902 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2903 compare(expected, tokens);
2904 }
2905 }
2906
2907 #[test]
2908 fn tokenize_dollar_quoted_string_tagged_unterminated() {
2909 let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
2910 let dialect = GenericDialect {};
2911 assert_eq!(
2912 Tokenizer::new(&dialect, &sql).tokenize(),
2913 Err(TokenizerError {
2914 message: "Unterminated dollar-quoted, expected $".into(),
2915 location: Location {
2916 line: 1,
2917 column: 91
2918 }
2919 })
2920 );
2921 }
2922
2923 #[test]
2924 fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
2925 let sql = String::from("SELECT $abc$abc$");
2926 let dialect = GenericDialect {};
2927 assert_eq!(
2928 Tokenizer::new(&dialect, &sql).tokenize(),
2929 Err(TokenizerError {
2930 message: "Unterminated dollar-quoted, expected $".into(),
2931 location: Location {
2932 line: 1,
2933 column: 17
2934 }
2935 })
2936 );
2937 }
2938
2939 #[test]
2940 fn tokenize_dollar_placeholder() {
2941 let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
2942 let dialect = SQLiteDialect {};
2943 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2944 assert_eq!(
2945 tokens,
2946 vec![
2947 Token::make_keyword("SELECT"),
2948 Token::Whitespace(Whitespace::Space),
2949 Token::Placeholder("$$".into()),
2950 Token::Comma,
2951 Token::Whitespace(Whitespace::Space),
2952 Token::Placeholder("$$ABC$$".into()),
2953 Token::Comma,
2954 Token::Whitespace(Whitespace::Space),
2955 Token::Placeholder("$ABC$".into()),
2956 Token::Comma,
2957 Token::Whitespace(Whitespace::Space),
2958 Token::Placeholder("$ABC".into()),
2959 ]
2960 );
2961 }
2962
2963 #[test]
2964 fn tokenize_nested_dollar_quoted_strings() {
2965 let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
2966 let dialect = GenericDialect {};
2967 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2968 let expected = vec![
2969 Token::make_keyword("SELECT"),
2970 Token::Whitespace(Whitespace::Space),
2971 Token::DollarQuotedString(DollarQuotedString {
2972 value: "dollar $nested$ string".into(),
2973 tag: Some("tag".into()),
2974 }),
2975 ];
2976 compare(expected, tokens);
2977 }
2978
2979 #[test]
2980 fn tokenize_dollar_quoted_string_untagged_empty() {
2981 let sql = String::from("SELECT $$$$");
2982 let dialect = GenericDialect {};
2983 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2984 let expected = vec![
2985 Token::make_keyword("SELECT"),
2986 Token::Whitespace(Whitespace::Space),
2987 Token::DollarQuotedString(DollarQuotedString {
2988 value: "".into(),
2989 tag: None,
2990 }),
2991 ];
2992 compare(expected, tokens);
2993 }
2994
2995 #[test]
2996 fn tokenize_dollar_quoted_string_untagged() {
2997 let sql =
2998 String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
2999 let dialect = GenericDialect {};
3000 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3001 let expected = vec![
3002 Token::make_keyword("SELECT"),
3003 Token::Whitespace(Whitespace::Space),
3004 Token::DollarQuotedString(DollarQuotedString {
3005 value: "within dollar '$' quoted strings have $tags like this$ ".into(),
3006 tag: None,
3007 }),
3008 ];
3009 compare(expected, tokens);
3010 }
3011
3012 #[test]
3013 fn tokenize_dollar_quoted_string_untagged_unterminated() {
3014 let sql = String::from(
3015 "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
3016 );
3017 let dialect = GenericDialect {};
3018 assert_eq!(
3019 Tokenizer::new(&dialect, &sql).tokenize(),
3020 Err(TokenizerError {
3021 message: "Unterminated dollar-quoted string".into(),
3022 location: Location {
3023 line: 1,
3024 column: 86
3025 }
3026 })
3027 );
3028 }
3029
3030 #[test]
3031 fn tokenize_right_arrow() {
3032 let sql = String::from("FUNCTION(key=>value)");
3033 let dialect = GenericDialect {};
3034 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3035 let expected = vec![
3036 Token::make_word("FUNCTION", None),
3037 Token::LParen,
3038 Token::make_word("key", None),
3039 Token::RArrow,
3040 Token::make_word("value", None),
3041 Token::RParen,
3042 ];
3043 compare(expected, tokens);
3044 }
3045
3046 #[test]
3047 fn tokenize_is_null() {
3048 let sql = String::from("a IS NULL");
3049 let dialect = GenericDialect {};
3050 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3051
3052 let expected = vec![
3053 Token::make_word("a", None),
3054 Token::Whitespace(Whitespace::Space),
3055 Token::make_keyword("IS"),
3056 Token::Whitespace(Whitespace::Space),
3057 Token::make_keyword("NULL"),
3058 ];
3059
3060 compare(expected, tokens);
3061 }
3062
3063 #[test]
3064 fn tokenize_comment() {
3065 let test_cases = vec![
3066 (
3067 String::from("0--this is a comment\n1"),
3068 vec![
3069 Token::Number("0".to_string(), false),
3070 Token::Whitespace(Whitespace::SingleLineComment {
3071 prefix: "--".to_string(),
3072 comment: "this is a comment\n".to_string(),
3073 }),
3074 Token::Number("1".to_string(), false),
3075 ],
3076 ),
3077 (
3078 String::from("0--this is a comment\r1"),
3079 vec![
3080 Token::Number("0".to_string(), false),
3081 Token::Whitespace(Whitespace::SingleLineComment {
3082 prefix: "--".to_string(),
3083 comment: "this is a comment\r1".to_string(),
3084 }),
3085 ],
3086 ),
3087 (
3088 String::from("0--this is a comment\r\n1"),
3089 vec![
3090 Token::Number("0".to_string(), false),
3091 Token::Whitespace(Whitespace::SingleLineComment {
3092 prefix: "--".to_string(),
3093 comment: "this is a comment\r\n".to_string(),
3094 }),
3095 Token::Number("1".to_string(), false),
3096 ],
3097 ),
3098 ];
3099
3100 let dialect = GenericDialect {};
3101
3102 for (sql, expected) in test_cases {
3103 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3104 compare(expected, tokens);
3105 }
3106 }
3107
3108 #[test]
3109 fn tokenize_comment_postgres() {
3110 let sql = String::from("1--\r0");
3111
3112 let dialect = PostgreSqlDialect {};
3113 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3114 let expected = vec![
3115 Token::Number("1".to_string(), false),
3116 Token::Whitespace(Whitespace::SingleLineComment {
3117 prefix: "--".to_string(),
3118 comment: "\r".to_string(),
3119 }),
3120 Token::Number("0".to_string(), false),
3121 ];
3122 compare(expected, tokens);
3123 }
3124
3125 #[test]
3126 fn tokenize_comment_at_eof() {
3127 let sql = String::from("--this is a comment");
3128
3129 let dialect = GenericDialect {};
3130 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3131 let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
3132 prefix: "--".to_string(),
3133 comment: "this is a comment".to_string(),
3134 })];
3135 compare(expected, tokens);
3136 }
3137
3138 #[test]
3139 fn tokenize_multiline_comment() {
3140 let sql = String::from("0/*multi-line\n* /comment*/1");
3141
3142 let dialect = GenericDialect {};
3143 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3144 let expected = vec![
3145 Token::Number("0".to_string(), false),
3146 Token::Whitespace(Whitespace::MultiLineComment(
3147 "multi-line\n* /comment".to_string(),
3148 )),
3149 Token::Number("1".to_string(), false),
3150 ];
3151 compare(expected, tokens);
3152 }
3153
3154 #[test]
3155 fn tokenize_nested_multiline_comment() {
3156 let dialect = GenericDialect {};
3157 let test_cases = vec![
3158 (
3159 "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
3160 vec![
3161 Token::Number("0".to_string(), false),
3162 Token::Whitespace(Whitespace::MultiLineComment(
3163 "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
3164 )),
3165 Token::Whitespace(Whitespace::Space),
3166 Token::Div,
3167 Token::Word(Word {
3168 value: "comment".to_string(),
3169 quote_style: None,
3170 keyword: Keyword::COMMENT,
3171 }),
3172 Token::Mul,
3173 Token::Div,
3174 Token::Number("1".to_string(), false),
3175 ],
3176 ),
3177 (
3178 "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
3179 vec![
3180 Token::Number("0".to_string(), false),
3181 Token::Whitespace(Whitespace::MultiLineComment(
3182 "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
3183 )),
3184 Token::Number("1".to_string(), false),
3185 ],
3186 ),
3187 (
3188 "SELECT 1/* a /* b */ c */0",
3189 vec![
3190 Token::make_keyword("SELECT"),
3191 Token::Whitespace(Whitespace::Space),
3192 Token::Number("1".to_string(), false),
3193 Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
3194 Token::Number("0".to_string(), false),
3195 ],
3196 ),
3197 ];
3198
3199 for (sql, expected) in test_cases {
3200 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3201 compare(expected, tokens);
3202 }
3203 }
3204
3205 #[test]
3206 fn tokenize_nested_multiline_comment_empty() {
3207 let sql = "select 1/*/**/*/0";
3208
3209 let dialect = GenericDialect {};
3210 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3211 let expected = vec![
3212 Token::make_keyword("select"),
3213 Token::Whitespace(Whitespace::Space),
3214 Token::Number("1".to_string(), false),
3215 Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
3216 Token::Number("0".to_string(), false),
3217 ];
3218
3219 compare(expected, tokens);
3220 }
3221
3222 #[test]
3223 fn tokenize_nested_comments_if_not_supported() {
3224 let dialect = SQLiteDialect {};
3225 let sql = "SELECT 1/*/* nested comment */*/0";
3226 let tokens = Tokenizer::new(&dialect, sql).tokenize();
3227 let expected = vec![
3228 Token::make_keyword("SELECT"),
3229 Token::Whitespace(Whitespace::Space),
3230 Token::Number("1".to_string(), false),
3231 Token::Whitespace(Whitespace::MultiLineComment(
3232 "/* nested comment ".to_string(),
3233 )),
3234 Token::Mul,
3235 Token::Div,
3236 Token::Number("0".to_string(), false),
3237 ];
3238
3239 compare(expected, tokens.unwrap());
3240 }
3241
3242 #[test]
3243 fn tokenize_multiline_comment_with_even_asterisks() {
3244 let sql = String::from("\n/** Comment **/\n");
3245
3246 let dialect = GenericDialect {};
3247 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3248 let expected = vec![
3249 Token::Whitespace(Whitespace::Newline),
3250 Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
3251 Token::Whitespace(Whitespace::Newline),
3252 ];
3253 compare(expected, tokens);
3254 }
3255
3256 #[test]
3257 fn tokenize_unicode_whitespace() {
3258 let sql = String::from(" \u{2003}\n");
3259
3260 let dialect = GenericDialect {};
3261 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3262 let expected = vec![
3263 Token::Whitespace(Whitespace::Space),
3264 Token::Whitespace(Whitespace::Space),
3265 Token::Whitespace(Whitespace::Newline),
3266 ];
3267 compare(expected, tokens);
3268 }
3269
3270 #[test]
3271 fn tokenize_mismatched_quotes() {
3272 let sql = String::from("\"foo");
3273
3274 let dialect = GenericDialect {};
3275 let mut tokenizer = Tokenizer::new(&dialect, &sql);
3276 assert_eq!(
3277 tokenizer.tokenize(),
3278 Err(TokenizerError {
3279 message: "Expected close delimiter '\"' before EOF.".to_string(),
3280 location: Location { line: 1, column: 1 },
3281 })
3282 );
3283 }
3284
3285 #[test]
3286 fn tokenize_newlines() {
3287 let sql = String::from("line1\nline2\rline3\r\nline4\r");
3288
3289 let dialect = GenericDialect {};
3290 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3291 let expected = vec![
3292 Token::make_word("line1", None),
3293 Token::Whitespace(Whitespace::Newline),
3294 Token::make_word("line2", None),
3295 Token::Whitespace(Whitespace::Newline),
3296 Token::make_word("line3", None),
3297 Token::Whitespace(Whitespace::Newline),
3298 Token::make_word("line4", None),
3299 Token::Whitespace(Whitespace::Newline),
3300 ];
3301 compare(expected, tokens);
3302 }
3303
3304 #[test]
3305 fn tokenize_mssql_top() {
3306 let sql = "SELECT TOP 5 [bar] FROM foo";
3307 let dialect = MsSqlDialect {};
3308 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3309 let expected = vec![
3310 Token::make_keyword("SELECT"),
3311 Token::Whitespace(Whitespace::Space),
3312 Token::make_keyword("TOP"),
3313 Token::Whitespace(Whitespace::Space),
3314 Token::Number(String::from("5"), false),
3315 Token::Whitespace(Whitespace::Space),
3316 Token::make_word("bar", Some('[')),
3317 Token::Whitespace(Whitespace::Space),
3318 Token::make_keyword("FROM"),
3319 Token::Whitespace(Whitespace::Space),
3320 Token::make_word("foo", None),
3321 ];
3322 compare(expected, tokens);
3323 }
3324
3325 #[test]
3326 fn tokenize_pg_regex_match() {
3327 let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
3328 let dialect = GenericDialect {};
3329 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3330 let expected = vec![
3331 Token::make_keyword("SELECT"),
3332 Token::Whitespace(Whitespace::Space),
3333 Token::make_word("col", None),
3334 Token::Whitespace(Whitespace::Space),
3335 Token::Tilde,
3336 Token::Whitespace(Whitespace::Space),
3337 Token::SingleQuotedString("^a".into()),
3338 Token::Comma,
3339 Token::Whitespace(Whitespace::Space),
3340 Token::make_word("col", None),
3341 Token::Whitespace(Whitespace::Space),
3342 Token::TildeAsterisk,
3343 Token::Whitespace(Whitespace::Space),
3344 Token::SingleQuotedString("^a".into()),
3345 Token::Comma,
3346 Token::Whitespace(Whitespace::Space),
3347 Token::make_word("col", None),
3348 Token::Whitespace(Whitespace::Space),
3349 Token::ExclamationMarkTilde,
3350 Token::Whitespace(Whitespace::Space),
3351 Token::SingleQuotedString("^a".into()),
3352 Token::Comma,
3353 Token::Whitespace(Whitespace::Space),
3354 Token::make_word("col", None),
3355 Token::Whitespace(Whitespace::Space),
3356 Token::ExclamationMarkTildeAsterisk,
3357 Token::Whitespace(Whitespace::Space),
3358 Token::SingleQuotedString("^a".into()),
3359 ];
3360 compare(expected, tokens);
3361 }
3362
3363 #[test]
3364 fn tokenize_pg_like_match() {
3365 let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
3366 let dialect = GenericDialect {};
3367 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3368 let expected = vec![
3369 Token::make_keyword("SELECT"),
3370 Token::Whitespace(Whitespace::Space),
3371 Token::make_word("col", None),
3372 Token::Whitespace(Whitespace::Space),
3373 Token::DoubleTilde,
3374 Token::Whitespace(Whitespace::Space),
3375 Token::SingleQuotedString("_a%".into()),
3376 Token::Comma,
3377 Token::Whitespace(Whitespace::Space),
3378 Token::make_word("col", None),
3379 Token::Whitespace(Whitespace::Space),
3380 Token::DoubleTildeAsterisk,
3381 Token::Whitespace(Whitespace::Space),
3382 Token::SingleQuotedString("_a%".into()),
3383 Token::Comma,
3384 Token::Whitespace(Whitespace::Space),
3385 Token::make_word("col", None),
3386 Token::Whitespace(Whitespace::Space),
3387 Token::ExclamationMarkDoubleTilde,
3388 Token::Whitespace(Whitespace::Space),
3389 Token::SingleQuotedString("_a%".into()),
3390 Token::Comma,
3391 Token::Whitespace(Whitespace::Space),
3392 Token::make_word("col", None),
3393 Token::Whitespace(Whitespace::Space),
3394 Token::ExclamationMarkDoubleTildeAsterisk,
3395 Token::Whitespace(Whitespace::Space),
3396 Token::SingleQuotedString("_a%".into()),
3397 ];
3398 compare(expected, tokens);
3399 }
3400
3401 #[test]
3402 fn tokenize_quoted_identifier() {
3403 let sql = r#" "a "" b" "a """ "c """"" "#;
3404 let dialect = GenericDialect {};
3405 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3406 let expected = vec![
3407 Token::Whitespace(Whitespace::Space),
3408 Token::make_word(r#"a " b"#, Some('"')),
3409 Token::Whitespace(Whitespace::Space),
3410 Token::make_word(r#"a ""#, Some('"')),
3411 Token::Whitespace(Whitespace::Space),
3412 Token::make_word(r#"c """#, Some('"')),
3413 Token::Whitespace(Whitespace::Space),
3414 ];
3415 compare(expected, tokens);
3416 }
3417
3418 #[test]
3419 fn tokenize_snowflake_div() {
3420 let sql = r#"field/1000"#;
3421 let dialect = SnowflakeDialect {};
3422 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3423 let expected = vec![
3424 Token::make_word(r#"field"#, None),
3425 Token::Div,
3426 Token::Number("1000".to_string(), false),
3427 ];
3428 compare(expected, tokens);
3429 }
3430
3431 #[test]
3432 fn tokenize_quoted_identifier_with_no_escape() {
3433 let sql = r#" "a "" b" "a """ "c """"" "#;
3434 let dialect = GenericDialect {};
3435 let tokens = Tokenizer::new(&dialect, sql)
3436 .with_unescape(false)
3437 .tokenize()
3438 .unwrap();
3439 let expected = vec![
3440 Token::Whitespace(Whitespace::Space),
3441 Token::make_word(r#"a "" b"#, Some('"')),
3442 Token::Whitespace(Whitespace::Space),
3443 Token::make_word(r#"a """#, Some('"')),
3444 Token::Whitespace(Whitespace::Space),
3445 Token::make_word(r#"c """""#, Some('"')),
3446 Token::Whitespace(Whitespace::Space),
3447 ];
3448 compare(expected, tokens);
3449 }
3450
3451 #[test]
3452 fn tokenize_with_location() {
3453 let sql = "SELECT a,\n b";
3454 let dialect = GenericDialect {};
3455 let tokens = Tokenizer::new(&dialect, sql)
3456 .tokenize_with_location()
3457 .unwrap();
3458 let expected = vec![
3459 TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
3460 TokenWithSpan::at(
3461 Token::Whitespace(Whitespace::Space),
3462 (1, 7).into(),
3463 (1, 8).into(),
3464 ),
3465 TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
3466 TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
3467 TokenWithSpan::at(
3468 Token::Whitespace(Whitespace::Newline),
3469 (1, 10).into(),
3470 (2, 1).into(),
3471 ),
3472 TokenWithSpan::at(
3473 Token::Whitespace(Whitespace::Space),
3474 (2, 1).into(),
3475 (2, 2).into(),
3476 ),
3477 TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
3478 ];
3479 compare(expected, tokens);
3480 }
3481
3482 fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
3483 assert_eq!(expected, actual);
3488 }
3489
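// Test helper: wraps `s` in single quotes and runs it through
// `unescape_single_quoted_string`, asserting the result matches `expected`
// (`None` means the escape sequence is rejected).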
3490 fn check_unescape(s: &str, expected: Option<&str>) {
3491 let s = format!("'{}'", s);
3492 let mut state = State {
3493 peekable: s.chars().peekable(),
3494 line: 0,
3495 col: 0,
3496 };
3497
3498 assert_eq!(
3499 unescape_single_quoted_string(&mut state),
3500 expected.map(|s| s.to_string())
3501 );
3502 }
3503
3504 #[test]
3505 fn test_unescape() {
3506 check_unescape(r"\b", Some("\u{0008}"));
3507 check_unescape(r"\f", Some("\u{000C}"));
3508 check_unescape(r"\t", Some("\t"));
3509 check_unescape(r"\r\n", Some("\r\n"));
3510 check_unescape(r"\/", Some("/"));
3511 check_unescape(r"/", Some("/"));
3512 check_unescape(r"\\", Some("\\"));
3513
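// Unicode escapes: \u takes four hex digits and \U takes eight; NUL and
// out-of-range code points are rejected.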
3514 check_unescape(r"\u0001", Some("\u{0001}"));
3516 check_unescape(r"\u4c91", Some("\u{4c91}"));
3517 check_unescape(r"\u4c916", Some("\u{4c91}6"));
3518 check_unescape(r"\u4c", None);
3519 check_unescape(r"\u0000", None);
3520 check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
3521 check_unescape(r"\U00110000", None);
3522 check_unescape(r"\U00000000", None);
3523 check_unescape(r"\u", None);
3524 check_unescape(r"\U", None);
3525 check_unescape(r"\U1010FFFF", None);
3526
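// Hex escapes: \x takes one or two hex digits; a bare \x or \x followed by a
// non-hex character is kept literally, while NUL and non-ASCII values are
// rejected.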
3527 check_unescape(r"\x4B", Some("\u{004b}"));
3529 check_unescape(r"\x4", Some("\u{0004}"));
3530 check_unescape(r"\x4L", Some("\u{0004}L"));
3531 check_unescape(r"\x", Some("x"));
3532 check_unescape(r"\xP", Some("xP"));
3533 check_unescape(r"\x0", None);
3534 check_unescape(r"\xCAD", None);
3535 check_unescape(r"\xA9", None);
3536
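// Octal escapes: one to three octal digits; sequences that would produce NUL
// or a non-ASCII byte are rejected.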
3537 check_unescape(r"\1", Some("\u{0001}"));
3539 check_unescape(r"\12", Some("\u{000a}"));
3540 check_unescape(r"\123", Some("\u{0053}"));
3541 check_unescape(r"\1232", Some("\u{0053}2"));
3542 check_unescape(r"\4", Some("\u{0004}"));
3543 check_unescape(r"\45", Some("\u{0025}"));
3544 check_unescape(r"\450", Some("\u{0028}"));
3545 check_unescape(r"\603", None);
3546 check_unescape(r"\0", None);
3547 check_unescape(r"\080", None);
3548
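// A backslash before any other character is dropped, and a doubled '' unescapes
// to a single quote.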
3549 check_unescape(r"\9", Some("9"));
3551 check_unescape(r"''", Some("'"));
3552 check_unescape(
3553 r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
3554 Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
3555 );
3556 check_unescape(r"Hello\0", None);
3557 check_unescape(r"Hello\xCADRust", None);
3558 }
3559
3560 #[test]
3561 fn tokenize_numeric_prefix_trait() {
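// Ad-hoc dialect that accepts ASCII digits as identifier-start characters and
// opts into numeric-prefixed identifiers, mirroring the HiveDialect and
// MySqlDialect cases exercised below.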
3562 #[derive(Debug)]
3563 struct NumericPrefixDialect;
3564
3565 impl Dialect for NumericPrefixDialect {
3566 fn is_identifier_start(&self, ch: char) -> bool {
3567 ch.is_ascii_lowercase()
3568 || ch.is_ascii_uppercase()
3569 || ch.is_ascii_digit()
3570 || ch == '$'
3571 }
3572
3573 fn is_identifier_part(&self, ch: char) -> bool {
3574 ch.is_ascii_lowercase()
3575 || ch.is_ascii_uppercase()
3576 || ch.is_ascii_digit()
3577 || ch == '_'
3578 || ch == '$'
3579 || ch == '{'
3580 || ch == '}'
3581 }
3582
3583 fn supports_numeric_prefix(&self) -> bool {
3584 true
3585 }
3586 }
3587
3588 tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
3589 tokenize_numeric_prefix_inner(&HiveDialect {});
3590 tokenize_numeric_prefix_inner(&MySqlDialect {});
3591 }
3592
3593 fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
3594 let sql = r#"SELECT * FROM 1"#;
3595 let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
3596 let expected = vec![
3597 Token::make_keyword("SELECT"),
3598 Token::Whitespace(Whitespace::Space),
3599 Token::Mul,
3600 Token::Whitespace(Whitespace::Space),
3601 Token::make_keyword("FROM"),
3602 Token::Whitespace(Whitespace::Space),
3603 Token::Number(String::from("1"), false),
3604 ];
3605 compare(expected, tokens);
3606 }
3607
3608 #[test]
3609 fn tokenize_quoted_string_escape() {
3610 let dialect = SnowflakeDialect {};
3611 for (sql, expected, expected_unescaped) in [
3612 (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
3613 (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
3614 (r#"'\\'"#, r#"\\"#, r#"\"#),
3615 (
3616 r#"'\0\a\b\f\n\r\t\Z'"#,
3617 r#"\0\a\b\f\n\r\t\Z"#,
3618 "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
3619 ),
3620 (r#"'\"'"#, r#"\""#, "\""),
3621 (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
3622 (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
3623 (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
3624 (r#"'\q'"#, r#"\q"#, r#"q"#),
3625 (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
3626 (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
3627 ] {
3628 let tokens = Tokenizer::new(&dialect, sql)
3629 .with_unescape(false)
3630 .tokenize()
3631 .unwrap();
3632 let expected = vec![Token::SingleQuotedString(expected.to_string())];
3633 compare(expected, tokens);
3634
3635 let tokens = Tokenizer::new(&dialect, sql)
3636 .with_unescape(true)
3637 .tokenize()
3638 .unwrap();
3639 let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
3640 compare(expected, tokens);
3641 }
3642
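// With backslash escapes enabled (Snowflake), a trailing backslash escapes the
// closing quote, leaving the literal unterminated.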
3643 for sql in [r#"'\'"#, r#"'ab\'"#] {
3644 let mut tokenizer = Tokenizer::new(&dialect, sql);
3645 assert_eq!(
3646 "Unterminated string literal",
3647 tokenizer.tokenize().unwrap_err().message.as_str(),
3648 );
3649 }
3650
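// Without backslash escapes (GenericDialect), the same inputs tokenize as
// complete literals whose contents end in a backslash.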
3651 for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
3653 let dialect = GenericDialect {};
3654 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3655
3656 let expected = vec![Token::SingleQuotedString(expected.to_string())];
3657
3658 compare(expected, tokens);
3659 }
3660
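// MySQL keeps the backslash before % and _ so that LIKE patterns survive
// unescaping.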
3661 for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
3663 let dialect = MySqlDialect {};
3664 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3665
3666 let expected = vec![Token::SingleQuotedString(expected.to_string())];
3667
3668 compare(expected, tokens);
3669 }
3670 }
3671
3672 #[test]
3673 fn tokenize_triple_quoted_string() {
3674 fn check<F>(
3675 q: char,
3676 r: char,
3677 quote_token: F,
3678 ) where
3679 F: Fn(String) -> Token,
3680 {
3681 let dialect = BigQueryDialect {};
3682
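// Each case is (raw SQL, expected literal with unescape disabled, expected
// literal after unescaping).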
3683 for (sql, expected, expected_unescaped) in [
3684 (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
3686 (
3688 format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
3689 format!(r#"ab{q}{q}\{q}{q}cd"#),
3690 format!(r#"ab{q}{q}{q}{q}cd"#),
3691 ),
3692 (
3694 format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
3695 "abc".into(),
3696 "abc".into(),
3697 ),
3698 (
3700 format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
3701 format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
3702 format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
3703 ),
3704 (
3706 format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
3707 format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
3708 format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
3709 ),
3710 (
3712 format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
3713 r#"a\'\'b\'c\'d"#.into(),
3714 r#"a''b'c'd"#.into(),
3715 ),
3716 (
3718 format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
3719 r#"abc\0\n\rdef"#.into(),
3720 "abc\0\n\rdef".into(),
3721 ),
3722 ] {
3723 let tokens = Tokenizer::new(&dialect, sql.as_str())
3724 .with_unescape(false)
3725 .tokenize()
3726 .unwrap();
3727 let expected = vec![quote_token(expected.to_string())];
3728 compare(expected, tokens);
3729
3730 let tokens = Tokenizer::new(&dialect, sql.as_str())
3731 .with_unescape(true)
3732 .tokenize()
3733 .unwrap();
3734 let expected = vec![quote_token(expected_unescaped.to_string())];
3735 compare(expected, tokens);
3736 }
3737
3738 for sql in [
3739 format!(r#"{q}{q}{q}{q}{q}\{q}"#),
3740 format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
3741 format!(r#"{q}{q}{q}{q}"#),
3742 format!(r#"{q}{q}{q}{r}{r}"#),
3743 format!(r#"{q}{q}{q}abc{q}"#),
3744 format!(r#"{q}{q}{q}abc{q}{q}"#),
3745 format!(r#"{q}{q}{q}abc"#),
3746 ] {
3747 let dialect = BigQueryDialect {};
3748 let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
3749 assert_eq!(
3750 "Unterminated string literal",
3751 tokenizer.tokenize().unwrap_err().message.as_str(),
3752 );
3753 }
3754 }
3755
3756 check('"', '\'', Token::TripleDoubleQuotedString);
3757
3758 check('\'', '"', Token::TripleSingleQuotedString);
3759
3760 let dialect = BigQueryDialect {};
3761
3762 let sql = r#"""''"#;
3763 let tokens = Tokenizer::new(&dialect, sql)
3764 .with_unescape(true)
3765 .tokenize()
3766 .unwrap();
3767 let expected = vec![
3768 Token::DoubleQuotedString("".to_string()),
3769 Token::SingleQuotedString("".to_string()),
3770 ];
3771 compare(expected, tokens);
3772
3773 let sql = r#"''"""#;
3774 let tokens = Tokenizer::new(&dialect, sql)
3775 .with_unescape(true)
3776 .tokenize()
3777 .unwrap();
3778 let expected = vec![
3779 Token::SingleQuotedString("".to_string()),
3780 Token::DoubleQuotedString("".to_string()),
3781 ];
3782 compare(expected, tokens);
3783
3784 let dialect = SnowflakeDialect {};
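// For a dialect without triple-quoted strings (Snowflake here), '''''' is a
// single literal whose unescaped content is two quote characters.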
3786 let sql = r#"''''''"#;
3787 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3788 let expected = vec![Token::SingleQuotedString("''".to_string())];
3789 compare(expected, tokens);
3790 }
3791
3792 #[test]
3793 fn test_mysql_users_grantees() {
3794 let dialect = MySqlDialect {};
3795
3796 let sql = "CREATE USER `root`@`%`";
3797 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3798 let expected = vec![
3799 Token::make_keyword("CREATE"),
3800 Token::Whitespace(Whitespace::Space),
3801 Token::make_keyword("USER"),
3802 Token::Whitespace(Whitespace::Space),
3803 Token::make_word("root", Some('`')),
3804 Token::AtSign,
3805 Token::make_word("%", Some('`')),
3806 ];
3807 compare(expected, tokens);
3808 }
3809
3810 #[test]
3811 fn test_postgres_abs_without_space_and_string_literal() {
3812 let dialect = MySqlDialect {};
3813
3814 let sql = "SELECT @'1'";
3815 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3816 let expected = vec![
3817 Token::make_keyword("SELECT"),
3818 Token::Whitespace(Whitespace::Space),
3819 Token::AtSign,
3820 Token::SingleQuotedString("1".to_string()),
3821 ];
3822 compare(expected, tokens);
3823 }
3824
3825 #[test]
3826 fn test_postgres_abs_without_space_and_quoted_column() {
3827 let dialect = MySqlDialect {};
3828
3829 let sql = r#"SELECT @"bar" FROM foo"#;
3830 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3831 let expected = vec![
3832 Token::make_keyword("SELECT"),
3833 Token::Whitespace(Whitespace::Space),
3834 Token::AtSign,
3835 Token::DoubleQuotedString("bar".to_string()),
3836 Token::Whitespace(Whitespace::Space),
3837 Token::make_keyword("FROM"),
3838 Token::Whitespace(Whitespace::Space),
3839 Token::make_word("foo", None),
3840 ];
3841 compare(expected, tokens);
3842 }
3843
3844 #[test]
3845 fn test_national_strings_backslash_escape_not_supported() {
3846 all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
3847 .tokenizes_to(
3848 "select n'''''\\'",
3849 vec![
3850 Token::make_keyword("select"),
3851 Token::Whitespace(Whitespace::Space),
3852 Token::NationalStringLiteral("''\\".to_string()),
3853 ],
3854 );
3855 }
3856
3857 #[test]
3858 fn test_national_strings_backslash_escape_supported() {
3859 all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
3860 .tokenizes_to(
3861 "select n'''''\\''",
3862 vec![
3863 Token::make_keyword("select"),
3864 Token::Whitespace(Whitespace::Space),
3865 Token::NationalStringLiteral("'''".to_string()),
3866 ],
3867 );
3868 }
3869
3870 #[test]
3871 fn test_string_escape_constant_not_supported() {
3872 all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
3873 "select e'...'",
3874 vec![
3875 Token::make_keyword("select"),
3876 Token::Whitespace(Whitespace::Space),
3877 Token::make_word("e", None),
3878 Token::SingleQuotedString("...".to_string()),
3879 ],
3880 );
3881
3882 all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
3883 "select E'...'",
3884 vec![
3885 Token::make_keyword("select"),
3886 Token::Whitespace(Whitespace::Space),
3887 Token::make_word("E", None),
3888 Token::SingleQuotedString("...".to_string()),
3889 ],
3890 );
3891 }
3892
3893 #[test]
3894 fn test_string_escape_constant_supported() {
3895 all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
3896 "select e'\\''",
3897 vec![
3898 Token::make_keyword("select"),
3899 Token::Whitespace(Whitespace::Space),
3900 Token::EscapedStringLiteral("'".to_string()),
3901 ],
3902 );
3903
3904 all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
3905 "select E'\\''",
3906 vec![
3907 Token::make_keyword("select"),
3908 Token::Whitespace(Whitespace::Space),
3909 Token::EscapedStringLiteral("'".to_string()),
3910 ],
3911 );
3912 }
3913
3914 #[test]
3915 fn test_whitespace_required_after_single_line_comment() {
3916 all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
3917 .tokenizes_to(
3918 "SELECT --'abc'",
3919 vec![
3920 Token::make_keyword("SELECT"),
3921 Token::Whitespace(Whitespace::Space),
3922 Token::Minus,
3923 Token::Minus,
3924 Token::SingleQuotedString("abc".to_string()),
3925 ],
3926 );
3927
3928 all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
3929 .tokenizes_to(
3930 "SELECT -- 'abc'",
3931 vec![
3932 Token::make_keyword("SELECT"),
3933 Token::Whitespace(Whitespace::Space),
3934 Token::Whitespace(Whitespace::SingleLineComment {
3935 prefix: "--".to_string(),
3936 comment: " 'abc'".to_string(),
3937 }),
3938 ],
3939 );
3940
3941 all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
3942 .tokenizes_to(
3943 "SELECT --",
3944 vec![
3945 Token::make_keyword("SELECT"),
3946 Token::Whitespace(Whitespace::Space),
3947 Token::Minus,
3948 Token::Minus,
3949 ],
3950 );
3951 }
3952
3953 #[test]
3954 fn test_whitespace_not_required_after_single_line_comment() {
3955 all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
3956 .tokenizes_to(
3957 "SELECT --'abc'",
3958 vec![
3959 Token::make_keyword("SELECT"),
3960 Token::Whitespace(Whitespace::Space),
3961 Token::Whitespace(Whitespace::SingleLineComment {
3962 prefix: "--".to_string(),
3963 comment: "'abc'".to_string(),
3964 }),
3965 ],
3966 );
3967
3968 all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
3969 .tokenizes_to(
3970 "SELECT -- 'abc'",
3971 vec![
3972 Token::make_keyword("SELECT"),
3973 Token::Whitespace(Whitespace::Space),
3974 Token::Whitespace(Whitespace::SingleLineComment {
3975 prefix: "--".to_string(),
3976 comment: " 'abc'".to_string(),
3977 }),
3978 ],
3979 );
3980
3981 all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
3982 .tokenizes_to(
3983 "SELECT --",
3984 vec![
3985 Token::make_keyword("SELECT"),
3986 Token::Whitespace(Whitespace::Space),
3987 Token::Whitespace(Whitespace::SingleLineComment {
3988 prefix: "--".to_string(),
3989 comment: "".to_string(),
3990 }),
3991 ],
3992 );
3993 }
3994
3995 #[test]
3996 fn test_tokenize_identifiers_numeric_prefix() {
3997 all_dialects_where(|dialect| dialect.supports_numeric_prefix())
3998 .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);
3999
4000 all_dialects_where(|dialect| dialect.supports_numeric_prefix())
4001 .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);
4002
4003 all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
4004 "t.12e34",
4005 vec![
4006 Token::make_word("t", None),
4007 Token::Period,
4008 Token::make_word("12e34", None),
4009 ],
4010 );
4011
4012 all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
4013 "t.1two3",
4014 vec![
4015 Token::make_word("t", None),
4016 Token::Period,
4017 Token::make_word("1two3", None),
4018 ],
4019 );
4020 }
4021}