#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::iter::Peekable;
use core::num::NonZeroU8;
use core::str::Chars;
use core::{cmp, fmt};

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};

use crate::dialect::Dialect;
use crate::dialect::{
    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
    SnowflakeDialect,
};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use crate::{ast::DollarQuotedString, dialect::HiveDialect};

/// SQL token enumeration. The operator variants render as the symbol shown in
/// their comment (see the `Display` impl below).
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the flag is true for a trailing `L` (long) suffix
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted string: e.g. '''string'''
    TripleSingleQuotedString(String),
    /// Triple double quoted string: e.g. """string"""
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string'
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Triple single quoted byte string literal: e.g. B'''string'''
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted byte string literal: e.g. B"""string"""
    TripleDoubleQuotedByteStringLiteral(String),
    /// Single quoted raw string literal: e.g. R'string'
    SingleQuotedRawStringLiteral(String),
    /// Double quoted raw string literal: e.g. R"string"
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted raw string literal: e.g. R'''string'''
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted raw string literal: e.g. R"""string"""
    TripleDoubleQuotedRawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// "Escaped" string literal: i.e: E'string'
    EscapedStringLiteral(String),
    /// Unicode string literal: i.e: U&'string'
    UnicodeStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, newline, comment)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less-than operator `<`
    Lt,
    /// Greater-than operator `>`
    Gt,
    /// Less-than-or-equals operator `<=`
    LtEq,
    /// Greater-than-or-equals operator `>=`
    GtEq,
    /// Spaceship operator `<=>`
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Integer division operator `//` in DuckDB
    DuckIntDiv,
    /// Modulo operator `%`
    Mod,
    /// String concatenation `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period `.` (used for compound identifiers)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Assignment `:=`
    Assignment,
    /// SemiColon `;` (statement separator)
    SemiColon,
    /// Backslash `\`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right arrow `=>`
    RArrow,
    /// Sharp `#` (PostgreSQL bitwise XOR)
    Sharp,
    /// `##` PostgreSQL/Redshift geometric operator (point of closest proximity)
    DoubleSharp,
    /// Tilde `~` (PostgreSQL bitwise NOT, or case-sensitive regex match)
    Tilde,
    /// `~*`, case-insensitive regex match in PostgreSQL
    TildeAsterisk,
    /// `!~`, case-sensitive regex non-match in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*`, case-insensitive regex non-match in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `~~`, case-sensitive pattern match in PostgreSQL
    DoubleTilde,
    /// `~~*`, case-insensitive pattern match in PostgreSQL
    DoubleTildeAsterisk,
    /// `!~~`, case-sensitive pattern non-match in PostgreSQL
    ExclamationMarkDoubleTilde,
    /// `!~~*`, case-insensitive pattern non-match in PostgreSQL
    ExclamationMarkDoubleTildeAsterisk,
    /// `<<`, bitwise shift left in PostgreSQL
    ShiftLeft,
    /// `>>`, bitwise shift right in PostgreSQL
    ShiftRight,
    /// `&&`, overlap operator in PostgreSQL
    Overlap,
    /// Exclamation mark `!` (PostgreSQL factorial)
    ExclamationMark,
    /// Double exclamation mark `!!` (PostgreSQL prefix factorial)
    DoubleExclamationMark,
    /// AtSign `@` (PostgreSQL abs operator)
    AtSign,
    /// `^@`, a "starts with" string operator in PostgreSQL
    CaretAt,
    /// `|/`, square root operator in PostgreSQL
    PGSquareRoot,
    /// `||/`, cube root operator in PostgreSQL
    PGCubeRoot,
    /// `?` or `$`, a prepared statement argument placeholder
    Placeholder(String),
    /// `->`, extracts a JSON field in PostgreSQL
    Arrow,
    /// `->>`, extracts a JSON field as text in PostgreSQL
    LongArrow,
    /// `#>`, extracts a JSON sub-object at the specified path
    HashArrow,
    /// `@-@` PostgreSQL/Redshift geometric operator (length or circumference)
    AtDashAt,
    /// `?-` PostgreSQL/Redshift geometric operator (is horizontal?)
    QuestionMarkDash,
    /// `&<` PostgreSQL/Redshift geometric operator (overlaps to left?)
    AmpersandLeftAngleBracket,
    /// `&>` PostgreSQL/Redshift geometric operator (overlaps to right?)
    AmpersandRightAngleBracket,
    /// `&<|` PostgreSQL/Redshift geometric operator (does not extend above?)
    AmpersandLeftAngleBracketVerticalBar,
    /// `|&>` PostgreSQL/Redshift geometric operator (does not extend below?)
    VerticalBarAmpersandRightAngleBracket,
    /// `<->` PostgreSQL/Redshift geometric operator (distance between)
    TwoWayArrow,
    /// `<^` PostgreSQL/Redshift geometric operator (is below?)
    LeftAngleBracketCaret,
    /// `>^` PostgreSQL/Redshift geometric operator (is above?)
    RightAngleBracketCaret,
    /// `?#` PostgreSQL/Redshift geometric operator (intersects?)
    QuestionMarkSharp,
    /// `?-|` PostgreSQL/Redshift geometric operator (is perpendicular?)
    QuestionMarkDashVerticalBar,
    /// `?||` PostgreSQL/Redshift geometric operator (are parallel?)
    QuestionMarkDoubleVerticalBar,
    /// `~=` PostgreSQL/Redshift geometric operator (same as?)
    TildeEqual,
    /// `<<|` PostgreSQL/Redshift geometric operator (is strictly below?)
    ShiftLeftVerticalBar,
    /// `|>>` PostgreSQL/Redshift geometric operator (is strictly above?)
    VerticalBarShiftRight,
    /// `|>`, a pipe operator
    VerticalBarRightAngleBracket,
    /// `#>>`, extracts a JSON sub-object at the specified path as text
    HashLongArrow,
    /// `@>`, left contains right
    AtArrow,
    /// `<@`, right contains left
    ArrowAt,
    /// `#-`, deletes a key from a JSON object
    HashMinus,
    /// `@?`, does the JSON path return any item for the specified JSON value?
    AtQuestion,
    /// `@@`, text search match
    AtAt,
    /// `?`, does JSON contain a top level key?
    Question,
    /// `?&`, does JSON contain all of the specified top level keys?
    QuestionAnd,
    /// `?|`, does JSON contain any of the specified top level keys?
    QuestionPipe,
    /// Custom binary operator, for dialects that allow user-defined operators
    CustomBinaryOperator(String),
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::Assignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::DoubleSharp => f.write_str("##"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::AtDashAt => f.write_str("@-@"),
            Token::QuestionMarkDash => f.write_str("?-"),
            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
            Token::AmpersandRightAngleBracket => f.write_str("&>"),
            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
            Token::VerticalBarRightAngleBracket => f.write_str("|>"),
            Token::TwoWayArrow => f.write_str("<->"),
            Token::LeftAngleBracketCaret => f.write_str("<^"),
            Token::RightAngleBracketCaret => f.write_str(">^"),
            Token::QuestionMarkSharp => f.write_str("?#"),
            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
            Token::TildeEqual => f.write_str("~="),
            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
            Token::VerticalBarShiftRight => f.write_str("|>>"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
            Token::Question => write!(f, "?"),
            Token::QuestionAnd => write!(f, "?&"),
            Token::QuestionPipe => write!(f, "?|"),
            Token::CustomBinaryOperator(s) => f.write_str(s),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

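    /// Build a [`Token::Word`] from `word`, classifying it as a keyword only
    /// when it is unquoted and matches one of the known keywords.
    ///
    /// A minimal doctest sketch (assumes the crate's public re-exports
    /// `sqlparser::tokenizer::Token` and `sqlparser::keywords::Keyword`):
    ///
    /// ```
    /// # use sqlparser::tokenizer::Token;
    /// # use sqlparser::keywords::Keyword;
    /// if let Token::Word(w) = Token::make_word("select", None) {
    ///     assert_eq!(w.keyword, Keyword::SELECT);
    /// }
    /// // A quoted word is never classified as a keyword.
    /// if let Token::Word(w) = Token::make_word("select", Some('"')) {
    ///     assert_eq!(w.keyword, Keyword::NoKeyword);
    /// }
    /// ```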
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with any
    /// escape sequences processed.
    pub value: String,
    /// An identifier can be "quoted" (a delimited identifier in ANSI parlance).
    /// The standard and most implementations use double quotes for this, but
    /// some implementations support other quoting styles (e.g. MS SQL brackets).
    pub quote_style: Option<char>,
    /// If the word was not quoted and matched one of the known keywords, this
    /// holds the matching [`Keyword`]; otherwise `Keyword::NoKeyword`.
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"', // ANSI and most dialects
            '[' => ']', // MS SQL
            '`' => '`', // MySQL
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}

/// Location in the input string.
#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1.
    ///
    /// Note: line 0 is used for empty spans.
    pub line: u64,
    /// Line column, starting from 1.
    ///
    /// Note: column 0 is used for empty spans.
    pub column: u64,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        if self.line == 0 {
            return Ok(());
        }
        write!(f, " at Line: {}, Column: {}", self.line, self.column)
    }
}

impl fmt::Debug for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Location({},{})", self.line, self.column)
    }
}

impl Location {
    /// Return an "empty" / unknown location.
    pub fn empty() -> Self {
        Self { line: 0, column: 0 }
    }

    /// Create a new `Location` for a given line and column.
    pub fn new(line: u64, column: u64) -> Self {
        Self { line, column }
    }

    /// Create a new location for a given line and column.
    ///
    /// Alias for [`Self::new`].
    pub fn of(line: u64, column: u64) -> Self {
        Self::new(line, column)
    }
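
    /// Combine `self` and `end` into a new [`Span`].
    ///
    /// A minimal doctest sketch (assumes `Location` is re-exported from
    /// `sqlparser::tokenizer`):
    ///
    /// ```
    /// # use sqlparser::tokenizer::Location;
    /// let span = Location::of(1, 1).span_to(Location::of(1, 7));
    /// assert_eq!(span.start, Location::of(1, 1));
    /// assert_eq!(span.end, Location::of(1, 7));
    /// ```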
    pub fn span_to(self, end: Self) -> Span {
        Span { start: self, end }
    }
}

impl From<(u64, u64)> for Location {
    fn from((line, column): (u64, u64)) -> Self {
        Self { line, column }
    }
}

/// A span of locations in the input string: `start` up to `end`.
#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    pub start: Location,
    pub end: Location,
}

impl fmt::Debug for Span {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Span({:?}..{:?})", self.start, self.end)
    }
}

impl Span {
    // An empty span: (0, 0) -> (0, 0).
    const EMPTY: Span = Self::empty();

    /// Create a new span from a start and end [`Location`].
    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    /// Return an "empty" / unknown span: (0, 0) -> (0, 0).
    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }

    /// Return the smallest `Span` that contains both `self` and `other`.
    /// If either span is empty, the other is returned unchanged.
    pub fn union(&self, other: &Span) -> Span {
        match (self, other) {
            (&Span::EMPTY, _) => *other,
            (_, &Span::EMPTY) => *self,
            _ => Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            },
        }
    }

    /// Same as [`Span::union`], but `other` may be `None`.
    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        match other {
            Some(other) => self.union(other),
            None => *self,
        }
    }
629
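
    /// Return the smallest `Span` that contains every span produced by `iter`,
    /// or [`Span::empty`] if the iterator is empty.
    ///
    /// A minimal doctest sketch (assumes `Location` and `Span` are re-exported
    /// from `sqlparser::tokenizer`):
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Location, Span};
    /// let spans = vec![
    ///     Span::new(Location::of(1, 1), Location::of(1, 7)),
    ///     Span::new(Location::of(2, 1), Location::of(2, 5)),
    /// ];
    /// assert_eq!(
    ///     Span::union_iter(spans),
    ///     Span::new(Location::of(1, 1), Location::of(2, 5)),
    /// );
    /// ```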
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        iter.into_iter()
            .reduce(|acc, item| acc.union(&item))
            .unwrap_or(Span::empty())
    }
}

#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;

/// A [`Token`] with [`Span`] (location) information attached.
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    pub token: Token,
    pub span: Span,
}

impl TokenWithSpan {
    /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`].
    pub fn new(token: Token, span: Span) -> Self {
        Self { token, span }
    }

    /// Wrap a [`Token`] with an empty [`Span`].
    pub fn wrap(token: Token) -> Self {
        Self::new(token, Span::empty())
    }

    /// Wrap a [`Token`] with a [`Span`] from `start` to `end`.
    pub fn at(token: Token, start: Location, end: Location) -> Self {
        Self::new(token, Span::new(start, end))
    }

    /// Return an EOF token with an empty [`Span`].
    pub fn new_eof() -> Self {
        Self::wrap(Token::EOF)
    }
}

impl PartialEq<Token> for TokenWithSpan {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithSpan> for Token {
    fn eq(&self, other: &TokenWithSpan) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithSpan {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

/// Tokenizer error.
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}{}", self.message, self.location)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

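/// Tokenizer state: a peekable character stream plus the 1-based line and
/// column of the next character to be read.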
struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}

impl State<'_> {
    /// Return the next character and advance the stream, updating line/column.
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    /// Return the next character without advancing the stream.
    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}

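/// How many quote characters delimit a string literal: one (e.g. `'abc'`)
/// or several (e.g. BigQuery's triple-quoted `'''abc'''`).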
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    One,
    Many(NonZeroU8),
}

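/// Settings for tokenizing a quoted string: the quote character, how many
/// consecutive quote characters delimit the literal, how many opening quotes
/// remain to be consumed (0 if the caller has already consumed them), and
/// whether backslash escapes are honored.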
struct TokenizeQuotedStringSettings {
    quote_style: char,
    num_quote_chars: NumStringQuoteChars,
    num_opening_quotes_to_consume: u8,
    backslash_escape: bool,
}

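/// SQL tokenizer: splits a SQL query string into a sequence of [`Token`]s
/// according to the rules of the given [`Dialect`].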
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    /// If true (the default), the tokenizer unescapes literal values;
    /// if false, the raw query text is preserved in the tokens.
    unescape: bool,
}

impl<'a> Tokenizer<'a> {
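    /// Create a new SQL tokenizer for the specified SQL statement.
    ///
    /// A minimal doctest sketch (assumes the crate's public re-exports
    /// `sqlparser::tokenizer::{Token, Whitespace, Tokenizer}` and
    /// `sqlparser::dialect::GenericDialect`):
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// let dialect = GenericDialect {};
    /// let query = r#"SELECT 'foo'"#;
    /// let tokens = Tokenizer::new(&dialect, query).tokenize().unwrap();
    /// assert_eq!(tokens, vec![
    ///     Token::make_word("SELECT", None),
    ///     Token::Whitespace(Whitespace::Space),
    ///     Token::SingleQuotedString("foo".to_string()),
    /// ]);
    /// ```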
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }

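    /// Set whether the tokenizer should unescape literal values (true by
    /// default). When false, the raw text as written in the query is kept:
    /// for example, with unescaping the SQL literal `'It''s'` yields `It's`,
    /// and without it, `It''s`.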
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }

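    /// Tokenize the statement and produce a vector of tokens.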
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;
        Ok(twl.into_iter().map(|t| t.token).collect())
    }

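    /// Tokenize the statement and produce a vector of tokens with location
    /// information.
    ///
    /// A minimal doctest sketch (assumes the re-exports used above):
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Location, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 1")
    ///     .tokenize_with_location()
    ///     .unwrap();
    /// // `SELECT` spans columns 1..7 on line 1.
    /// assert_eq!(tokens[0].span.start, Location::of(1, 1));
    /// assert_eq!(tokens[0].span.end, Location::of(1, 7));
    /// ```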
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
        let mut tokens: Vec<TokenWithSpan> = vec![];
        self.tokenize_with_location_into_buf(&mut tokens)
            .map(|_| tokens)
    }

    /// Tokenize the statement and append tokens with location information
    /// into the provided buffer.
    pub fn tokenize_with_location_into_buf(
        &mut self,
        buf: &mut Vec<TokenWithSpan>,
    ) -> Result<(), TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let span = location.span_to(state.location());

            buf.push(TokenWithSpan { token, span });

            location = state.location();
        }
        Ok(())
    }

    // Tokenize the identifier or keyword starting with `ch`.
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // If the "word" consists only of digits and dots, it is actually a
        // number that happened to start with an identifier character.
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }


    /// Get the next token, or return `None` at end of input.
    fn next_token(
        &self,
        chars: &mut State,
        prev_token: Option<&Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // byte string literal: b'...' / B'...' / b"..." / B"..."
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
                {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '\'',
                                        false,
                                        Token::SingleQuotedByteStringLiteral,
                                        Token::TripleSingleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '"',
                                        false,
                                        Token::DoubleQuotedByteStringLiteral,
                                        Token::TripleDoubleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with a "b" or "B"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // raw string literal: r'...' / R'...' / r"..." / R"..."
                r @ 'R' | r @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                false,
                                Token::SingleQuotedRawStringLiteral,
                                Token::TripleSingleQuotedRawStringLiteral,
                            ),
                        Some('\"') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                false,
                                Token::DoubleQuotedRawStringLiteral,
                                Token::TripleDoubleQuotedRawStringLiteral,
                            ),
                        _ => {
                            // regular identifier starting with an "r" or "R"
                            let s = self.tokenize_word(r, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let backslash_escape =
                                self.dialect.supports_string_literal_backslash_escape();
                            let s =
                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N" or "n"
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
                    let starting_loc = chars.location();
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // E'...' - a <character string literal> with escapes
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "E" or "e"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
                    chars.next(); // consume, to check the next char
                    if chars.peek() == Some(&'&') {
                        // We cannot advance the iterator here, as we need to consume the '&'
                        // later if the 'u' turns out to be a regular identifier.
                        let mut chars_clone = chars.peekable.clone();
                        chars_clone.next(); // consume the '&' in the clone
                        if chars_clone.peek() == Some(&'\'') {
                            chars.next(); // consume the '&' in the original iterator
                            let s = unescape_unicode_single_quoted_string(chars)?;
                            return Ok(Some(Token::UnicodeStringLiteral(s)));
                        }
                    }
                    let s = self.tokenize_word(x, chars);
                    Ok(Some(Token::make_word(&s, None)))
                }
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a <hexadecimal string literal>
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "X" or "x"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // single quoted string
                '\'' => {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::SingleQuotedString,
                                Token::TripleSingleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '\'',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // double quoted string
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::DoubleQuotedString,
                                Token::TripleDoubleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '"',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if self.dialect.is_delimited_identifier_start(ch) => {
                    let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                    Ok(Some(Token::make_word(&word, Some(quote_start))))
                }
                // a delimited (quoted) identifier that may contain a nested
                // delimited identifier
                quote_start
                    if self
                        .dialect
                        .is_nested_delimited_identifier_start(quote_start)
                        && self
                            .dialect
                            .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                            .is_some() =>
                {
                    let Some((quote_start, nested_quote_start)) = self
                        .dialect
                        .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                    else {
                        return self.tokenizer_error(
                            chars.location(),
                            format!("Expected nested delimiter '{quote_start}' before EOF."),
                        );
                    };

                    let Some(nested_quote_start) = nested_quote_start else {
                        let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                        return Ok(Some(Token::make_word(&word, Some(quote_start))));
                    };

                    let mut word = vec![];
                    let quote_end = Word::matching_end_quote(quote_start);
                    let nested_quote_end = Word::matching_end_quote(nested_quote_start);
                    let error_loc = chars.location();

                    chars.next(); // consume the opening quote
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&nested_quote_start) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
                        );
                    }
                    word.push(nested_quote_start.into());
                    word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
                    word.push(nested_quote_end.into());
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&quote_end) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        );
                    }
                    chars.next(); // consume the closing quote

                    Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
                }
                // numbers and period
                '0'..='9' | '.' => {
                    // Special case: if `._` is encountered after a word, the dot is a
                    // compound-identifier separator (e.g. `a._b` is `a` `.` `_b`),
                    // not the start of a number.
                    if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
                        if let Some(Token::Word(_)) = prev_token {
                            chars.next();
                            return Ok(Some(Token::Period));
                        }

                        return self.tokenizer_error(
                            chars.location(),
                            "Unexpected character '_'".to_string(),
                        );
                    }

                    // Some dialects support underscores as numeric separators; an
                    // underscore only continues the number if a digit follows it.
                    let is_number_separator = |ch: char, next_char: Option<char>| {
                        self.dialect.supports_numeric_literal_underscores()
                            && ch == '_'
                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
                    };

                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // match hex literals that start with 0x
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
                        });
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }

                    // If the dialect supports identifiers with a numeric prefix and the
                    // previous token was a word, this lone dot is a compound-identifier
                    // separator rather than the start of a decimal number.
                    if s == "." && self.dialect.supports_numeric_prefix() {
                        if let Some(Token::Word(_)) = prev_token {
                            return Ok(Some(Token::Period));
                        }
                    }

                    // consume fractional digits
                    s += &peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // no number -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // parse exponent as number
                    let mut exponent_part = String::new();
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        // optional sign
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // Definitely an exponent: bring the original iterator up
                            // to speed and keep the digits.
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            // Not an exponent; discard the speculative work.
                            _ => (),
                        }
                    }

                    // If the dialect supports identifiers that start with a numeric
                    // prefix, check whether the value is in fact an identifier and
                    // must be tokenized as a word.
                    if self.dialect.supports_numeric_prefix() {
                        if exponent_part.is_empty() {
                            // If it is not a number with an exponent, it may be an
                            // identifier starting with digits.
                            let word =
                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                            if !word.is_empty() {
                                s += word.as_str();
                                return Ok(Some(Token::make_word(s.as_str(), None)));
                            }
                        } else if prev_token == Some(&Token::Period) {
                            // If the previous token was a period, thus not belonging to
                            // a number, the value we have is part of an identifier.
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }

                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'
                    match chars.peek() {
                        Some('-') => {
                            let mut is_comment = true;
                            if self.dialect.requires_single_line_comment_whitespace() {
                                is_comment = Some(' ') == chars.peekable.clone().nth(1);
                            }

                            if is_comment {
                                chars.next(); // consume the second '-', starting a single-line comment
                                let comment = self.tokenize_single_line_comment(chars);
                                return Ok(Some(Token::Whitespace(
                                    Whitespace::SingleLineComment {
                                        prefix: "--".to_owned(),
                                        comment,
                                    },
                                )));
                            }

                            self.start_binop(chars, "-", Token::Minus)
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
                                _ => self.start_binop(chars, "->", Token::Arrow),
                            }
                        }
                        // a regular '-' operator
                        _ => self.start_binop(chars, "-", Token::Minus),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a snowflake single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => {
                    chars.next(); // consume the '%'
                    match chars.peek() {
                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "%", Token::Mod),
                    }
                }
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => {
                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
                                }
                                _ => self.start_binop(chars, "||", Token::StringConcat),
                            }
                        }
                        Some('&') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '&'
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|&>",
                                    Token::VerticalBarAmpersandRightAngleBracket,
                                ),
                                _ => self.start_binop_opt(chars, "|&", None),
                            }
                        }
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '>'
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|>>",
                                    Token::VerticalBarShiftRight,
                                ),
                                _ => self.start_binop_opt(chars, "|>", None),
                            }
                        }
                        Some('>') if self.dialect.supports_pipe_operator() => {
                            self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
                        }
                        // a regular '|' operator
                        _ => self.start_binop(chars, "|", Token::Pipe),
                    }
                }
                '=' => {
                    chars.next(); // consume the '='
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume the '!'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                Some('~') => {
                                    chars.next();
                                    match chars.peek() {
                                        Some('*') => self.consume_and_return(
                                            chars,
                                            Token::ExclamationMarkDoubleTildeAsterisk,
                                        ),
                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
                                    }
                                }
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume the '<'
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
                                _ => self.start_binop(chars, "<=", Token::LtEq),
                            }
                        }
                        Some('|') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
                        }
                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the second '<'
                            match chars.peek() {
                                Some('|') => self.consume_for_binop(
                                    chars,
                                    "<<|",
                                    Token::ShiftLeftVerticalBar,
                                ),
                                _ => self.start_binop(chars, "<<", Token::ShiftLeft),
                            }
                        }
                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '-'
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
                                }
                                _ => self.start_binop_opt(chars, "<-", None),
                            }
                        }
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
                        }
                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
                        _ => self.start_binop(chars, "<", Token::Lt),
                    }
                }
                '>' => {
                    chars.next(); // consume the '>'
                    match chars.peek() {
                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
                        }
                        _ => self.start_binop(chars, ">", Token::Gt),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::Assignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => {
                    chars.next(); // consume the '&'
                    match chars.peek() {
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
                        }
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '<'
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::AmpersandLeftAngleBracketVerticalBar,
                                ),
                                _ => {
                                    self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
                                }
                            }
                        }
                        Some('&') => {
                            chars.next(); // consume the second '&'
                            self.start_binop(chars, "&&", Token::Overlap)
                        }
                        // a regular '&' operator
                        _ => self.start_binop(chars, "&", Token::Ampersand),
                    }
                }
                '^' => {
                    chars.next(); // consume the '^'
                    match chars.peek() {
                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
                        _ => Ok(Some(Token::Caret)),
                    }
                }
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
                {
                    chars.next(); // consume the '#', starting a single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); // consume the '~'
                    match chars.peek() {
                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
                        Some('=') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "~=", Token::TildeEqual)
                        }
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => {
                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
                                }
                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
                            }
                        }
                        _ => self.start_binop(chars, "~", Token::Tilde),
                    }
                }
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
                                }
                                _ => self.start_binop(chars, "#>", Token::HashArrow),
                            }
                        }
                        Some(' ') => Ok(Some(Token::Sharp)),
                        Some('#') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "##", Token::DoubleSharp)
                        }
                        Some(sch) if self.dialect.is_identifier_start('#') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "#", Token::Sharp),
                    }
                }
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('@') if self.dialect.supports_geometric_types() => {
                            self.consume_and_return(chars, Token::AtAt)
                        }
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            match chars.peek() {
                                Some('@') => self.consume_and_return(chars, Token::AtDashAt),
                                _ => self.start_binop_opt(chars, "@-", None),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => {
                            chars.next();
                            match chars.peek() {
                                Some(' ') => Ok(Some(Token::AtAt)),
                                Some(tch) if self.dialect.is_identifier_start('@') => {
                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
                                }
                                _ => Ok(Some(Token::AtAt)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::AtSign)),
                        // A quote immediately after '@' is left for the string tokenizer:
                        // return just the AtSign (e.g. MySQL session variables like
                        // @'var' are assembled at parse time).
                        Some('\'') => Ok(Some(Token::AtSign)),
                        Some('\"') => Ok(Some(Token::AtSign)),
                        Some('`') => Ok(Some(Token::AtSign)),
                        Some(sch) if self.dialect.is_identifier_start('@') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                '?' if self.dialect.supports_geometric_types() => {
                    chars.next(); // consume the '?'
                    match chars.peek() {
                        Some('|') => {
                            chars.next();
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::QuestionMarkDoubleVerticalBar,
                                ),
                                _ => Ok(Some(Token::QuestionPipe)),
                            }
                        }
                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
                        Some('-') => {
                            chars.next(); // consume the '-'
                            match chars.peek() {
                                Some('|') => self
                                    .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
                                _ => Ok(Some(Token::QuestionMarkDash)),
                            }
                        }
                        Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
                        _ => self.consume_and_return(chars, Token::Question),
                    }
                }
                '?' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
                }

                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    self.tokenize_identifier_or_keyword([ch], chars)
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                // whitespace check (including unicode chars) should be last,
                // as it covers some of the chars above
                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    /// Consume the next character, then parse a custom binary operator or
    /// return the default token.
    fn consume_for_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        self.start_binop_opt(chars, prefix, Some(default))
    }

    /// Parse a custom binary operator or return the default token.
    fn start_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        self.start_binop_opt(chars, prefix, Some(default))
    }

    /// Parse a custom binary operator. If none is found and `default` is
    /// `None`, report an error.
    fn start_binop_opt(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Option<Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut custom = None;
        while let Some(&ch) = chars.peek() {
            if !self.dialect.is_custom_operator_part(ch) {
                break;
            }

            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
            chars.next();
        }
        match (custom, default) {
            (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
            (None, Some(tok)) => Ok(Some(tok)),
            (None, None) => self.tokenizer_error(
                chars.location(),
                format!("Expected a valid binary operator after '{prefix}'"),
            ),
        }
    }

    /// Tokenize a dollar-preceded value (i.e. a dollar-quoted string or a
    /// placeholder).
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        chars.next();

        // If the dialect does not support dollar placeholders, `$$` starts a
        // dollar-quoted string.
        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric()
                    || ch == '_'
                    // Allow $ as a placeholder character if the dialect supports it
                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
            }));

            // If the dialect does not support dollar placeholders, look for a
            // tagged dollar-quoted string (e.g. $tag$ ... $tag$).
            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
                chars.next();

                let mut temp = String::new();
                let end_delimiter = format!("${value}$");

                loop {
                    match chars.next() {
                        Some(ch) => {
                            temp.push(ch);

                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }
                        }
                        None => {
                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }

                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted, expected $",
                            );
                        }
                    }
                }
            } else {
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }

    fn tokenizer_error<R>(
        &self,
        loc: Location,
        message: impl Into<String>,
    ) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            location: loc,
        })
    }

    // Consume characters until newline
    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
        let mut comment = peeking_take_while(chars, |ch| match ch {
            '\n' => false,                                           // Always stop at \n
            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Also stop at \r for Postgres
            _ => true,                                               // Keep consuming
        });

        if let Some(ch) = chars.next() {
            assert!(ch == '\n' || ch == '\r');
            comment.push(ch);
        }

        comment
    }

    /// Tokenize an identifier or keyword, after the first char is already consumed.
    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
        let mut s = first_chars.into();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

    /// Read a quoted identifier, handling escaped (doubled) closing quotes.
    fn tokenize_quoted_identifier(
        &self,
        quote_start: char,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        let error_loc = chars.location();
        chars.next(); // consume the opening quote
        let quote_end = Word::matching_end_quote(quote_start);
        let (s, last_char) = self.parse_quoted_ident(chars, quote_end);

        if last_char == Some(quote_end) {
            Ok(s)
        } else {
            self.tokenizer_error(
                error_loc,
                format!("Expected close delimiter '{quote_end}' before EOF."),
            )
        }
    }

    /// Read an escaped single-quoted string, starting at the opening quote.
    fn tokenize_escaped_single_quoted_string(
        &self,
        starting_loc: Location,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        if let Some(s) = unescape_single_quoted_string(chars) {
            return Ok(s);
        }

        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
    }

    /// Read a string literal quoted by single or triple quote characters,
    /// e.g. `'abc'`, `'''abc'''`, or `"""abc"""`.
    fn tokenize_single_or_triple_quoted_string<F>(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
        single_quote_token: F,
        triple_quote_token: F,
    ) -> Result<Option<Token>, TokenizerError>
    where
        F: Fn(String) -> Token,
    {
        let error_loc = chars.location();

        let mut num_opening_quotes = 0u8;
        for _ in 0..3 {
            if Some(&quote_style) == chars.peek() {
                chars.next(); // consume the quote
                num_opening_quotes += 1;
            } else {
                break;
            }
        }

        let (token_fn, num_quote_chars) = match num_opening_quotes {
            1 => (single_quote_token, NumStringQuoteChars::One),
            2 => {
                // An empty string literal, e.g. '' or ""
                return Ok(Some(single_quote_token("".into())));
            }
            3 => {
                let Some(num_quote_chars) = NonZeroU8::new(3) else {
                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
                };
                (
                    triple_quote_token,
                    NumStringQuoteChars::Many(num_quote_chars),
                )
            }
            _ => {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        };

        let settings = TokenizeQuotedStringSettings {
            quote_style,
            num_quote_chars,
            // The opening quotes have already been consumed above.
            num_opening_quotes_to_consume: 0,
            backslash_escape,
        };

        self.tokenize_quoted_string(chars, settings)
            .map(token_fn)
            .map(Some)
    }

    /// Read a string literal quoted by a single quote character.
    fn tokenize_single_quoted_string(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
    ) -> Result<String, TokenizerError> {
        self.tokenize_quoted_string(
            chars,
            TokenizeQuotedStringSettings {
                quote_style,
                num_quote_chars: NumStringQuoteChars::One,
                num_opening_quotes_to_consume: 1,
                backslash_escape,
            },
        )
    }

    /// Read a quoted string according to the given settings.
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        // Consume any remaining opening quotes.
        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume

                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // For an input like `"""abc"""`, at this point the buffer holds
                        // `abc""` and the final `"` was just matched. The string to
                        // return is `abc`, so strip the trailing quotes off the buffer.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        // An escaped (doubled) quote.
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    // consume the backslash
                    chars.next();

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape
                            || (self.dialect.ignores_wildcard_escapes()
                                && (*next == '%' || *next == '_'))
                        {
                            // In no-escape mode, the given query has to be saved
                            // completely, including backslashes. Similarly, when the
                            // dialect ignores wildcard escapes, the backslash before
                            // a wildcard character is kept.
                            s.push(ch);
                            s.push(*next);
                            chars.next(); // consume the escaped char
                        } else {
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next(); // consume the escaped char
                        }
                    }
                }
                ch => {
                    chars.next(); // consume
                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }

    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
        let supports_nested_comments = self.dialect.supports_nested_comments();

        loop {
            match chars.next() {
                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
                    chars.next(); // consume the '*'
                    s.push('/');
                    s.push('*');
                    nested += 1;
                }
                Some('*') if matches!(chars.peek(), Some('/')) => {
                    chars.next(); // consume the '/'
                    nested -= 1;
                    if nested == 0 {
                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                    }
                    s.push('*');
                    s.push('/');
                }
                Some(ch) => {
                    s.push(ch);
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    );
                }
            }
        }
    }

    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
        let mut last_char = None;
        let mut s = String::new();
        while let Some(ch) = chars.next() {
            if ch == quote_end {
                if chars.peek() == Some(&quote_end) {
                    // A doubled end quote escapes a literal quote character.
                    chars.next();
                    s.push(ch);
                    if !self.unescape {
                        // In no-escape mode, the given query has to be saved completely
                        s.push(ch);
                    }
                } else {
                    last_char = Some(quote_end);
                    break;
                }
            } else {
                s.push(ch);
            }
        }
        (s, last_char)
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut State,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

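/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Return the characters read as a `String`, and keep the first non-matching
/// char available as `chars.next()`.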
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

/// Same as [`peeking_take_while`], but the predicate can also inspect the
/// character following the current one.
fn peeking_next_take_while(
    chars: &mut State,
    mut predicate: impl FnMut(char, Option<char>) -> bool,
) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        let next_char = chars.peekable.clone().nth(1);
        if predicate(ch, next_char) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
    Unescape::new(chars).unescape()
}

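/// Helper that unescapes a single-quoted string body: doubled quotes,
/// C-style escapes (`\n`, `\t`, ...), octal, hex (`\xNN`), and unicode
/// (`\uXXXX`, `\UXXXXXXXX`) sequences. Returns `None` on a malformed or
/// unterminated literal.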
struct Unescape<'a: 'b, 'b> {
    chars: &'b mut State<'a>,
}

impl<'a: 'b, 'b> Unescape<'a, 'b> {
    fn new(chars: &'b mut State<'a>) -> Self {
        Self { chars }
    }

    fn unescape(mut self) -> Option<String> {
        let mut unescaped = String::new();

        self.chars.next(); // consume the opening quote

        while let Some(c) = self.chars.next() {
            if c == '\'' {
                // An escaped (doubled) quote.
                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                    self.chars.next();
                    unescaped.push('\'');
                    continue;
                }
                return Some(unescaped);
            }

            if c != '\\' {
                unescaped.push(c);
                continue;
            }

            let c = match self.chars.next()? {
                'b' => '\u{0008}',
                'f' => '\u{000C}',
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                'u' => self.unescape_unicode_16()?,
                'U' => self.unescape_unicode_32()?,
                'x' => self.unescape_hex()?,
                c if c.is_digit(8) => self.unescape_octal(c)?,
                c => c,
            };

            unescaped.push(Self::check_null(c)?);
        }

        None
    }

    #[inline]
    fn check_null(c: char) -> Option<char> {
        if c == '\0' {
            None
        } else {
            Some(c)
        }
    }

    #[inline]
    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
        // Only 7-bit ASCII results are accepted; anything larger fails the unescape.
        match u32::from_str_radix(s, RADIX) {
            Err(_) => None,
            Ok(n) => {
                let n = n & 0xFF;
                if n <= 127 {
                    char::from_u32(n)
                } else {
                    None
                }
            }
        }
    }

    // Hexadecimal escape: \xh or \xhh (one or two hex digits).
    fn unescape_hex(&mut self) -> Option<char> {
        let mut s = String::new();

        for _ in 0..2 {
            match self.next_hex_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        if s.is_empty() {
            return Some('x');
        }

        Self::byte_to_char::<16>(&s)
    }

    #[inline]
    fn next_hex_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
            _ => None,
        }
    }

    // Octal escape: \o, \oo, or \ooo (one to three octal digits).
    fn unescape_octal(&mut self, c: char) -> Option<char> {
        let mut s = String::new();

        s.push(c);
        for _ in 0..2 {
            match self.next_octal_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        Self::byte_to_char::<8>(&s)
    }

    #[inline]
    fn next_octal_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_digit(8) => self.chars.next(),
            _ => None,
        }
    }

    // Unicode escape: \uXXXX (exactly four hex digits).
    fn unescape_unicode_16(&mut self) -> Option<char> {
        self.unescape_unicode::<4>()
    }

    // Unicode escape: \UXXXXXXXX (exactly eight hex digits).
    fn unescape_unicode_32(&mut self) -> Option<char> {
        self.unescape_unicode::<8>()
    }

    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
        let mut s = String::new();
        for _ in 0..NUM {
            s.push(self.chars.next()?);
        }
        match u32::from_str_radix(&s, 16) {
            Err(_) => None,
            Ok(n) => char::from_u32(n),
        }
    }
}

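/// Unescape the body of a PostgreSQL `U&'...'` unicode string literal:
/// `\\` yields a backslash, `\XXXX` a 4-hex-digit code point, and
/// `\+XXXXXX` a 6-hex-digit code point; doubled quotes escape a quote.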
fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
    let mut unescaped = String::new();
    chars.next(); // consume the opening quote
    while let Some(c) = chars.next() {
        match c {
            '\'' => {
                if chars.peek() == Some(&'\'') {
                    chars.next();
                    unescaped.push('\'');
                } else {
                    return Ok(unescaped);
                }
            }
            '\\' => match chars.peek() {
                Some('\\') => {
                    chars.next();
                    unescaped.push('\\');
                }
                Some('+') => {
                    chars.next();
                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
                }
                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
            },
            _ => {
                unescaped.push(c);
            }
        }
    }
    Err(TokenizerError {
        message: "Unterminated unicode encoded string literal".to_string(),
        location: chars.location(),
    })
}

fn take_char_from_hex_digits(
    chars: &mut State<'_>,
    max_digits: usize,
) -> Result<char, TokenizerError> {
    let mut result = 0u32;
    for _ in 0..max_digits {
        let next_char = chars.next().ok_or_else(|| TokenizerError {
            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
                .to_string(),
            location: chars.location(),
        })?;
        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
            message: format!("Invalid hex digit in escaped unicode string: {next_char}"),
            location: chars.location(),
        })?;
        result = result * 16 + digit;
    }
    char::from_u32(result).ok_or_else(|| TokenizerError {
        message: format!("Invalid unicode character: {result:x}"),
        location: chars.location(),
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dialect::{
        BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
    };
    use crate::test_utils::all_dialects_where;
    use core::fmt::Debug;

    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_clickhouse_double_equal() {
        let sql = String::from("SELECT foo=='1'");
        let dialect = ClickHouseDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "foo".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::DoubleEq,
            Token::SingleQuotedString("1".to_string()),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_numeric_literal_underscore() {
        let dialect = GenericDialect {};
        let sql = String::from("SELECT 10_000");
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("10".to_string(), false),
            Token::make_word("_000", None),
        ];
        compare(expected, tokens);

        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
            "SELECT 10_000, _10_000, 10_00_, 10___0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_000".to_string(), false),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                // A leading underscore starts a word, not a number.
                Token::make_word("_10_000", None),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_00".to_string(), false),
                // A trailing underscore is not part of the number.
                Token::make_word("_", None),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10".to_string(), false),
                // Consecutive underscores end the number.
                Token::make_word("___0", None),
            ],
        );
    }

    #[test]
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }

    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

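    // A tagged dollar-quoted string only terminates at a matching `$tag$`, so
    // `$`, `$$`, and differently tagged delimiters may appear in the body.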
    #[test]
    fn tokenize_dollar_quoted_string_tagged() {
        let test_cases = vec![
            (
                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
                        tag: Some("tag".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$x$ab$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "x$ab".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("0$abc$$abc$1"),
                vec![
                    Token::Number("0".into(), false),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    }),
                    Token::Number("1".into(), false),
                ]
            ),
            (
                String::from("$function$abc$q$data$q$$function$"),
                vec![
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "abc$q$data$q$".into(),
                        tag: Some("function".into()),
                    }),
                ]
            ),
        ];

        let dialect = GenericDialect {};
        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated() {
        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 91
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
        let sql = String::from("SELECT $abc$abc$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 17
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_placeholder() {
        let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
        let dialect = SQLiteDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$ABC$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC".into()),
            ]
        );
    }

    #[test]
    fn tokenize_nested_dollar_quoted_strings() {
        let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "dollar $nested$ string".into(),
                tag: Some("tag".into()),
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_empty() {
        let sql = String::from("SELECT $$$$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged() {
        let sql =
            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_unterminated() {
        let sql = String::from(
            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
        );
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted string".into(),
                location: Location {
                    line: 1,
                    column: 86
                }
            })
        );
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let test_cases = vec![
            (
                String::from("0--this is a comment\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                String::from("0--this is a comment\r1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r1".to_string(),
                    }),
                ],
            ),
            (
                String::from("0--this is a comment\r\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
        ];

        let dialect = GenericDialect {};

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_comment_postgres() {
        let sql = String::from("1--\r0");

        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "\r".to_string(),
            }),
            Token::Number("0".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

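    // GenericDialect supports nested block comments: every inner `/*` must be
    // matched by its own `*/` before the outer comment can close.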
    #[test]
    fn tokenize_nested_multiline_comment() {
        let dialect = GenericDialect {};
        let test_cases = vec![
            (
                "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::MultiLineComment(
                        "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
                    )),
                    Token::Whitespace(Whitespace::Space),
                    Token::Div,
                    Token::Word(Word {
                        value: "comment".to_string(),
                        quote_style: None,
                        keyword: Keyword::COMMENT,
                    }),
                    Token::Mul,
                    Token::Div,
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::MultiLineComment(
                        "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
                    )),
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                "SELECT 1/* a /* b */ c */0",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Number("1".to_string(), false),
                    Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
                    Token::Number("0".to_string(), false),
                ],
            ),
        ];

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_nested_multiline_comment_empty() {
        let sql = "select 1/*/**/*/0";

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("select"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
            Token::Number("0".to_string(), false),
        ];

        compare(expected, tokens);
    }

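    // When a dialect does not support nesting (e.g. SQLite), the first `*/`
    // terminates the comment and the remaining `*/` tokenizes as `*` and `/`.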
    #[test]
    fn tokenize_nested_comments_if_not_supported() {
        let dialect = SQLiteDialect {};
        let sql = "SELECT 1/*/* nested comment */*/0";
        let tokens = Tokenizer::new(&dialect, sql).tokenize();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "/* nested comment ".to_string(),
            )),
            Token::Mul,
            Token::Div,
            Token::Number("0".to_string(), false),
        ];

        compare(expected, tokens.unwrap());
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                location: Location { line: 1, column: 1 },
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_like_match() {
        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_snowflake_div() {
        let sql = r#"field/1000"#;
        let dialect = SnowflakeDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word(r#"field"#, None),
            Token::Div,
            Token::Number("1000".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

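    // Spans use 1-based line/column numbers with an exclusive end: `SELECT`
    // covers (1,1)..(1,7), and the newline token ends at (2,1).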
    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (1, 7).into(),
                (1, 8).into(),
            ),
            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Newline),
                (1, 10).into(),
                (2, 1).into(),
            ),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (2, 1).into(),
                (2, 2).into(),
            ),
            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
        ];
        compare(expected, tokens);
    }

    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        assert_eq!(expected, actual);
    }

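    /// Wraps `s` in single quotes and runs it through
    /// `unescape_single_quoted_string`, asserting the result matches
    /// `expected` (`None` means the input must be rejected).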
    fn check_unescape(s: &str, expected: Option<&str>) {
        let s = format!("'{s}'");
        let mut state = State {
            peekable: s.chars().peekable(),
            line: 0,
            col: 0,
        };

        assert_eq!(
            unescape_single_quoted_string(&mut state),
            expected.map(|s| s.to_string())
        );
    }

    #[test]
    fn test_unescape() {
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

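        // Unicode escapes: \u takes exactly 4 hex digits and \U exactly 8;
        // NUL and out-of-range code points are rejected.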
        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None);
        check_unescape(r"\u0000", None);
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
        check_unescape(r"\U00110000", None);
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

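        // Hex escapes: \x consumes up to two hex digits, falling back to a
        // literal `x` when none follow.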
        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}"));
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x"));
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

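        // Octal escapes: up to three octal digits, e.g. \123 = 0o123 = 'S'.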
        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}"));
        check_unescape(r"\1232", Some("\u{0053}2"));
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None);
        check_unescape(r"\080", None);

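        // Anything else passes through unchanged, and a doubled quote
        // unescapes to a single quote.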
        check_unescape(r"\9", Some("9"));
        check_unescape(r"''", Some("'"));
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }

    #[test]
    fn tokenize_numeric_prefix_trait() {
        #[derive(Debug)]
        struct NumericPrefixDialect;

        impl Dialect for NumericPrefixDialect {
            fn is_identifier_start(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '$'
            }

            fn is_identifier_part(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '_'
                    || ch == '$'
                    || ch == '{'
                    || ch == '}'
            }

            fn supports_numeric_prefix(&self) -> bool {
                true
            }
        }

        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
        tokenize_numeric_prefix_inner(&HiveDialect {});
        tokenize_numeric_prefix_inner(&MySqlDialect {});
    }

    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
        let sql = r#"SELECT * FROM 1"#;
        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_string_escape() {
        let dialect = SnowflakeDialect {};
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
            (r#"'\\'"#, r#"\\"#, r#"\"#),
            (
                r#"'\0\a\b\f\n\r\t\Z'"#,
                r#"\0\a\b\f\n\r\t\Z"#,
                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
            ),
            (r#"'\"'"#, r#"\""#, "\""),
            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
            (r#"'\q'"#, r#"\q"#, r#"q"#),
            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
        ] {
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected.to_string())];
            compare(expected, tokens);

            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }

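        // Without backslash-escape support (GenericDialect), the same inputs
        // tokenize fine and the trailing backslash is kept literally.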
        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
            let dialect = GenericDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }

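        // MySQL keeps the backslash in \% and \_ so that LIKE patterns
        // survive unescaping.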
        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
            let dialect = MySqlDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_triple_quoted_string() {
        fn check<F>(
            q: char,
            r: char,
            quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            for (sql, expected, expected_unescaped) in [
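                // Six quote characters in a row: an empty triple-quoted string.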
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let dialect = SnowflakeDialect {};
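        // Snowflake has no triple-quoted strings: '''''' is a single-quoted
        // string containing two escaped quotes.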
3802 let sql = r#"''''''"#;
3803 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3804 let expected = vec![Token::SingleQuotedString("''".to_string())];
3805 compare(expected, tokens);
3806 }
3807
3808 #[test]
3809 fn test_mysql_users_grantees() {
3810 let dialect = MySqlDialect {};
3811
3812 let sql = "CREATE USER `root`@`%`";
3813 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3814 let expected = vec![
3815 Token::make_keyword("CREATE"),
3816 Token::Whitespace(Whitespace::Space),
3817 Token::make_keyword("USER"),
3818 Token::Whitespace(Whitespace::Space),
3819 Token::make_word("root", Some('`')),
3820 Token::AtSign,
3821 Token::make_word("%", Some('`')),
3822 ];
3823 compare(expected, tokens);
3824 }
3825
3826 #[test]
3827 fn test_postgres_abs_without_space_and_string_literal() {
3828 let dialect = MySqlDialect {};
3829
3830 let sql = "SELECT @'1'";
3831 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3832 let expected = vec![
3833 Token::make_keyword("SELECT"),
3834 Token::Whitespace(Whitespace::Space),
3835 Token::AtSign,
3836 Token::SingleQuotedString("1".to_string()),
3837 ];
3838 compare(expected, tokens);
3839 }
3840
3841 #[test]
3842 fn test_postgres_abs_without_space_and_quoted_column() {
3843 let dialect = MySqlDialect {};
3844
3845 let sql = r#"SELECT @"bar" FROM foo"#;
3846 let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3847 let expected = vec![
3848 Token::make_keyword("SELECT"),
3849 Token::Whitespace(Whitespace::Space),
3850 Token::AtSign,
3851 Token::DoubleQuotedString("bar".to_string()),
3852 Token::Whitespace(Whitespace::Space),
3853 Token::make_keyword("FROM"),
3854 Token::Whitespace(Whitespace::Space),
3855 Token::make_word("foo", None),
3856 ];
3857 compare(expected, tokens);
3858 }
3859
3860 #[test]
3861 fn test_national_strings_backslash_escape_not_supported() {
3862 all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
3863 .tokenizes_to(
3864 "select n'''''\\'",
3865 vec![
3866 Token::make_keyword("select"),
3867 Token::Whitespace(Whitespace::Space),
3868 Token::NationalStringLiteral("''\\".to_string()),
3869 ],
3870 );
3871 }
3872
3873 #[test]
3874 fn test_national_strings_backslash_escape_supported() {
3875 all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
3876 .tokenizes_to(
3877 "select n'''''\\''",
3878 vec![
3879 Token::make_keyword("select"),
3880 Token::Whitespace(Whitespace::Space),
3881 Token::NationalStringLiteral("'''".to_string()),
3882 ],
3883 );
3884 }
3885
3886 #[test]
3887 fn test_string_escape_constant_not_supported() {
3888 all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
3889 "select e'...'",
3890 vec![
3891 Token::make_keyword("select"),
3892 Token::Whitespace(Whitespace::Space),
3893 Token::make_word("e", None),
3894 Token::SingleQuotedString("...".to_string()),
3895 ],
3896 );
3897
3898 all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
3899 "select E'...'",
3900 vec![
3901 Token::make_keyword("select"),
3902 Token::Whitespace(Whitespace::Space),
3903 Token::make_word("E", None),
3904 Token::SingleQuotedString("...".to_string()),
3905 ],
3906 );
3907 }
3908
3909 #[test]
3910 fn test_string_escape_constant_supported() {
3911 all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
3912 "select e'\\''",
3913 vec![
3914 Token::make_keyword("select"),
3915 Token::Whitespace(Whitespace::Space),
3916 Token::EscapedStringLiteral("'".to_string()),
3917 ],
3918 );
3919
3920 all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
3921 "select E'\\''",
3922 vec![
3923 Token::make_keyword("select"),
3924 Token::Whitespace(Whitespace::Space),
3925 Token::EscapedStringLiteral("'".to_string()),
3926 ],
3927 );
3928 }
3929
3930 #[test]
3931 fn test_whitespace_required_after_single_line_comment() {
3932 all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
3933 .tokenizes_to(
3934 "SELECT --'abc'",
3935 vec![
3936 Token::make_keyword("SELECT"),
3937 Token::Whitespace(Whitespace::Space),
3938 Token::Minus,
3939 Token::Minus,
3940 Token::SingleQuotedString("abc".to_string()),
3941 ],
3942 );
3943
3944 all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
3945 .tokenizes_to(
3946 "SELECT -- 'abc'",
3947 vec![
3948 Token::make_keyword("SELECT"),
3949 Token::Whitespace(Whitespace::Space),
3950 Token::Whitespace(Whitespace::SingleLineComment {
3951 prefix: "--".to_string(),
3952 comment: " 'abc'".to_string(),
3953 }),
3954 ],
3955 );
3956
3957 all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
3958 .tokenizes_to(
3959 "SELECT --",
3960 vec![
3961 Token::make_keyword("SELECT"),
3962 Token::Whitespace(Whitespace::Space),
3963 Token::Minus,
3964 Token::Minus,
3965 ],
3966 );
3967 }
3968
3969 #[test]
3970 fn test_whitespace_not_required_after_single_line_comment() {
3971 all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
3972 .tokenizes_to(
3973 "SELECT --'abc'",
3974 vec![
3975 Token::make_keyword("SELECT"),
3976 Token::Whitespace(Whitespace::Space),
3977 Token::Whitespace(Whitespace::SingleLineComment {
3978 prefix: "--".to_string(),
3979 comment: "'abc'".to_string(),
3980 }),
3981 ],
3982 );
3983
3984 all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
3985 .tokenizes_to(
3986 "SELECT -- 'abc'",
3987 vec![
3988 Token::make_keyword("SELECT"),
3989 Token::Whitespace(Whitespace::Space),
3990 Token::Whitespace(Whitespace::SingleLineComment {
3991 prefix: "--".to_string(),
3992 comment: " 'abc'".to_string(),
3993 }),
3994 ],
3995 );
3996
3997 all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
3998 .tokenizes_to(
3999 "SELECT --",
4000 vec![
4001 Token::make_keyword("SELECT"),
4002 Token::Whitespace(Whitespace::Space),
4003 Token::Whitespace(Whitespace::SingleLineComment {
4004 prefix: "--".to_string(),
4005 comment: "".to_string(),
4006 }),
4007 ],
4008 );
4009 }
4010
4011 #[test]
4012 fn test_tokenize_identifiers_numeric_prefix() {
4013 all_dialects_where(|dialect| dialect.supports_numeric_prefix())
4014 .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);
4015
4016 all_dialects_where(|dialect| dialect.supports_numeric_prefix())
4017 .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);
4018
4019 all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
4020 "t.12e34",
4021 vec![
4022 Token::make_word("t", None),
4023 Token::Period,
4024 Token::make_word("12e34", None),
4025 ],
4026 );
4027
4028 all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
4029 "t.1two3",
4030 vec![
4031 Token::make_word("t", None),
4032 Token::Period,
4033 Token::make_word("1two3", None),
4034 ],
4035 );
4036 }
4037
4038 #[test]
4039 fn tokenize_period_underscore() {
4040 let sql = String::from("SELECT table._col");
4041 let dialect = PostgreSqlDialect {};
4043 let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
4044
4045 let expected = vec![
4046 Token::make_keyword("SELECT"),
4047 Token::Whitespace(Whitespace::Space),
4048 Token::Word(Word {
4049 value: "table".to_string(),
4050 quote_style: None,
4051 keyword: Keyword::TABLE,
4052 }),
4053 Token::Period,
4054 Token::Word(Word {
4055 value: "_col".to_string(),
4056 quote_style: None,
4057 keyword: Keyword::NoKeyword,
4058 }),
4059 ];
4060
4061 compare(expected, tokens);
4062
4063 let sql = String::from("SELECT ._123");
4064 if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
4065 panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
4066 }
4067
4068 let sql = String::from("SELECT ._abc");
4069 if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
4070 panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
4071 }
4072 }
4073}