#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::fmt;
use core::iter::Peekable;
use core::str::Chars;

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};

use crate::ast::DollarQuotedString;
use crate::dialect::{BigQueryDialect, GenericDialect, SnowflakeDialect};
use crate::dialect::{Dialect, MySqlDialect};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};

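/// A single SQL token produced by the [`Tokenizer`]: a word, literal,
/// operator, punctuation character, or piece of whitespace.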
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the flag records a trailing `L` (long) suffix
    Number(String, bool),
    /// A character that could not be tokenized as anything else
    Char(char),
    SingleQuotedString(String),
    DoubleQuotedString(String),
    DollarQuotedString(DollarQuotedString),
    SingleQuotedByteStringLiteral(String),
    DoubleQuotedByteStringLiteral(String),
    RawStringLiteral(String),
    NationalStringLiteral(String),
    EscapedStringLiteral(String),
    HexStringLiteral(String),
    Comma,
    Whitespace(Whitespace),
    DoubleEq,
    Eq,
    Neq,
    Lt,
    Gt,
    LtEq,
    GtEq,
    Spaceship,
    Plus,
    Minus,
    Mul,
    Div,
    Mod,
    StringConcat,
    LParen,
    RParen,
    Period,
    Colon,
    DoubleColon,
    SemiColon,
    Backslash,
    LBracket,
    RBracket,
    Ampersand,
    Pipe,
    Caret,
    LBrace,
    RBrace,
    RArrow,
    Sharp,
    Tilde,
    TildeAsterisk,
    ExclamationMarkTilde,
    ExclamationMarkTildeAsterisk,
    ShiftLeft,
    ShiftRight,
    ExclamationMark,
    DoubleExclamationMark,
    AtSign,
    PGSquareRoot,
    PGCubeRoot,
    /// A parameter placeholder such as `?`, `?1`, or `$name`
    Placeholder(String),
    Arrow,
    LongArrow,
    HashArrow,
    HashLongArrow,
    AtArrow,
    ArrowAt,
    HashMinus,
    AtQuestion,
    AtAt,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::RawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::AtSign => f.write_str("@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

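    /// Creates a `Word` token; unquoted words are classified as keywords when
    /// they match an entry in `ALL_KEYWORDS`, while quoted words never are.
    /// A minimal doc-test sketch (assuming this module is exposed at the
    /// crate's usual public path, `sqlparser::tokenizer`):
    ///
    /// ```
    /// use sqlparser::tokenizer::Token;
    ///
    /// // Unquoted "SELECT" is recognized as a keyword ...
    /// let keyword = Token::make_word("SELECT", None);
    /// // ... while the quoted form stays a plain identifier.
    /// let identifier = Token::make_word("SELECT", Some('"'));
    /// assert_ne!(keyword, identifier);
    /// ```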
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The word's value, with any surrounding quotes removed
    pub value: String,
    /// The starting quote character, if the word was quoted
    pub quote_style: Option<char>,
    /// The keyword the word matches, or `Keyword::NoKeyword` for a plain
    /// (or quoted) identifier
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"',
            '[' => ']',
            '`' => '`',
            _ => panic!("unexpected quoting style!"),
        }
    }
}

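/// Whitespace and SQL comments, which the tokenizer preserves as tokens
/// rather than discarding.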
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}

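/// The location of a token in the input text, using 1-based line and column
/// numbers.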
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct Location {
    /// Line number, starting from 1
    pub line: u64,
    /// Column number, starting from 1
    pub column: u64,
}

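/// A [`Token`] together with the [`Location`] at which it starts.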
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct TokenWithLocation {
    pub token: Token,
    pub location: Location,
}

impl TokenWithLocation {
    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
        TokenWithLocation {
            token,
            location: Location { line, column },
        }
    }

    pub fn wrap(token: Token) -> TokenWithLocation {
        TokenWithLocation::new(token, 0, 0)
    }
}

impl PartialEq<Token> for TokenWithLocation {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithLocation> for Token {
    fn eq(&self, other: &TokenWithLocation) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithLocation {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub line: u64,
    pub col: u64,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "{} at Line: {}, Column {}",
            self.message, self.line, self.col
        )
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

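/// A peekable character stream over the query text that tracks the current
/// line and column for error reporting.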
struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}

impl<'a> State<'a> {
    /// Returns the next character, updating the tracked line and column.
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}

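/// SQL tokenizer: splits a query string into a sequence of [`Token`]s for a
/// given [`Dialect`]. A minimal usage sketch in doc-test style (assuming the
/// crate's usual public paths, `sqlparser::dialect` and
/// `sqlparser::tokenizer`):
///
/// ```
/// use sqlparser::dialect::GenericDialect;
/// use sqlparser::tokenizer::{Token, Tokenizer, Whitespace};
///
/// let dialect = GenericDialect {};
/// let mut tokenizer = Tokenizer::new(&dialect, "SELECT 1");
/// let tokens = tokenizer.tokenize().unwrap();
///
/// // "SELECT", the space, and "1" each become a token.
/// assert_eq!(
///     tokens,
///     vec![
///         Token::make_keyword("SELECT"),
///         Token::Whitespace(Whitespace::Space),
///         Token::Number(String::from("1"), false),
///     ]
/// );
/// ```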
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
}

impl<'a> Tokenizer<'a> {
    /// Creates a new SQL tokenizer for the given dialect and statement.
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self { dialect, query }
    }

    /// Tokenizes the statement and produces a vector of tokens.
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;

        let mut tokens: Vec<Token> = Vec::with_capacity(twl.len());
        for token_with_location in twl {
            tokens.push(token_with_location.token);
        }
        Ok(tokens)
    }

    /// Tokenizes the statement and produces a vector of tokens with their
    /// locations.
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut tokens: Vec<TokenWithLocation> = vec![];

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state)? {
            tokens.push(TokenWithLocation {
                token,
                location: location.clone(),
            });

            location = state.location();
        }
        Ok(tokens)
    }

    /// Gets the next token, or `None` at end of input.
    fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline for both \r and \r\n
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume the leading 'B'/'b'
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            let s = self.tokenize_quoted_string(chars, '\"')?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // Not a byte string literal: a word starting with 'B'/'b'
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume the leading 'R'/'r'
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::RawStringLiteral(s)))
                        }
                        Some('\"') => {
                            let s = self.tokenize_quoted_string(chars, '\"')?;
                            Ok(Some(Token::RawStringLiteral(s)))
                        }
                        _ => {
                            // Not a raw string literal: a word starting with 'R'/'r'
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume the leading 'N'/'n'
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a national character string literal
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // A regular word starting with 'N'/'n'
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'e' | x @ 'E' => {
                    let starting_loc = chars.location();
                    chars.next(); // consume the leading 'E'/'e'
                    match chars.peek() {
                        Some('\'') => {
                            // E'...' - a string literal with C-style escapes
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume the leading 'X'/'x'
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a hexadecimal string literal
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                ch if self.dialect.is_identifier_start(ch) => {
                    chars.next(); // consume the first identifier char
                    let word = self.tokenize_word(ch, chars);

                    // In dialects where identifiers may start with a digit, a
                    // "word" made up entirely of digits and periods is really
                    // a number.
                    if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
                        let mut inner_state = State {
                            peekable: word.chars().peekable(),
                            line: 0,
                            col: 0,
                        };
                        let mut s = peeking_take_while(&mut inner_state, |ch| {
                            matches!(ch, '0'..='9' | '.')
                        });
                        let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
                        s += s2.as_str();
                        return Ok(Some(Token::Number(s, false)));
                    }

                    Ok(Some(Token::make_word(&word, None)))
                }
                '\'' => {
                    let s = self.tokenize_quoted_string(chars, '\'')?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    let s = self.tokenize_quoted_string(chars, '"')?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start
                    if self.dialect.is_delimited_identifier_start(ch)
                        && self
                            .dialect
                            .is_proper_identifier_inside_quotes(chars.peekable.clone()) =>
                {
                    let error_loc = chars.location();
                    chars.next(); // consume the opening quote
                    let quote_end = Word::matching_end_quote(quote_start);
                    let (s, last_char) = parse_quoted_ident(chars, quote_end);

                    if last_char == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        )
                    }
                }
                // numbers and period
                '0'..='9' | '.' => {
                    let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // match a hex literal that starts with 0x
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_take_while(chars, |ch| {
                            matches!(ch, '0'..='9' | 'A'..='F' | 'a'..='f')
                        });
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }
                    s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // no number -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // parse an optional exponent
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        // Look ahead on a clone of the iterator so the 'e'/'E'
                        // (and optional sign) are only consumed if digits
                        // actually follow.
                        let mut char_clone = chars.peekable.clone();
                        let mut exponent_part = String::new();
                        exponent_part.push(char_clone.next().unwrap());

                        // optional sign
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // definitely an exponent: commit the lookahead
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            // not an exponent; leave the 'e'/'E' for the next token
                            _ => (),
                        }
                    }

                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                '-' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => {
                            chars.next();
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "--".to_owned(),
                                comment,
                            })))
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    chars.next();
                                    Ok(Some(Token::LongArrow))
                                }
                                _ => Ok(Some(Token::Arrow)),
                            }
                        }
                        _ => Ok(Some(Token::Minus)),
                    }
                }
                '/' => {
                    chars.next();
                    match chars.peek() {
                        Some('*') => {
                            chars.next();
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next();
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => self.consume_and_return(chars, Token::Mod),
                '|' => {
                    chars.next();
                    match chars.peek() {
                        Some('/') => self.consume_and_return(chars, Token::PGSquareRoot),
                        Some('|') => {
                            chars.next();
                            match chars.peek() {
                                Some('/') => self.consume_and_return(chars, Token::PGCubeRoot),
                                _ => Ok(Some(Token::StringConcat)),
                            }
                        }
                        _ => Ok(Some(Token::Pipe)),
                    }
                }
                '=' => {
                    chars.next();
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next();
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next();
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_and_return(chars, Token::Spaceship),
                                _ => Ok(Some(Token::LtEq)),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::Neq),
                        Some('<') => self.consume_and_return(chars, Token::ShiftLeft),
                        Some('@') => self.consume_and_return(chars, Token::ArrowAt),
                        _ => Ok(Some(Token::Lt)),
                    }
                }
                '>' => {
                    chars.next();
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::GtEq),
                        Some('>') => self.consume_and_return(chars, Token::ShiftRight),
                        _ => Ok(Some(Token::Gt)),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => self.consume_and_return(chars, Token::Ampersand),
                '^' => self.consume_and_return(chars, Token::Caret),
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect) => {
                    chars.next();
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next();
                    match chars.peek() {
                        Some('*') => self.consume_and_return(chars, Token::TildeAsterisk),
                        _ => Ok(Some(Token::Tilde)),
                    }
                }
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_and_return(chars, Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    chars.next();
                                    Ok(Some(Token::HashLongArrow))
                                }
                                _ => Ok(Some(Token::HashArrow)),
                            }
                        }
                        _ => Ok(Some(Token::Sharp)),
                    }
                }
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => self.consume_and_return(chars, Token::AtAt),
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                '?' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                // This arm is last so that it only catches whitespace
                // (including Unicode whitespace) not handled above.
                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

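    /// Tokenizes PostgreSQL-style dollar-quoted strings such as `$$abc$$` and
    /// `$tag$abc$tag$`, falling back to a `Token::Placeholder` (e.g. `$1` or
    /// `$name`) when no second `$` follows the tag.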
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        chars.next(); // consume the initial '$'

        // A second '$' immediately after the first means a tag-less
        // dollar-quoted string: $$...$$
        if let Some('$') = chars.peek() {
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Otherwise read a tag: either $tag$...$tag$ or a $name placeholder
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric() || ch == '_'
            }));

            if let Some('$') = chars.peek() {
                chars.next();
                s.push_str(&peeking_take_while(chars, |ch| ch != '$'));

                match chars.peek() {
                    Some('$') => {
                        chars.next();
                        // The closing delimiter must repeat the opening tag
                        for c in value.chars() {
                            let next_char = chars.next();
                            if Some(c) != next_char {
                                return self.tokenizer_error(
                                    chars.location(),
                                    format!(
                                        "Unterminated dollar-quoted string at or near \"{value}\""
                                    ),
                                );
                            }
                        }

                        if let Some('$') = chars.peek() {
                            chars.next();
                        } else {
                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted string, expected $",
                            );
                        }
                    }
                    _ => {
                        return self.tokenizer_error(
                            chars.location(),
                            "Unterminated dollar-quoted string, expected $",
                        );
                    }
                }
            } else {
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }

    fn tokenizer_error<R>(
        &self,
        loc: Location,
        message: impl Into<String>,
    ) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            col: loc.column,
            line: loc.line,
        })
    }

    // Consume characters up to and including the newline, if any
    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
        let mut comment = peeking_take_while(chars, |ch| ch != '\n');
        if let Some(ch) = chars.next() {
            assert_eq!(ch, '\n');
            comment.push(ch);
        }
        comment
    }

    /// Tokenizes an identifier or keyword whose first character has already
    /// been consumed.
    fn tokenize_word(&self, first_char: char, chars: &mut State) -> String {
        let mut s = first_char.to_string();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

    /// Reads a single-quoted string with C-style escapes (the body of an
    /// E'...' literal); the opening quote has not yet been consumed.
    fn tokenize_escaped_single_quoted_string(
        &self,
        starting_loc: Location,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();

        chars.next(); // consume the opening quote

        let mut is_escaped = false;
        while let Some(&ch) = chars.peek() {
            macro_rules! escape_control_character {
                ($ESCAPED:expr) => {{
                    if is_escaped {
                        s.push($ESCAPED);
                        is_escaped = false;
                    } else {
                        s.push(ch);
                    }

                    chars.next();
                }};
            }

            match ch {
                '\'' => {
                    chars.next(); // consume
                    if is_escaped {
                        s.push(ch);
                        is_escaped = false;
                    } else if chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                        s.push(ch);
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' => {
                    if is_escaped {
                        s.push('\\');
                        is_escaped = false;
                    } else {
                        is_escaped = true;
                    }

                    chars.next();
                }
                'r' => escape_control_character!('\r'),
                'n' => escape_control_character!('\n'),
                't' => escape_control_character!('\t'),
                _ => {
                    is_escaped = false;
                    chars.next();
                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
    }

    /// Reads a quoted string; the opening quote has not yet been consumed.
    /// A doubled quote character is treated as an escaped quote, and the
    /// backslash acts as an escape character only for MySQL.
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        quote_style: char,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        chars.next(); // consume the opening quote

        let mut is_escaped = false;
        while let Some(&ch) = chars.peek() {
            match ch {
                char if char == quote_style => {
                    chars.next(); // consume
                    if is_escaped {
                        s.push(ch);
                        is_escaped = false;
                    } else if chars.peek().map(|c| *c == quote_style).unwrap_or(false) {
                        s.push(ch);
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' => {
                    if dialect_of!(self is MySqlDialect) {
                        is_escaped = !is_escaped;
                    } else {
                        s.push(ch);
                    }
                    chars.next();
                }
                _ => {
                    chars.next();
                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }

    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
        let mut last_ch = ' ';

        loop {
            match chars.next() {
                Some(ch) => {
                    if last_ch == '/' && ch == '*' {
                        nested += 1;
                    } else if last_ch == '*' && ch == '/' {
                        nested -= 1;
                        if nested == 0 {
                            s.pop(); // drop the '*' of the closing "*/"
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        }
                    }
                    s.push(ch);
                    last_ch = ch;
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    )
                }
            }
        }
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut State,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

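/// Reads characters from `chars` for as long as `predicate` returns true,
/// returning them as a `String`. The first non-matching character is left
/// unconsumed, so the caller can still peek at it.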
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next();
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

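/// Reads a delimited identifier up to the closing `quote_end`, treating a
/// doubled quote character as an escaped literal quote. Returns the
/// identifier text together with the last character consumed, which the
/// caller uses to detect an unterminated identifier.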
fn parse_quoted_ident(chars: &mut State, quote_end: char) -> (String, Option<char>) {
    let mut last_char = None;
    let mut s = String::new();
    while let Some(ch) = chars.next() {
        if ch == quote_end {
            // A doubled end quote is an escaped quote inside the identifier
            if chars.peek() == Some(&quote_end) {
                chars.next();
                s.push(ch);
            } else {
                last_char = Some(quote_end);
                break;
            }
        } else {
            s.push(ch);
        }
    }
    (s, last_char)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dialect::{GenericDialect, MsSqlDialect};

    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            line: 1,
            col: 1,
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column 1");
    }
1229
1230 #[test]
1231 fn tokenize_select_1() {
1232 let sql = String::from("SELECT 1");
1233 let dialect = GenericDialect {};
1234 let mut tokenizer = Tokenizer::new(&dialect, &sql);
1235 let tokens = tokenizer.tokenize().unwrap();
1236
1237 let expected = vec![
1238 Token::make_keyword("SELECT"),
1239 Token::Whitespace(Whitespace::Space),
1240 Token::Number(String::from("1"), false),
1241 ];
1242
1243 compare(expected, tokens);
1244 }
1245
1246 #[test]
1247 fn tokenize_select_float() {
1248 let sql = String::from("SELECT .1");
1249 let dialect = GenericDialect {};
1250 let mut tokenizer = Tokenizer::new(&dialect, &sql);
1251 let tokens = tokenizer.tokenize().unwrap();
1252
1253 let expected = vec![
1254 Token::make_keyword("SELECT"),
1255 Token::Whitespace(Whitespace::Space),
1256 Token::Number(String::from(".1"), false),
1257 ];
1258
1259 compare(expected, tokens);
1260 }
1261
1262 #[test]
1263 fn tokenize_select_exponent() {
1264 let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
1265 let dialect = GenericDialect {};
1266 let mut tokenizer = Tokenizer::new(&dialect, &sql);
1267 let tokens = tokenizer.tokenize().unwrap();
1268
1269 let expected = vec![
1270 Token::make_keyword("SELECT"),
1271 Token::Whitespace(Whitespace::Space),
1272 Token::Number(String::from("1e10"), false),
1273 Token::Comma,
1274 Token::Whitespace(Whitespace::Space),
1275 Token::Number(String::from("1e-10"), false),
1276 Token::Comma,
1277 Token::Whitespace(Whitespace::Space),
1278 Token::Number(String::from("1e+10"), false),
1279 Token::Comma,
1280 Token::Whitespace(Whitespace::Space),
1281 Token::Number(String::from("1"), false),
1282 Token::make_word("ea", None),
1283 Token::Comma,
1284 Token::Whitespace(Whitespace::Space),
1285 Token::Number(String::from("1e-10"), false),
1286 Token::make_word("a", None),
1287 Token::Comma,
1288 Token::Whitespace(Whitespace::Space),
1289 Token::Number(String::from("1e-10"), false),
1290 Token::Minus,
1291 Token::Number(String::from("10"), false),
1292 ];
1293
1294 compare(expected, tokens);
1295 }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                line: 1,
                col: 8
            })
        );
    }

    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                line: 1,
                col: 35
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_double_eq() {
        let sql = String::from("a == 123");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleEq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("123"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "this is a comment\n".to_string(),
            }),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_nested_multiline_comment() {
        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                line: 1,
                col: 1
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize_with_location().unwrap();
        let expected = vec![
            TokenWithLocation::new(Token::make_keyword("SELECT"), 1, 1),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 1, 7),
            TokenWithLocation::new(Token::make_word("a", None), 1, 8),
            TokenWithLocation::new(Token::Comma, 1, 9),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Newline), 1, 10),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 2, 1),
            TokenWithLocation::new(Token::make_word("b", None), 2, 2),
        ];
        compare(expected, tokens);
    }

    fn compare<T: PartialEq + std::fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        assert_eq!(expected, actual);
    }
}