sqlparser/
tokenizer.rs

1// Licensed under the Apache License, Version 2.0 (the "License");
2// you may not use this file except in compliance with the License.
3// You may obtain a copy of the License at
4//
5// http://www.apache.org/licenses/LICENSE-2.0
6//
7// Unless required by applicable law or agreed to in writing, software
8// distributed under the License is distributed on an "AS IS" BASIS,
9// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10// See the License for the specific language governing permissions and
11// limitations under the License.
12
13//! SQL Tokenizer
14//!
15//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
16//!
17//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
18
19#[cfg(not(feature = "std"))]
20use alloc::{
21    borrow::ToOwned,
22    format,
23    string::{String, ToString},
24    vec,
25    vec::Vec,
26};
27use core::fmt;
28use core::iter::Peekable;
29use core::str::Chars;
30
31#[cfg(feature = "serde")]
32use serde::{Deserialize, Serialize};
33
34#[cfg(feature = "visitor")]
35use sqlparser_derive::{Visit, VisitMut};
36
37use crate::ast::DollarQuotedString;
38use crate::dialect::{BigQueryDialect, GenericDialect, SnowflakeDialect};
39use crate::dialect::{Dialect, MySqlDialect};
40use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
41
42/// SQL Token enumeration
43#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
44#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
45#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
46pub enum Token {
47    /// An end-of-file marker, not a real token
48    EOF,
49    /// A keyword (like SELECT) or an optionally quoted SQL identifier
50    Word(Word),
51    /// An unsigned numeric literal
52    Number(String, bool),
53    /// A character that could not be tokenized
54    Char(char),
55    /// Single quoted string: i.e: 'string'
56    SingleQuotedString(String),
57    /// Double quoted string: i.e: "string"
58    DoubleQuotedString(String),
59    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
60    DollarQuotedString(DollarQuotedString),
61    /// Byte string literal: i.e: b'string' or B'string'
62    SingleQuotedByteStringLiteral(String),
63    /// Byte string literal: i.e: b"string" or B"string"
64    DoubleQuotedByteStringLiteral(String),
65    /// Raw string literal: i.e: r'string' or R'string' or r"string" or R"string"
66    RawStringLiteral(String),
67    /// "National" string literal: i.e: N'string'
68    NationalStringLiteral(String),
69    /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
70    EscapedStringLiteral(String),
71    /// Hexadecimal string literal: i.e.: X'deadbeef'
72    HexStringLiteral(String),
73    /// Comma
74    Comma,
75    /// Whitespace (space, tab, etc)
76    Whitespace(Whitespace),
77    /// Double equals sign `==`
78    DoubleEq,
79    /// Equality operator `=`
80    Eq,
81    /// Not Equals operator `<>` (or `!=` in some dialects)
82    Neq,
83    /// Less Than operator `<`
84    Lt,
85    /// Greater Than operator `>`
86    Gt,
87    /// Less Than Or Equals operator `<=`
88    LtEq,
89    /// Greater Than Or Equals operator `>=`
90    GtEq,
91    /// Spaceship operator <=>
92    Spaceship,
93    /// Plus operator `+`
94    Plus,
95    /// Minus operator `-`
96    Minus,
97    /// Multiplication operator `*`
98    Mul,
99    /// Division operator `/`
100    Div,
101    /// Modulo Operator `%`
102    Mod,
103    /// String concatenation `||`
104    StringConcat,
105    /// Left parenthesis `(`
106    LParen,
107    /// Right parenthesis `)`
108    RParen,
109    /// Period (used for compound identifiers or projections into nested types)
110    Period,
111    /// Colon `:`
112    Colon,
113    /// DoubleColon `::` (used for casting in postgresql)
114    DoubleColon,
115    /// SemiColon `;` used as separator for COPY and payload
116    SemiColon,
117    /// Backslash `\` used in terminating the COPY payload with `\.`
118    Backslash,
119    /// Left bracket `[`
120    LBracket,
121    /// Right bracket `]`
122    RBracket,
123    /// Ampersand `&`
124    Ampersand,
125    /// Pipe `|`
126    Pipe,
127    /// Caret `^`
128    Caret,
129    /// Left brace `{`
130    LBrace,
131    /// Right brace `}`
132    RBrace,
133    /// Right Arrow `=>`
134    RArrow,
135    /// Sharp `#` used for PostgreSQL Bitwise XOR operator
136    Sharp,
137    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
138    Tilde,
139    /// `~*` , a case insensitive match regular expression operator in PostgreSQL
140    TildeAsterisk,
141    /// `!~` , a case sensitive not match regular expression operator in PostgreSQL
142    ExclamationMarkTilde,
143    /// `!~*` , a case insensitive not match regular expression operator in PostgreSQL
144    ExclamationMarkTildeAsterisk,
145    /// `<<`, a bitwise shift left operator in PostgreSQL
146    ShiftLeft,
147    /// `>>`, a bitwise shift right operator in PostgreSQL
148    ShiftRight,
149    /// Exclamation Mark `!` used for PostgreSQL factorial operator
150    ExclamationMark,
151    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
152    DoubleExclamationMark,
153    /// AtSign `@` used for PostgreSQL abs operator
154    AtSign,
155    /// `|/`, a square root math operator in PostgreSQL
156    PGSquareRoot,
157    /// `||/` , a cube root math operator in PostgreSQL
158    PGCubeRoot,
159    /// `?` or `$` , a prepared statement arg placeholder
160    Placeholder(String),
161    /// ->, used as a operator to extract json field in PostgreSQL
162    Arrow,
163    /// ->>, used as a operator to extract json field as text in PostgreSQL
164    LongArrow,
165    /// #> Extracts JSON sub-object at the specified path
166    HashArrow,
167    /// #>> Extracts JSON sub-object at the specified path as text
168    HashLongArrow,
169    /// jsonb @> jsonb -> boolean: Test whether left json contains the right json
170    AtArrow,
171    /// jsonb <@ jsonb -> boolean: Test whether right json contains the left json
172    ArrowAt,
173    /// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified
174    /// path, where path elements can be either field keys or array indexes.
175    HashMinus,
176    /// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified
177    /// JSON value?
178    AtQuestion,
179    /// jsonb @@ jsonpath → boolean: Returns the result of a JSON path predicate check
180    /// for the specified JSON value. Only the first item of the result is taken into
181    /// account. If the result is not Boolean, then NULL is returned.
182    AtAt,
183}
184
185impl fmt::Display for Token {
186    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
187        match self {
188            Token::EOF => f.write_str("EOF"),
189            Token::Word(ref w) => write!(f, "{w}"),
190            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
191            Token::Char(ref c) => write!(f, "{c}"),
192            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
193            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
194            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
195            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
196            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
197            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
198            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
199            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
200            Token::RawStringLiteral(ref s) => write!(f, "R'{s}'"),
201            Token::Comma => f.write_str(","),
202            Token::Whitespace(ws) => write!(f, "{ws}"),
203            Token::DoubleEq => f.write_str("=="),
204            Token::Spaceship => f.write_str("<=>"),
205            Token::Eq => f.write_str("="),
206            Token::Neq => f.write_str("<>"),
207            Token::Lt => f.write_str("<"),
208            Token::Gt => f.write_str(">"),
209            Token::LtEq => f.write_str("<="),
210            Token::GtEq => f.write_str(">="),
211            Token::Plus => f.write_str("+"),
212            Token::Minus => f.write_str("-"),
213            Token::Mul => f.write_str("*"),
214            Token::Div => f.write_str("/"),
215            Token::StringConcat => f.write_str("||"),
216            Token::Mod => f.write_str("%"),
217            Token::LParen => f.write_str("("),
218            Token::RParen => f.write_str(")"),
219            Token::Period => f.write_str("."),
220            Token::Colon => f.write_str(":"),
221            Token::DoubleColon => f.write_str("::"),
222            Token::SemiColon => f.write_str(";"),
223            Token::Backslash => f.write_str("\\"),
224            Token::LBracket => f.write_str("["),
225            Token::RBracket => f.write_str("]"),
226            Token::Ampersand => f.write_str("&"),
227            Token::Caret => f.write_str("^"),
228            Token::Pipe => f.write_str("|"),
229            Token::LBrace => f.write_str("{"),
230            Token::RBrace => f.write_str("}"),
231            Token::RArrow => f.write_str("=>"),
232            Token::Sharp => f.write_str("#"),
233            Token::ExclamationMark => f.write_str("!"),
234            Token::DoubleExclamationMark => f.write_str("!!"),
235            Token::Tilde => f.write_str("~"),
236            Token::TildeAsterisk => f.write_str("~*"),
237            Token::ExclamationMarkTilde => f.write_str("!~"),
238            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
239            Token::AtSign => f.write_str("@"),
240            Token::ShiftLeft => f.write_str("<<"),
241            Token::ShiftRight => f.write_str(">>"),
242            Token::PGSquareRoot => f.write_str("|/"),
243            Token::PGCubeRoot => f.write_str("||/"),
244            Token::Placeholder(ref s) => write!(f, "{s}"),
245            Token::Arrow => write!(f, "->"),
246            Token::LongArrow => write!(f, "->>"),
247            Token::HashArrow => write!(f, "#>"),
248            Token::HashLongArrow => write!(f, "#>>"),
249            Token::AtArrow => write!(f, "@>"),
250            Token::ArrowAt => write!(f, "<@"),
251            Token::HashMinus => write!(f, "#-"),
252            Token::AtQuestion => write!(f, "@?"),
253            Token::AtAt => write!(f, "@@"),
254        }
255    }
256}
257
258impl Token {
259    pub fn make_keyword(keyword: &str) -> Self {
260        Token::make_word(keyword, None)
261    }
262
263    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
264        let word_uppercase = word.to_uppercase();
265        Token::Word(Word {
266            value: word.to_string(),
267            quote_style,
268            keyword: if quote_style.is_none() {
269                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
270                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
271            } else {
272                Keyword::NoKeyword
273            },
274        })
275    }
276}
277
278/// A keyword (like SELECT) or an optionally quoted SQL identifier
279#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
280#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
281#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
282pub struct Word {
283    /// The value of the token, without the enclosing quotes, and with the
284    /// escape sequences (if any) processed (TODO: escapes are not handled)
285    pub value: String,
286    /// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
287    /// The standard and most implementations allow using double quotes for this,
288    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
289    pub quote_style: Option<char>,
290    /// If the word was not quoted and it matched one of the known keywords,
291    /// this will have one of the values from dialect::keywords, otherwise empty
292    pub keyword: Keyword,
293}
294
295impl fmt::Display for Word {
296    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
297        match self.quote_style {
298            Some(s) if s == '"' || s == '[' || s == '`' => {
299                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
300            }
301            None => f.write_str(&self.value),
302            _ => panic!("Unexpected quote_style!"),
303        }
304    }
305}
306
307impl Word {
308    fn matching_end_quote(ch: char) -> char {
309        match ch {
310            '"' => '"', // ANSI and most dialects
311            '[' => ']', // MS SQL
312            '`' => '`', // MySQL
313            _ => panic!("unexpected quoting style!"),
314        }
315    }
316}
317
318#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
319#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
320#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
321pub enum Whitespace {
322    Space,
323    Newline,
324    Tab,
325    SingleLineComment { comment: String, prefix: String },
326    MultiLineComment(String),
327}
328
329impl fmt::Display for Whitespace {
330    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
331        match self {
332            Whitespace::Space => f.write_str(" "),
333            Whitespace::Newline => f.write_str("\n"),
334            Whitespace::Tab => f.write_str("\t"),
335            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
336            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
337        }
338    }
339}
340
341/// Location in input string
342#[derive(Debug, Eq, PartialEq, Clone)]
343pub struct Location {
344    /// Line number, starting from 1
345    pub line: u64,
346    /// Line column, starting from 1
347    pub column: u64,
348}
349
350/// A [Token] with [Location] attached to it
351#[derive(Debug, Eq, PartialEq, Clone)]
352pub struct TokenWithLocation {
353    pub token: Token,
354    pub location: Location,
355}
356
357impl TokenWithLocation {
358    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
359        TokenWithLocation {
360            token,
361            location: Location { line, column },
362        }
363    }
364
365    pub fn wrap(token: Token) -> TokenWithLocation {
366        TokenWithLocation::new(token, 0, 0)
367    }
368}
369
370impl PartialEq<Token> for TokenWithLocation {
371    fn eq(&self, other: &Token) -> bool {
372        &self.token == other
373    }
374}
375
376impl PartialEq<TokenWithLocation> for Token {
377    fn eq(&self, other: &TokenWithLocation) -> bool {
378        self == &other.token
379    }
380}
381
382impl fmt::Display for TokenWithLocation {
383    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
384        self.token.fmt(f)
385    }
386}
387
388/// Tokenizer error
389#[derive(Debug, PartialEq, Eq)]
390pub struct TokenizerError {
391    pub message: String,
392    pub line: u64,
393    pub col: u64,
394}
395
396impl fmt::Display for TokenizerError {
397    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
398        write!(
399            f,
400            "{} at Line: {}, Column {}",
401            self.message, self.line, self.col
402        )
403    }
404}
405
406#[cfg(feature = "std")]
407impl std::error::Error for TokenizerError {}
408
409struct State<'a> {
410    peekable: Peekable<Chars<'a>>,
411    pub line: u64,
412    pub col: u64,
413}
414
415impl<'a> State<'a> {
416    pub fn next(&mut self) -> Option<char> {
417        match self.peekable.next() {
418            None => None,
419            Some(s) => {
420                if s == '\n' {
421                    self.line += 1;
422                    self.col = 1;
423                } else {
424                    self.col += 1;
425                }
426                Some(s)
427            }
428        }
429    }
430
431    pub fn peek(&mut self) -> Option<&char> {
432        self.peekable.peek()
433    }
434
435    pub fn location(&self) -> Location {
436        Location {
437            line: self.line,
438            column: self.col,
439        }
440    }
441}
442
443/// SQL Tokenizer
444pub struct Tokenizer<'a> {
445    dialect: &'a dyn Dialect,
446    query: &'a str,
447}
448
449impl<'a> Tokenizer<'a> {
450    /// Create a new SQL tokenizer for the specified SQL statement
451    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
452        Self { dialect, query }
453    }
454
455    /// Tokenize the statement and produce a vector of tokens
456    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
457        let twl = self.tokenize_with_location()?;
458
459        let mut tokens: Vec<Token> = vec![];
460        tokens.reserve(twl.len());
461        for token_with_location in twl {
462            tokens.push(token_with_location.token);
463        }
464        Ok(tokens)
465    }
466
467    /// Tokenize the statement and produce a vector of tokens with location information
468    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
469        let mut state = State {
470            peekable: self.query.chars().peekable(),
471            line: 1,
472            col: 1,
473        };
474
475        let mut tokens: Vec<TokenWithLocation> = vec![];
476
477        let mut location = state.location();
478        while let Some(token) = self.next_token(&mut state)? {
479            tokens.push(TokenWithLocation {
480                token,
481                location: location.clone(),
482            });
483
484            location = state.location();
485        }
486        Ok(tokens)
487    }
488
489    /// Get the next token or return None
490    fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
491        //println!("next_token: {:?}", chars.peek());
492        match chars.peek() {
493            Some(&ch) => match ch {
494                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
495                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
496                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
497                '\r' => {
498                    // Emit a single Whitespace::Newline token for \r and \r\n
499                    chars.next();
500                    if let Some('\n') = chars.peek() {
501                        chars.next();
502                    }
503                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
504                }
505                // BigQuery uses b or B for byte string literal
506                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
507                    chars.next(); // consume
508                    match chars.peek() {
509                        Some('\'') => {
510                            let s = self.tokenize_quoted_string(chars, '\'')?;
511                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
512                        }
513                        Some('\"') => {
514                            let s = self.tokenize_quoted_string(chars, '\"')?;
515                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
516                        }
517                        _ => {
518                            // regular identifier starting with an "b" or "B"
519                            let s = self.tokenize_word(b, chars);
520                            Ok(Some(Token::make_word(&s, None)))
521                        }
522                    }
523                }
524                // BigQuery uses r or R for raw string literal
525                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
526                    chars.next(); // consume
527                    match chars.peek() {
528                        Some('\'') => {
529                            let s = self.tokenize_quoted_string(chars, '\'')?;
530                            Ok(Some(Token::RawStringLiteral(s)))
531                        }
532                        Some('\"') => {
533                            let s = self.tokenize_quoted_string(chars, '\"')?;
534                            Ok(Some(Token::RawStringLiteral(s)))
535                        }
536                        _ => {
537                            // regular identifier starting with an "r" or "R"
538                            let s = self.tokenize_word(b, chars);
539                            Ok(Some(Token::make_word(&s, None)))
540                        }
541                    }
542                }
543                // Redshift uses lower case n for national string literal
544                n @ 'N' | n @ 'n' => {
545                    chars.next(); // consume, to check the next char
546                    match chars.peek() {
547                        Some('\'') => {
548                            // N'...' - a <national character string literal>
549                            let s = self.tokenize_quoted_string(chars, '\'')?;
550                            Ok(Some(Token::NationalStringLiteral(s)))
551                        }
552                        _ => {
553                            // regular identifier starting with an "N"
554                            let s = self.tokenize_word(n, chars);
555                            Ok(Some(Token::make_word(&s, None)))
556                        }
557                    }
558                }
559                // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
560                x @ 'e' | x @ 'E' => {
561                    let starting_loc = chars.location();
562                    chars.next(); // consume, to check the next char
563                    match chars.peek() {
564                        Some('\'') => {
565                            let s =
566                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
567                            Ok(Some(Token::EscapedStringLiteral(s)))
568                        }
569                        _ => {
570                            // regular identifier starting with an "E" or "e"
571                            let s = self.tokenize_word(x, chars);
572                            Ok(Some(Token::make_word(&s, None)))
573                        }
574                    }
575                }
576                // The spec only allows an uppercase 'X' to introduce a hex
577                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
578                x @ 'x' | x @ 'X' => {
579                    chars.next(); // consume, to check the next char
580                    match chars.peek() {
581                        Some('\'') => {
582                            // X'...' - a <binary string literal>
583                            let s = self.tokenize_quoted_string(chars, '\'')?;
584                            Ok(Some(Token::HexStringLiteral(s)))
585                        }
586                        _ => {
587                            // regular identifier starting with an "X"
588                            let s = self.tokenize_word(x, chars);
589                            Ok(Some(Token::make_word(&s, None)))
590                        }
591                    }
592                }
593                // identifier or keyword
594                ch if self.dialect.is_identifier_start(ch) => {
595                    chars.next(); // consume the first char
596                    let word = self.tokenize_word(ch, chars);
597
598                    // TODO: implement parsing of exponent here
599                    if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
600                        let mut inner_state = State {
601                            peekable: word.chars().peekable(),
602                            line: 0,
603                            col: 0,
604                        };
605                        let mut s = peeking_take_while(&mut inner_state, |ch| {
606                            matches!(ch, '0'..='9' | '.')
607                        });
608                        let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
609                        s += s2.as_str();
610                        return Ok(Some(Token::Number(s, false)));
611                    }
612
613                    Ok(Some(Token::make_word(&word, None)))
614                }
615                // single quoted string
616                '\'' => {
617                    let s = self.tokenize_quoted_string(chars, '\'')?;
618
619                    Ok(Some(Token::SingleQuotedString(s)))
620                }
621                // double quoted string
622                '\"' if !self.dialect.is_delimited_identifier_start(ch)
623                    && !self.dialect.is_identifier_start(ch) =>
624                {
625                    let s = self.tokenize_quoted_string(chars, '"')?;
626
627                    Ok(Some(Token::DoubleQuotedString(s)))
628                }
629                // delimited (quoted) identifier
630                quote_start
631                    if self.dialect.is_delimited_identifier_start(ch)
632                        && self
633                            .dialect
634                            .is_proper_identifier_inside_quotes(chars.peekable.clone()) =>
635                {
636                    let error_loc = chars.location();
637                    chars.next(); // consume the opening quote
638                    let quote_end = Word::matching_end_quote(quote_start);
639                    let (s, last_char) = parse_quoted_ident(chars, quote_end);
640
641                    if last_char == Some(quote_end) {
642                        Ok(Some(Token::make_word(&s, Some(quote_start))))
643                    } else {
644                        self.tokenizer_error(
645                            error_loc,
646                            format!("Expected close delimiter '{quote_end}' before EOF."),
647                        )
648                    }
649                }
650                // numbers and period
651                '0'..='9' | '.' => {
652                    let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit());
653
654                    // match binary literal that starts with 0x
655                    if s == "0" && chars.peek() == Some(&'x') {
656                        chars.next();
657                        let s2 = peeking_take_while(
658                            chars,
659                            |ch| matches!(ch, '0'..='9' | 'A'..='F' | 'a'..='f'),
660                        );
661                        return Ok(Some(Token::HexStringLiteral(s2)));
662                    }
663
664                    // match one period
665                    if let Some('.') = chars.peek() {
666                        s.push('.');
667                        chars.next();
668                    }
669                    s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());
670
671                    // No number -> Token::Period
672                    if s == "." {
673                        return Ok(Some(Token::Period));
674                    }
675
676                    // Parse exponent as number
677                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
678                        let mut char_clone = chars.peekable.clone();
679                        let mut exponent_part = String::new();
680                        exponent_part.push(char_clone.next().unwrap());
681
682                        // Optional sign
683                        match char_clone.peek() {
684                            Some(&c) if matches!(c, '+' | '-') => {
685                                exponent_part.push(c);
686                                char_clone.next();
687                            }
688                            _ => (),
689                        }
690
691                        match char_clone.peek() {
692                            // Definitely an exponent, get original iterator up to speed and use it
693                            Some(&c) if c.is_ascii_digit() => {
694                                for _ in 0..exponent_part.len() {
695                                    chars.next();
696                                }
697                                exponent_part +=
698                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
699                                s += exponent_part.as_str();
700                            }
701                            // Not an exponent, discard the work done
702                            _ => (),
703                        }
704                    }
705
706                    let long = if chars.peek() == Some(&'L') {
707                        chars.next();
708                        true
709                    } else {
710                        false
711                    };
712                    Ok(Some(Token::Number(s, long)))
713                }
714                // punctuation
715                '(' => self.consume_and_return(chars, Token::LParen),
716                ')' => self.consume_and_return(chars, Token::RParen),
717                ',' => self.consume_and_return(chars, Token::Comma),
718                // operators
719                '-' => {
720                    chars.next(); // consume the '-'
721                    match chars.peek() {
722                        Some('-') => {
723                            chars.next(); // consume the second '-', starting a single-line comment
724                            let comment = self.tokenize_single_line_comment(chars);
725                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
726                                prefix: "--".to_owned(),
727                                comment,
728                            })))
729                        }
730                        Some('>') => {
731                            chars.next();
732                            match chars.peek() {
733                                Some('>') => {
734                                    chars.next();
735                                    Ok(Some(Token::LongArrow))
736                                }
737                                _ => Ok(Some(Token::Arrow)),
738                            }
739                        }
740                        // a regular '-' operator
741                        _ => Ok(Some(Token::Minus)),
742                    }
743                }
744                '/' => {
745                    chars.next(); // consume the '/'
746                    match chars.peek() {
747                        Some('*') => {
748                            chars.next(); // consume the '*', starting a multi-line comment
749                            self.tokenize_multiline_comment(chars)
750                        }
751                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
752                            chars.next(); // consume the second '/', starting a snowflake single-line comment
753                            let comment = self.tokenize_single_line_comment(chars);
754                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
755                                prefix: "//".to_owned(),
756                                comment,
757                            })))
758                        }
759                        // a regular '/' operator
760                        _ => Ok(Some(Token::Div)),
761                    }
762                }
763                '+' => self.consume_and_return(chars, Token::Plus),
764                '*' => self.consume_and_return(chars, Token::Mul),
765                '%' => self.consume_and_return(chars, Token::Mod),
766                '|' => {
767                    chars.next(); // consume the '|'
768                    match chars.peek() {
769                        Some('/') => self.consume_and_return(chars, Token::PGSquareRoot),
770                        Some('|') => {
771                            chars.next(); // consume the second '|'
772                            match chars.peek() {
773                                Some('/') => self.consume_and_return(chars, Token::PGCubeRoot),
774                                _ => Ok(Some(Token::StringConcat)),
775                            }
776                        }
777                        // Bitshift '|' operator
778                        _ => Ok(Some(Token::Pipe)),
779                    }
780                }
781                '=' => {
782                    chars.next(); // consume
783                    match chars.peek() {
784                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
785                        Some('>') => self.consume_and_return(chars, Token::RArrow),
786                        _ => Ok(Some(Token::Eq)),
787                    }
788                }
789                '!' => {
790                    chars.next(); // consume
791                    match chars.peek() {
792                        Some('=') => self.consume_and_return(chars, Token::Neq),
793                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
794                        Some('~') => {
795                            chars.next();
796                            match chars.peek() {
797                                Some('*') => self
798                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
799                                _ => Ok(Some(Token::ExclamationMarkTilde)),
800                            }
801                        }
802                        _ => Ok(Some(Token::ExclamationMark)),
803                    }
804                }
805                '<' => {
806                    chars.next(); // consume
807                    match chars.peek() {
808                        Some('=') => {
809                            chars.next();
810                            match chars.peek() {
811                                Some('>') => self.consume_and_return(chars, Token::Spaceship),
812                                _ => Ok(Some(Token::LtEq)),
813                            }
814                        }
815                        Some('>') => self.consume_and_return(chars, Token::Neq),
816                        Some('<') => self.consume_and_return(chars, Token::ShiftLeft),
817                        Some('@') => self.consume_and_return(chars, Token::ArrowAt),
818                        _ => Ok(Some(Token::Lt)),
819                    }
820                }
821                '>' => {
822                    chars.next(); // consume
823                    match chars.peek() {
824                        Some('=') => self.consume_and_return(chars, Token::GtEq),
825                        Some('>') => self.consume_and_return(chars, Token::ShiftRight),
826                        _ => Ok(Some(Token::Gt)),
827                    }
828                }
829                ':' => {
830                    chars.next();
831                    match chars.peek() {
832                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
833                        _ => Ok(Some(Token::Colon)),
834                    }
835                }
836                ';' => self.consume_and_return(chars, Token::SemiColon),
837                '\\' => self.consume_and_return(chars, Token::Backslash),
838                '[' => self.consume_and_return(chars, Token::LBracket),
839                ']' => self.consume_and_return(chars, Token::RBracket),
840                '&' => self.consume_and_return(chars, Token::Ampersand),
841                '^' => self.consume_and_return(chars, Token::Caret),
842                '{' => self.consume_and_return(chars, Token::LBrace),
843                '}' => self.consume_and_return(chars, Token::RBrace),
844                '#' if dialect_of!(self is SnowflakeDialect) => {
845                    chars.next(); // consume the '#', starting a snowflake single-line comment
846                    let comment = self.tokenize_single_line_comment(chars);
847                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
848                        prefix: "#".to_owned(),
849                        comment,
850                    })))
851                }
852                '~' => {
853                    chars.next(); // consume
854                    match chars.peek() {
855                        Some('*') => self.consume_and_return(chars, Token::TildeAsterisk),
856                        _ => Ok(Some(Token::Tilde)),
857                    }
858                }
859                '#' => {
860                    chars.next();
861                    match chars.peek() {
862                        Some('-') => self.consume_and_return(chars, Token::HashMinus),
863                        Some('>') => {
864                            chars.next();
865                            match chars.peek() {
866                                Some('>') => {
867                                    chars.next();
868                                    Ok(Some(Token::HashLongArrow))
869                                }
870                                _ => Ok(Some(Token::HashArrow)),
871                            }
872                        }
873                        _ => Ok(Some(Token::Sharp)),
874                    }
875                }
876                '@' => {
877                    chars.next();
878                    match chars.peek() {
879                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
880                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
881                        Some('@') => self.consume_and_return(chars, Token::AtAt),
882                        _ => Ok(Some(Token::AtSign)),
883                    }
884                }
885                '?' => {
886                    chars.next();
887                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
888                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
889                }
890                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
891
892                //whitespace check (including unicode chars) should be last as it covers some of the chars above
893                ch if ch.is_whitespace() => {
894                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
895                }
896                other => self.consume_and_return(chars, Token::Char(other)),
897            },
898            None => Ok(None),
899        }
900    }
901
902    /// Tokenize dollar preceded value (i.e: a string/placeholder)
903    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
904        let mut s = String::new();
905        let mut value = String::new();
906
907        chars.next();
908
909        if let Some('$') = chars.peek() {
910            chars.next();
911
912            let mut is_terminated = false;
913            let mut prev: Option<char> = None;
914
915            while let Some(&ch) = chars.peek() {
916                if prev == Some('$') {
917                    if ch == '$' {
918                        chars.next();
919                        is_terminated = true;
920                        break;
921                    } else {
922                        s.push('$');
923                        s.push(ch);
924                    }
925                } else if ch != '$' {
926                    s.push(ch);
927                }
928
929                prev = Some(ch);
930                chars.next();
931            }
932
933            return if chars.peek().is_none() && !is_terminated {
934                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
935            } else {
936                Ok(Token::DollarQuotedString(DollarQuotedString {
937                    value: s,
938                    tag: None,
939                }))
940            };
941        } else {
942            value.push_str(&peeking_take_while(chars, |ch| {
943                ch.is_alphanumeric() || ch == '_'
944            }));
945
946            if let Some('$') = chars.peek() {
947                chars.next();
948                s.push_str(&peeking_take_while(chars, |ch| ch != '$'));
949
950                match chars.peek() {
951                    Some('$') => {
952                        chars.next();
953                        for (_, c) in value.chars().enumerate() {
954                            let next_char = chars.next();
955                            if Some(c) != next_char {
956                                return self.tokenizer_error(
957                                    chars.location(),
958                                    format!(
959                                        "Unterminated dollar-quoted string at or near \"{value}\""
960                                    ),
961                                );
962                            }
963                        }
964
965                        if let Some('$') = chars.peek() {
966                            chars.next();
967                        } else {
968                            return self.tokenizer_error(
969                                chars.location(),
970                                "Unterminated dollar-quoted string, expected $",
971                            );
972                        }
973                    }
974                    _ => {
975                        return self.tokenizer_error(
976                            chars.location(),
977                            "Unterminated dollar-quoted, expected $",
978                        );
979                    }
980                }
981            } else {
982                return Ok(Token::Placeholder(String::from("$") + &value));
983            }
984        }
985
986        Ok(Token::DollarQuotedString(DollarQuotedString {
987            value: s,
988            tag: if value.is_empty() { None } else { Some(value) },
989        }))
990    }
991
992    fn tokenizer_error<R>(
993        &self,
994        loc: Location,
995        message: impl Into<String>,
996    ) -> Result<R, TokenizerError> {
997        Err(TokenizerError {
998            message: message.into(),
999            col: loc.column,
1000            line: loc.line,
1001        })
1002    }
1003
1004    // Consume characters until newline
1005    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
1006        let mut comment = peeking_take_while(chars, |ch| ch != '\n');
1007        if let Some(ch) = chars.next() {
1008            assert_eq!(ch, '\n');
1009            comment.push(ch);
1010        }
1011        comment
1012    }
1013
1014    /// Tokenize an identifier or keyword, after the first char is already consumed.
1015    fn tokenize_word(&self, first_char: char, chars: &mut State) -> String {
1016        let mut s = first_char.to_string();
1017        s.push_str(&peeking_take_while(chars, |ch| {
1018            self.dialect.is_identifier_part(ch)
1019        }));
1020        s
1021    }
1022
1023    /// Read a single quoted string, starting with the opening quote.
1024    fn tokenize_escaped_single_quoted_string(
1025        &self,
1026        starting_loc: Location,
1027        chars: &mut State,
1028    ) -> Result<String, TokenizerError> {
1029        let mut s = String::new();
1030
1031        // This case is a bit tricky
1032
1033        chars.next(); // consume the opening quote
1034
1035        // slash escaping
1036        let mut is_escaped = false;
1037        while let Some(&ch) = chars.peek() {
1038            macro_rules! escape_control_character {
1039                ($ESCAPED:expr) => {{
1040                    if is_escaped {
1041                        s.push($ESCAPED);
1042                        is_escaped = false;
1043                    } else {
1044                        s.push(ch);
1045                    }
1046
1047                    chars.next();
1048                }};
1049            }
1050
1051            match ch {
1052                '\'' => {
1053                    chars.next(); // consume
1054                    if is_escaped {
1055                        s.push(ch);
1056                        is_escaped = false;
1057                    } else if chars.peek().map(|c| *c == '\'').unwrap_or(false) {
1058                        s.push(ch);
1059                        chars.next();
1060                    } else {
1061                        return Ok(s);
1062                    }
1063                }
1064                '\\' => {
1065                    if is_escaped {
1066                        s.push('\\');
1067                        is_escaped = false;
1068                    } else {
1069                        is_escaped = true;
1070                    }
1071
1072                    chars.next();
1073                }
1074                'r' => escape_control_character!('\r'),
1075                'n' => escape_control_character!('\n'),
1076                't' => escape_control_character!('\t'),
1077                _ => {
1078                    is_escaped = false;
1079                    chars.next(); // consume
1080                    s.push(ch);
1081                }
1082            }
1083        }
1084        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
1085    }
1086
1087    /// Read a single quoted string, starting with the opening quote.
1088    fn tokenize_quoted_string(
1089        &self,
1090        chars: &mut State,
1091        quote_style: char,
1092    ) -> Result<String, TokenizerError> {
1093        let mut s = String::new();
1094        let error_loc = chars.location();
1095
1096        chars.next(); // consume the opening quote
1097
1098        // slash escaping is specific to MySQL dialect
1099        let mut is_escaped = false;
1100        while let Some(&ch) = chars.peek() {
1101            match ch {
1102                char if char == quote_style => {
1103                    chars.next(); // consume
1104                    if is_escaped {
1105                        s.push(ch);
1106                        is_escaped = false;
1107                    } else if chars.peek().map(|c| *c == quote_style).unwrap_or(false) {
1108                        s.push(ch);
1109                        chars.next();
1110                    } else {
1111                        return Ok(s);
1112                    }
1113                }
1114                '\\' => {
1115                    if dialect_of!(self is MySqlDialect) {
1116                        is_escaped = !is_escaped;
1117                    } else {
1118                        s.push(ch);
1119                    }
1120                    chars.next();
1121                }
1122                _ => {
1123                    chars.next(); // consume
1124                    s.push(ch);
1125                }
1126            }
1127        }
1128        self.tokenizer_error(error_loc, "Unterminated string literal")
1129    }
1130
1131    fn tokenize_multiline_comment(
1132        &self,
1133        chars: &mut State,
1134    ) -> Result<Option<Token>, TokenizerError> {
1135        let mut s = String::new();
1136        let mut nested = 1;
1137        let mut last_ch = ' ';
1138
1139        loop {
1140            match chars.next() {
1141                Some(ch) => {
1142                    if last_ch == '/' && ch == '*' {
1143                        nested += 1;
1144                    } else if last_ch == '*' && ch == '/' {
1145                        nested -= 1;
1146                        if nested == 0 {
1147                            s.pop();
1148                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
1149                        }
1150                    }
1151                    s.push(ch);
1152                    last_ch = ch;
1153                }
1154                None => {
1155                    break self.tokenizer_error(
1156                        chars.location(),
1157                        "Unexpected EOF while in a multi-line comment",
1158                    )
1159                }
1160            }
1161        }
1162    }
1163
1164    #[allow(clippy::unnecessary_wraps)]
1165    fn consume_and_return(
1166        &self,
1167        chars: &mut State,
1168        t: Token,
1169    ) -> Result<Option<Token>, TokenizerError> {
1170        chars.next();
1171        Ok(Some(t))
1172    }
1173}
1174
1175/// Read from `chars` until `predicate` returns `false` or EOF is hit.
1176/// Return the characters read as String, and keep the first non-matching
1177/// char available as `chars.next()`.
1178fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
1179    let mut s = String::new();
1180    while let Some(&ch) = chars.peek() {
1181        if predicate(ch) {
1182            chars.next(); // consume
1183            s.push(ch);
1184        } else {
1185            break;
1186        }
1187    }
1188    s
1189}
1190
1191fn parse_quoted_ident(chars: &mut State, quote_end: char) -> (String, Option<char>) {
1192    let mut last_char = None;
1193    let mut s = String::new();
1194    while let Some(ch) = chars.next() {
1195        if ch == quote_end {
1196            if chars.peek() == Some(&quote_end) {
1197                chars.next();
1198                s.push(ch);
1199            } else {
1200                last_char = Some(quote_end);
1201                break;
1202            }
1203        } else {
1204            s.push(ch);
1205        }
1206    }
1207    (s, last_char)
1208}
1209
1210#[cfg(test)]
1211mod tests {
1212    use super::*;
1213    use crate::dialect::{GenericDialect, MsSqlDialect};
1214
1215    #[test]
1216    fn tokenizer_error_impl() {
1217        let err = TokenizerError {
1218            message: "test".into(),
1219            line: 1,
1220            col: 1,
1221        };
1222        #[cfg(feature = "std")]
1223        {
1224            use std::error::Error;
1225            assert!(err.source().is_none());
1226        }
1227        assert_eq!(err.to_string(), "test at Line: 1, Column 1");
1228    }
1229
1230    #[test]
1231    fn tokenize_select_1() {
1232        let sql = String::from("SELECT 1");
1233        let dialect = GenericDialect {};
1234        let mut tokenizer = Tokenizer::new(&dialect, &sql);
1235        let tokens = tokenizer.tokenize().unwrap();
1236
1237        let expected = vec![
1238            Token::make_keyword("SELECT"),
1239            Token::Whitespace(Whitespace::Space),
1240            Token::Number(String::from("1"), false),
1241        ];
1242
1243        compare(expected, tokens);
1244    }
1245
1246    #[test]
1247    fn tokenize_select_float() {
1248        let sql = String::from("SELECT .1");
1249        let dialect = GenericDialect {};
1250        let mut tokenizer = Tokenizer::new(&dialect, &sql);
1251        let tokens = tokenizer.tokenize().unwrap();
1252
1253        let expected = vec![
1254            Token::make_keyword("SELECT"),
1255            Token::Whitespace(Whitespace::Space),
1256            Token::Number(String::from(".1"), false),
1257        ];
1258
1259        compare(expected, tokens);
1260    }
1261
1262    #[test]
1263    fn tokenize_select_exponent() {
1264        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
1265        let dialect = GenericDialect {};
1266        let mut tokenizer = Tokenizer::new(&dialect, &sql);
1267        let tokens = tokenizer.tokenize().unwrap();
1268
1269        let expected = vec![
1270            Token::make_keyword("SELECT"),
1271            Token::Whitespace(Whitespace::Space),
1272            Token::Number(String::from("1e10"), false),
1273            Token::Comma,
1274            Token::Whitespace(Whitespace::Space),
1275            Token::Number(String::from("1e-10"), false),
1276            Token::Comma,
1277            Token::Whitespace(Whitespace::Space),
1278            Token::Number(String::from("1e+10"), false),
1279            Token::Comma,
1280            Token::Whitespace(Whitespace::Space),
1281            Token::Number(String::from("1"), false),
1282            Token::make_word("ea", None),
1283            Token::Comma,
1284            Token::Whitespace(Whitespace::Space),
1285            Token::Number(String::from("1e-10"), false),
1286            Token::make_word("a", None),
1287            Token::Comma,
1288            Token::Whitespace(Whitespace::Space),
1289            Token::Number(String::from("1e-10"), false),
1290            Token::Minus,
1291            Token::Number(String::from("10"), false),
1292        ];
1293
1294        compare(expected, tokens);
1295    }
1296
1297    #[test]
1298    fn tokenize_scalar_function() {
1299        let sql = String::from("SELECT sqrt(1)");
1300        let dialect = GenericDialect {};
1301        let mut tokenizer = Tokenizer::new(&dialect, &sql);
1302        let tokens = tokenizer.tokenize().unwrap();
1303
1304        let expected = vec![
1305            Token::make_keyword("SELECT"),
1306            Token::Whitespace(Whitespace::Space),
1307            Token::make_word("sqrt", None),
1308            Token::LParen,
1309            Token::Number(String::from("1"), false),
1310            Token::RParen,
1311        ];
1312
1313        compare(expected, tokens);
1314    }
1315
1316    #[test]
1317    fn tokenize_string_string_concat() {
1318        let sql = String::from("SELECT 'a' || 'b'");
1319        let dialect = GenericDialect {};
1320        let mut tokenizer = Tokenizer::new(&dialect, &sql);
1321        let tokens = tokenizer.tokenize().unwrap();
1322
1323        let expected = vec![
1324            Token::make_keyword("SELECT"),
1325            Token::Whitespace(Whitespace::Space),
1326            Token::SingleQuotedString(String::from("a")),
1327            Token::Whitespace(Whitespace::Space),
1328            Token::StringConcat,
1329            Token::Whitespace(Whitespace::Space),
1330            Token::SingleQuotedString(String::from("b")),
1331        ];
1332
1333        compare(expected, tokens);
1334    }
1335    #[test]
1336    fn tokenize_bitwise_op() {
1337        let sql = String::from("SELECT one | two ^ three");
1338        let dialect = GenericDialect {};
1339        let mut tokenizer = Tokenizer::new(&dialect, &sql);
1340        let tokens = tokenizer.tokenize().unwrap();
1341
1342        let expected = vec![
1343            Token::make_keyword("SELECT"),
1344            Token::Whitespace(Whitespace::Space),
1345            Token::make_word("one", None),
1346            Token::Whitespace(Whitespace::Space),
1347            Token::Pipe,
1348            Token::Whitespace(Whitespace::Space),
1349            Token::make_word("two", None),
1350            Token::Whitespace(Whitespace::Space),
1351            Token::Caret,
1352            Token::Whitespace(Whitespace::Space),
1353            Token::make_word("three", None),
1354        ];
1355        compare(expected, tokens);
1356    }
1357
    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

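    // A character that cannot start any token is emitted as `Token::Char` rather than
    // causing a tokenizer error.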
    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        // println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

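    // Newlines (including `\r\n`) inside a single-quoted string are preserved verbatim
    // in the resulting literal.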
    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

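    // An unterminated string literal is reported as an error carrying the 1-based line
    // and column of the opening quote.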
    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                line: 1,
                col: 8
            })
        );
    }

    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                line: 1,
                col: 35
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        // println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

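    // `=>` is tokenized as a single `RArrow` token, as used in `FUNCTION(key=>value)`
    // named-argument syntax.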
    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_double_eq() {
        let sql = String::from("a == 123");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleEq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("123"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "this is a comment\n".to_string(),
            }),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

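    // Block comments nest: the outer comment only ends once every inner `/*` has a
    // matching `*/`.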
    #[test]
    fn tokenize_nested_multiline_comment() {
        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

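    // `/** ... **/` is still a single block comment; the extra asterisks become part of
    // the comment text.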
    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

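    // Non-ASCII whitespace such as U+2003 (em space) is tokenized as an ordinary space.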
    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                line: 1,
                col: 1
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

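    // MS SQL Server's `[bracketed]` identifiers are reported with `[` as the quote style.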
    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

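    // PostgreSQL regex-match operators: `~`, `~*`, `!~` and `!~*`.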
    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

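    // Within a double-quoted identifier, a doubled `""` escapes a single `"`.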
    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

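    // `tokenize_with_location` attaches 1-based line and column numbers to every token.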
    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize_with_location().unwrap();
        let expected = vec![
            TokenWithLocation::new(Token::make_keyword("SELECT"), 1, 1),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 1, 7),
            TokenWithLocation::new(Token::make_word("a", None), 1, 8),
            TokenWithLocation::new(Token::Comma, 1, 9),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Newline), 1, 10),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 2, 1),
            TokenWithLocation::new(Token::make_word("b", None), 2, 2),
        ];
        compare(expected, tokens);
    }

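    // A minimal illustrative sketch (added as an example, not part of the original
    // suite; the test name is our own): tokenizing a simple arithmetic expression
    // into number, whitespace and operator tokens.
    #[test]
    fn tokenize_addition_example() {
        let sql = String::from("1 + 2");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::Plus,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("2"), false),
        ];

        compare(expected, tokens);
    }
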
    fn compare<T: PartialEq + std::fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        //println!("------------------------------");
        //println!("tokens   = {:?}", actual);
        //println!("expected = {:?}", expected);
        //println!("------------------------------");
        assert_eq!(expected, actual);
    }
}