#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::fmt;
use core::iter::Peekable;
use core::str::Chars;

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};

use crate::ast::DollarQuotedString;
use crate::dialect::{
    BigQueryDialect, DuckDbDialect, GenericDialect, HiveDialect, SnowflakeDialect,
};
use crate::dialect::{Dialect, MySqlDialect};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};

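/// SQL Token enumeration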
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the boolean marks a trailing `L` (long) suffix
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag$string$tag$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string'
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Raw string literal: i.e: r'string' or R'string'
    RawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// "Escaped" string literal: i.e: E'string' (a PostgreSQL extension)
    EscapedStringLiteral(String),
    /// Hexadecimal string literal: i.e: X'deadbeef'
    HexStringLiteral(String),
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    Eq,
    Neq,
    Lt,
    Gt,
    LtEq,
    GtEq,
    /// MySQL null-safe equality operator `<=>`
    Spaceship,
    Plus,
    Minus,
    Mul,
    Div,
    /// DuckDB integer division operator `//`
    DuckIntDiv,
    Mod,
    /// String concatenation operator `||`
    StringConcat,
    LParen,
    RParen,
    Period,
    Colon,
    /// `::`, used for casting in PostgreSQL
    DoubleColon,
    /// DuckDB assignment operator `:=`
    DuckAssignment,
    SemiColon,
    Backslash,
    LBracket,
    RBracket,
    Ampersand,
    Pipe,
    Caret,
    LBrace,
    RBrace,
    /// Right arrow `=>`
    RArrow,
    Sharp,
    Tilde,
    /// PostgreSQL case-insensitive regex match `~*`
    TildeAsterisk,
    /// PostgreSQL regex not-match `!~`
    ExclamationMarkTilde,
    /// PostgreSQL case-insensitive regex not-match `!~*`
    ExclamationMarkTildeAsterisk,
    /// PostgreSQL LIKE match `~~`
    DoubleTilde,
    /// PostgreSQL case-insensitive LIKE match `~~*`
    DoubleTildeAsterisk,
    /// PostgreSQL LIKE not-match `!~~`
    ExclamationMarkDoubleTilde,
    /// PostgreSQL case-insensitive LIKE not-match `!~~*`
    ExclamationMarkDoubleTildeAsterisk,
    ShiftLeft,
    ShiftRight,
    /// PostgreSQL array overlap operator `&&`
    Overlap,
    ExclamationMark,
    DoubleExclamationMark,
    AtSign,
    /// PostgreSQL starts-with operator `^@`
    CaretAt,
    /// PostgreSQL square root operator `|/`
    PGSquareRoot,
    /// PostgreSQL cube root operator `||/`
    PGCubeRoot,
    /// A query placeholder such as `?` or `$1`
    Placeholder(String),
    /// JSON access operator `->`
    Arrow,
    /// JSON access (as text) operator `->>`
    LongArrow,
    /// JSON path access operator `#>`
    HashArrow,
    /// JSON path access (as text) operator `#>>`
    HashLongArrow,
    /// JSON containment operator `@>`
    AtArrow,
    /// JSON contained-by operator `<@`
    ArrowAt,
    /// JSON delete-path operator `#-`
    HashMinus,
    /// JSON path exists operator `@?`
    AtQuestion,
    /// JSON path match operator `@@`
    AtAt,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::RawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::DuckAssignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
        }
    }
}

impl Token {
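    /// Build a keyword token from `keyword`; this falls back to a plain,
    /// unquoted word if the string is not a known SQL keyword.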
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

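    /// Build a `Token::Word` from a raw string and an optional quote style.
    /// Only unquoted words are matched against the keyword list. A minimal
    /// sketch of the behavior:
    ///
    /// ```
    /// # use sqlparser::tokenizer::Token;
    /// # use sqlparser::keywords::Keyword;
    /// match Token::make_word("SELECT", None) {
    ///     Token::Word(w) => assert_eq!(w.keyword, Keyword::SELECT),
    ///     _ => unreachable!(),
    /// }
    /// // A quoted word is never treated as a keyword:
    /// match Token::make_word("SELECT", Some('"')) {
    ///     Token::Word(w) => assert_eq!(w.keyword, Keyword::NoKeyword),
    ///     _ => unreachable!(),
    /// }
    /// ```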
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

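/// A keyword (like SELECT) or an optionally quoted SQL identifier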
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with any
    /// escape sequences processed
    pub value: String,
    /// An identifier can be "quoted" (a <delimited identifier> in ANSI
    /// parlance). The standard and most dialects use double quotes for this,
    /// but some dialects accept other styles (e.g. brackets in MS SQL)
    pub quote_style: Option<char>,
    /// If the word was not quoted and matched one of the known keywords,
    /// this holds that `Keyword`; otherwise `Keyword::NoKeyword`
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"', // ANSI and most dialects
            '[' => ']', // MS SQL
            '`' => '`', // MySQL
            _ => panic!("unexpected quoting style!"),
        }
    }
}

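/// A unit of whitespace or an SQL comment, preserved as a token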
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}

/// Location in the input string
#[derive(Debug, Eq, PartialEq, Clone, Copy)]
pub struct Location {
    /// Line number, starting from 1
    pub line: u64,
    /// Line column, starting from 1
    pub column: u64,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.line == 0 {
            return Ok(());
        }
        write!(f, " at Line: {}, Column {}", self.line, self.column)
    }
}

/// A [Token] with a [Location] attached to it
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct TokenWithLocation {
    pub token: Token,
    pub location: Location,
}

impl TokenWithLocation {
    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
        TokenWithLocation {
            token,
            location: Location { line, column },
        }
    }

    pub fn wrap(token: Token) -> TokenWithLocation {
        TokenWithLocation::new(token, 0, 0)
    }
}

impl PartialEq<Token> for TokenWithLocation {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithLocation> for Token {
    fn eq(&self, other: &TokenWithLocation) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithLocation {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

/// Tokenizer error
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}{}", self.message, self.location)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}

impl<'a> State<'a> {
    /// Return the next character and advance the stream,
    /// updating the line and column counters as needed
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    /// Return the next character but do not advance the stream
    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}

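/// SQL Tokenizer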
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    /// If true (the default), the tokenizer will un-escape quoted literals;
    /// see [`Tokenizer::with_unescape`] for details
    unescape: bool,
}

impl<'a> Tokenizer<'a> {
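    /// Create a new SQL tokenizer for the specified SQL statement.
    /// A minimal usage sketch:
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let query = r#"SELECT 'foo'"#;
    ///
    /// // Tokenizing the query
    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
    ///
    /// assert_eq!(tokens, vec![
    ///   Token::make_word("SELECT", None),
    ///   Token::Whitespace(Whitespace::Space),
    ///   Token::SingleQuotedString("foo".to_string()),
    /// ]);
    /// ```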
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }

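    /// Set whether the tokenizer should un-escape quoted values (the
    /// default) or return them exactly as written in the input.
    ///
    /// For example, with unescaping enabled the input `'foo''bar'` yields
    /// the value `foo'bar`, while with it disabled the doubled quote is kept
    /// verbatim. A sketch of the difference, assuming a `GenericDialect`:
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let tokens = Tokenizer::new(&dialect, "'foo''bar'")
    ///     .with_unescape(false)
    ///     .tokenize()
    ///     .unwrap();
    /// assert_eq!(tokens, vec![Token::SingleQuotedString("foo''bar".to_string())]);
    /// ```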
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }

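    /// Tokenize the statement and produce a vector of tokens without locations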
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;
        Ok(twl.into_iter().map(|t| t.token).collect())
    }

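    /// Tokenize the statement and produce a vector of tokens with locations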
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
        let mut tokens: Vec<TokenWithLocation> = vec![];
        self.tokenize_with_location_into_buf(&mut tokens)
            .map(|_| tokens)
    }

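    /// Tokenize the statement and append the tokens with locations to the
    /// provided buffer, avoiding an allocation when the caller already owns one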
    pub fn tokenize_with_location_into_buf(
        &mut self,
        buf: &mut Vec<TokenWithLocation>,
    ) -> Result<(), TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state)? {
            buf.push(TokenWithLocation { token, location });

            location = state.location();
        }
        Ok(())
    }

    /// Tokenize the identifier or keyword starting with the chars in `ch`
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // A word consisting entirely of digits and periods is a number
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }

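    /// Get the next token or return None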
    fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // BigQuery uses b or B for byte string literals
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            let s = self.tokenize_quoted_string(chars, '\"')?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with a "b" or "B"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // BigQuery uses r or R for raw string literals
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::RawStringLiteral(s)))
                        }
                        Some('\"') => {
                            let s = self.tokenize_quoted_string(chars, '\"')?;
                            Ok(Some(Token::RawStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "r" or "R"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // PostgreSQL-style "escape" string constants: E'...'
                x @ 'e' | x @ 'E' => {
                    let starting_loc = chars.location();
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "e" or "E"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a <hexadecimal character string literal>
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "x" or "X"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // single quoted string
                '\'' => {
                    let s = self.tokenize_quoted_string(chars, '\'')?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // double quoted string
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    let s = self.tokenize_quoted_string(chars, '"')?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start
                    if self.dialect.is_delimited_identifier_start(ch)
                        && self
                            .dialect
                            .is_proper_identifier_inside_quotes(chars.peekable.clone()) =>
                {
                    let error_loc = chars.location();
                    chars.next(); // consume the opening quote
                    let quote_end = Word::matching_end_quote(quote_start);
                    let (s, last_char) = self.parse_quoted_ident(chars, quote_end);

                    if last_char == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        )
                    }
                }
                // numbers and period
                '0'..='9' | '.' => {
                    let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // match hex literals that start with 0x
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_take_while(chars, |ch| ch.is_ascii_hexdigit());
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }
                    s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // No number -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    let mut exponent_part = String::new();
                    // Parse exponent as number
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        // Optional sign
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // Definitely an exponent; bring the original iterator
                            // up to speed and use it
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            // Not an exponent, discard the work done
                            _ => (),
                        }
                    }

                    // MySQL and Hive support identifiers that start with a
                    // numeric prefix, as long as they aren't an exponent number
                    if dialect_of!(self is MySqlDialect | HiveDialect) && exponent_part.is_empty() {
                        let word =
                            peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                        if !word.is_empty() {
                            s += word.as_str();
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }

                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'
                    match chars.peek() {
                        Some('-') => {
                            chars.next(); // consume the second '-', starting a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "--".to_owned(),
                                comment,
                            })))
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    chars.next();
                                    Ok(Some(Token::LongArrow))
                                }
                                _ => Ok(Some(Token::Arrow)),
                            }
                        }
                        // a regular '-' operator
                        _ => Ok(Some(Token::Minus)),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a snowflake single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => {
                    chars.next(); // advance past '%'
                    match chars.peek() {
                        Some(' ') => Ok(Some(Token::Mod)),
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::Mod)),
                    }
                }
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_and_return(chars, Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => self.consume_and_return(chars, Token::PGCubeRoot),
                                _ => Ok(Some(Token::StringConcat)),
                            }
                        }
                        // a regular '|' operator
                        _ => Ok(Some(Token::Pipe)),
                    }
                }
                '=' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                Some('~') => {
                                    chars.next();
                                    match chars.peek() {
                                        Some('*') => self.consume_and_return(
                                            chars,
                                            Token::ExclamationMarkDoubleTildeAsterisk,
                                        ),
                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
                                    }
                                }
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_and_return(chars, Token::Spaceship),
                                _ => Ok(Some(Token::LtEq)),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::Neq),
                        Some('<') => self.consume_and_return(chars, Token::ShiftLeft),
                        Some('@') => self.consume_and_return(chars, Token::ArrowAt),
                        _ => Ok(Some(Token::Lt)),
                    }
                }
                '>' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::GtEq),
                        Some('>') => self.consume_and_return(chars, Token::ShiftRight),
                        _ => Ok(Some(Token::Gt)),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::DuckAssignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => {
                    chars.next(); // consume the '&'
                    match chars.peek() {
                        Some('&') => self.consume_and_return(chars, Token::Overlap),
                        // a regular '&' operator
                        _ => Ok(Some(Token::Ampersand)),
                    }
                }
                '^' => {
                    chars.next(); // consume the '^'
                    match chars.peek() {
                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
                        _ => Ok(Some(Token::Caret)),
                    }
                }
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect) => {
                    chars.next(); // consume the '#', starting a snowflake single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('*') => self.consume_and_return(chars, Token::TildeAsterisk),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => {
                                    self.consume_and_return(chars, Token::DoubleTildeAsterisk)
                                }
                                _ => Ok(Some(Token::DoubleTilde)),
                            }
                        }
                        _ => Ok(Some(Token::Tilde)),
                    }
                }
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_and_return(chars, Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    chars.next();
                                    Ok(Some(Token::HashLongArrow))
                                }
                                _ => Ok(Some(Token::HashArrow)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::Sharp)),
                        Some(sch) if self.dialect.is_identifier_start('#') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::Sharp)),
                    }
                }
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => {
                            chars.next();
                            match chars.peek() {
                                Some(' ') => Ok(Some(Token::AtAt)),
                                Some(tch) if self.dialect.is_identifier_start('@') => {
                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
                                }
                                _ => Ok(Some(Token::AtAt)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::AtSign)),
                        Some(sch) if self.dialect.is_identifier_start('@') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                '?' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
                }

                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    self.tokenize_identifier_or_keyword([ch], chars)
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                // whitespace check (including unicode chars) should be last,
                // as it covers some of the chars above
                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

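    /// Read a dollar-quoted string (`$$...$$` or `$tag$...$tag$`) or a
    /// `$`-prefixed placeholder, with the leading `$` still unread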
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        chars.next();

        if let Some('$') = chars.peek() {
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric() || ch == '_'
            }));

            if let Some('$') = chars.peek() {
                chars.next();
                s.push_str(&peeking_take_while(chars, |ch| ch != '$'));

                match chars.peek() {
                    Some('$') => {
                        chars.next();
                        for c in value.chars() {
                            let next_char = chars.next();
                            if Some(c) != next_char {
                                return self.tokenizer_error(
                                    chars.location(),
                                    format!(
                                        "Unterminated dollar-quoted string at or near \"{value}\""
                                    ),
                                );
                            }
                        }

                        if let Some('$') = chars.peek() {
                            chars.next();
                        } else {
                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted string, expected $",
                            );
                        }
                    }
                    _ => {
                        return self.tokenizer_error(
                            chars.location(),
                            "Unterminated dollar-quoted string, expected $",
                        );
                    }
                }
            } else {
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }

    fn tokenizer_error<R>(
        &self,
        loc: Location,
        message: impl Into<String>,
    ) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            location: loc,
        })
    }

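    /// Consume characters until (and including) the newline, returning the
    /// comment body; used for `--`, `//`, and `#` style comments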
    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
        let mut comment = peeking_take_while(chars, |ch| ch != '\n');
        if let Some(ch) = chars.next() {
            assert_eq!(ch, '\n');
            comment.push(ch);
        }
        comment
    }

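    /// Tokenize an identifier or keyword, after the first char is already consumed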
    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
        let mut s = first_chars.into();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

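    /// Read a single-quoted string with backslash escape sequences, as in
    /// PostgreSQL's `E'...'` literals, with the opening quote still unread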
    fn tokenize_escaped_single_quoted_string(
        &self,
        starting_loc: Location,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();

        chars.next(); // consume the opening quote

        let mut is_escaped = false;
        while let Some(&ch) = chars.peek() {
            macro_rules! escape_control_character {
                ($ESCAPED:expr) => {{
                    if is_escaped {
                        s.push($ESCAPED);
                        is_escaped = false;
                    } else {
                        s.push(ch);
                    }

                    chars.next();
                }};
            }

            match ch {
                '\'' => {
                    chars.next(); // consume
                    if is_escaped {
                        s.push(ch);
                        is_escaped = false;
                    } else if chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                        s.push(ch);
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' => {
                    if is_escaped {
                        s.push('\\');
                        is_escaped = false;
                    } else {
                        is_escaped = true;
                    }

                    chars.next();
                }
                'r' => escape_control_character!('\r'),
                'n' => escape_control_character!('\n'),
                't' => escape_control_character!('\t'),
                _ => {
                    is_escaped = false;
                    chars.next();
                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
    }

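    /// Read a quoted string delimited by `quote_style`, handling doubled
    /// quotes and (for MySQL) backslash escapes; the opening quote has not
    /// yet been consumed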
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        quote_style: char,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        chars.next(); // consume the opening quote

        while let Some(&ch) = chars.peek() {
            match ch {
                char if char == quote_style => {
                    chars.next(); // consume
                    if chars.peek().map(|c| *c == quote_style).unwrap_or(false) {
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' => {
                    chars.next(); // consume
                    // backslash escaping is specific to the MySQL dialect
                    if dialect_of!(self is MySqlDialect) {
                        if let Some(next) = chars.peek() {
                            if !self.unescape {
                                // In no-escape mode, the given query has to be
                                // saved completely, including backslashes
                                s.push(ch);
                                s.push(*next);
                                chars.next(); // consume next
                            } else {
                                let n = match next {
                                    '\'' | '\"' | '\\' | '%' | '_' => *next,
                                    '0' => '\0',
                                    'b' => '\u{8}',
                                    'n' => '\n',
                                    'r' => '\r',
                                    't' => '\t',
                                    'Z' => '\u{1a}',
                                    _ => *next,
                                };
                                s.push(n);
                                chars.next(); // consume next
                            }
                        }
                    } else {
                        s.push(ch);
                    }
                }
                _ => {
                    chars.next(); // consume
                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }

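    /// Consume a `/* ... */` comment, tracking nesting depth so that nested
    /// multi-line comments are handled correctly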
    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
        let mut last_ch = ' ';

        loop {
            match chars.next() {
                Some(ch) => {
                    if last_ch == '/' && ch == '*' {
                        nested += 1;
                    } else if last_ch == '*' && ch == '/' {
                        nested -= 1;
                        if nested == 0 {
                            s.pop();
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        }
                    }
                    s.push(ch);
                    last_ch = ch;
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    )
                }
            }
        }
    }

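    /// Read a delimited identifier up to `quote_end`, treating a doubled
    /// end-quote as an escaped quote character; returns the identifier and
    /// the last character consumed (used to detect a missing close quote)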
    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
        let mut last_char = None;
        let mut s = String::new();
        while let Some(ch) = chars.next() {
            if ch == quote_end {
                if chars.peek() == Some(&quote_end) {
                    chars.next();
                    s.push(ch);
                    if !self.unescape {
                        // In no-escape mode, the given query has to be saved completely
                        s.push(ch);
                    }
                } else {
                    last_char = Some(quote_end);
                    break;
                }
            } else {
                s.push(ch);
            }
        }
        (s, last_char)
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut State,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

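/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Return the characters read as String, and keep the first non-matching
/// char available as `chars.next()`.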
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dialect::{ClickHouseDialect, GenericDialect, MsSqlDialect};

    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column 1");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_clickhouse_double_equal() {
        let sql = String::from("SELECT foo=='1'");
        let dialect = ClickHouseDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "foo".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::DoubleEq,
            Token::SingleQuotedString("1".to_string()),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }

    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "this is a comment\n".to_string(),
            }),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_nested_multiline_comment() {
        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                location: Location { line: 1, column: 1 },
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_like_match() {
        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_snowflake_div() {
        let sql = r#"field/1000"#;
        let dialect = SnowflakeDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word(r#"field"#, None),
            Token::Div,
            Token::Number("1000".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithLocation::new(Token::make_keyword("SELECT"), 1, 1),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 1, 7),
            TokenWithLocation::new(Token::make_word("a", None), 1, 8),
            TokenWithLocation::new(Token::Comma, 1, 9),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Newline), 1, 10),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 2, 1),
            TokenWithLocation::new(Token::make_word("b", None), 2, 2),
        ];
        compare(expected, tokens);
    }

    fn compare<T: PartialEq + std::fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        assert_eq!(expected, actual);
    }
}