#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::fmt;
use core::iter::Peekable;
use core::str::Chars;

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};

use crate::ast::DollarQuotedString;
use crate::dialect::{
    BigQueryDialect, DuckDbDialect, GenericDialect, HiveDialect, SnowflakeDialect,
};
use crate::dialect::{Dialect, MySqlDialect};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};

/// SQL tokens produced by the [`Tokenizer`].
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the flag is true when the literal has a
    /// trailing `L` ("long") suffix
    Number(String, bool),
    /// A character that could not start any known token
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag$string$tag$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string' (BigQuery)
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string" (BigQuery)
    DoubleQuotedByteStringLiteral(String),
    /// Raw string literal: i.e: r'string' or R'string' (BigQuery)
    RawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// "Escaped" string literal: i.e: E'string'
    EscapedStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    Comma,
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    Eq,
    /// Not-equals operator `<>` (or `!=` in some dialects)
    Neq,
    Lt,
    Gt,
    LtEq,
    GtEq,
    /// Spaceship operator `<=>`
    Spaceship,
    Plus,
    Minus,
    Mul,
    Div,
    /// Integer division operator `//` in DuckDB
    DuckIntDiv,
    /// Modulo operator `%`
    Mod,
    /// String concatenation operator `||`
    StringConcat,
    LParen,
    RParen,
    Period,
    Colon,
    /// `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Assignment `:=` (used for keyword arguments in DuckDB macros)
    DuckAssignment,
    SemiColon,
    Backslash,
    LBracket,
    RBracket,
    Ampersand,
    Pipe,
    Caret,
    LBrace,
    RBrace,
    /// Right arrow `=>`
    RArrow,
    /// `#`, also the PostgreSQL bitwise XOR operator
    Sharp,
    /// `~`, the PostgreSQL bitwise NOT operator or case sensitive regex match
    Tilde,
    /// `~*`, case insensitive regex match (PostgreSQL)
    TildeAsterisk,
    /// `!~`, case sensitive regex non-match (PostgreSQL)
    ExclamationMarkTilde,
    /// `!~*`, case insensitive regex non-match (PostgreSQL)
    ExclamationMarkTildeAsterisk,
    /// `<<`, bitwise shift left (PostgreSQL)
    ShiftLeft,
    /// `>>`, bitwise shift right (PostgreSQL)
    ShiftRight,
    /// `&&`, array overlap operator (PostgreSQL)
    Overlap,
    ExclamationMark,
    DoubleExclamationMark,
    AtSign,
    /// `|/`, square root operator (PostgreSQL)
    PGSquareRoot,
    /// `||/`, cube root operator (PostgreSQL)
    PGCubeRoot,
    /// `?` or `?n`, a prepared statement argument placeholder
    Placeholder(String),
    /// `->`, extract JSON field (PostgreSQL)
    Arrow,
    /// `->>`, extract JSON field as text (PostgreSQL)
    LongArrow,
    /// `#>`, extract JSON sub-object at the given path (PostgreSQL)
    HashArrow,
    /// `#>>`, extract JSON sub-object at the given path as text (PostgreSQL)
    HashLongArrow,
    /// `@>`, does the left JSON value contain the right one? (PostgreSQL)
    AtArrow,
    /// `<@`, does the right JSON value contain the left one? (PostgreSQL)
    ArrowAt,
    /// `#-`, delete the field or element at the given path (PostgreSQL)
    HashMinus,
    /// `@?`, does the JSON path return any item? (PostgreSQL)
    AtQuestion,
    /// `@@`, result of a JSON path predicate check (PostgreSQL)
    AtAt,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::RawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::DuckAssignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::AtSign => f.write_str("@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

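    /// Make a `Token::Word`, tagging unquoted words that match a known
    /// keyword. A minimal sketch (assuming the crate is consumed under its
    /// published name, `sqlparser`):
    ///
    /// ```
    /// # use sqlparser::keywords::Keyword;
    /// # use sqlparser::tokenizer::Token;
    /// if let Token::Word(w) = Token::make_word("select", None) {
    ///     assert_eq!(w.keyword, Keyword::SELECT);
    /// }
    /// // Quoted words are never classified as keywords.
    /// if let Token::Word(w) = Token::make_word("select", Some('"')) {
    ///     assert_eq!(w.keyword, Keyword::NoKeyword);
    /// }
    /// ```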
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The word's value, without the enclosing quotes and with any escape
    /// sequences processed.
    pub value: String,
    /// The quoting style, if the identifier was quoted (e.g. `"`, `[`, or `` ` ``).
    pub quote_style: Option<char>,
    /// The matching keyword if the word was unquoted and is a known keyword,
    /// otherwise `Keyword::NoKeyword`.
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"',
            '[' => ']',
            '`' => '`',
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}

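/// The location of a token in the source text, tracked as 1-based line and
/// column numbers. A `line` of 0 is the "no location" sentinel used by
/// [`TokenWithLocation::wrap`], in which case the `Display` impl below prints
/// nothing. Illustrative `Display` output for line 1, column 5:
///
/// ```text
///  at Line: 1, Column 5
/// ```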
#[derive(Debug, Eq, PartialEq, Clone, Copy)]
pub struct Location {
    /// Line number, starting from 1
    pub line: u64,
    /// Column number, starting from 1
    pub column: u64,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.line == 0 {
            return Ok(());
        }
        write!(f, " at Line: {}, Column {}", self.line, self.column)
    }
}

/// A [`Token`] paired with the [`Location`] at which it starts in the input.
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct TokenWithLocation {
    pub token: Token,
    pub location: Location,
}

impl TokenWithLocation {
    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
        TokenWithLocation {
            token,
            location: Location { line, column },
        }
    }

    /// Wrap a token with the zero ("no location") position.
    pub fn wrap(token: Token) -> TokenWithLocation {
        TokenWithLocation::new(token, 0, 0)
    }
}

impl PartialEq<Token> for TokenWithLocation {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithLocation> for Token {
    fn eq(&self, other: &TokenWithLocation) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithLocation {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

/// Tokenizer error
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}{}", self.message, self.location)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

/// A cursor over the input characters that tracks the current line and column.
struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}

impl<'a> State<'a> {
    /// Return the next character and advance the stream, updating the
    /// line/column bookkeeping.
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    /// Return the next character without advancing the stream.
    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    /// If true (the default), the tokenizer unescapes quoted literals and
    /// identifiers (e.g. `''` inside a single-quoted string becomes `'`);
    /// if false, escape sequences are preserved verbatim.
    unescape: bool,
}

impl<'a> Tokenizer<'a> {
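    /// Create a new SQL tokenizer for the specified SQL statement. A minimal
    /// usage sketch (assuming the crate is consumed under its published name,
    /// `sqlparser`):
    ///
    /// ```
    /// # use sqlparser::dialect::GenericDialect;
    /// # use sqlparser::tokenizer::Tokenizer;
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 1").tokenize().unwrap();
    /// assert_eq!(tokens.len(), 3); // keyword, whitespace, number
    /// ```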
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }

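    /// Set whether the tokenizer should unescape quoted literals and
    /// identifiers (true by default). A sketch mirroring the
    /// `tokenize_quoted_identifier*` tests below (assuming the crate is
    /// consumed as `sqlparser`):
    ///
    /// ```
    /// # use sqlparser::dialect::GenericDialect;
    /// # use sqlparser::tokenizer::{Token, Tokenizer};
    /// let dialect = GenericDialect {};
    /// // By default, the doubled quote `""` collapses to a single `"`:
    /// let tokens = Tokenizer::new(&dialect, r#""a "" b""#).tokenize().unwrap();
    /// assert_eq!(tokens[0], Token::make_word(r#"a " b"#, Some('"')));
    /// // With unescape disabled, the raw text is preserved:
    /// let tokens = Tokenizer::new(&dialect, r#""a "" b""#)
    ///     .with_unescape(false)
    ///     .tokenize()
    ///     .unwrap();
    /// assert_eq!(tokens[0], Token::make_word(r#"a "" b"#, Some('"')));
    /// ```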
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }

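    /// Tokenize the statement and produce a vector of tokens. A sketch of the
    /// output, mirroring the `tokenize_select_1` test below (assuming the
    /// crate is consumed as `sqlparser`):
    ///
    /// ```
    /// # use sqlparser::dialect::GenericDialect;
    /// # use sqlparser::tokenizer::{Token, Tokenizer, Whitespace};
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 1").tokenize().unwrap();
    /// assert_eq!(
    ///     tokens,
    ///     vec![
    ///         Token::make_keyword("SELECT"),
    ///         Token::Whitespace(Whitespace::Space),
    ///         Token::Number("1".to_string(), false),
    ///     ]
    /// );
    /// ```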
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;

        let mut tokens: Vec<Token> = Vec::with_capacity(twl.len());
        for token_with_location in twl {
            tokens.push(token_with_location.token);
        }
        Ok(tokens)
    }

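    /// Tokenize the statement and produce a vector of tokens with their
    /// starting locations. A minimal sketch, mirroring the
    /// `tokenize_with_location` test below (assuming the crate is consumed as
    /// `sqlparser`):
    ///
    /// ```
    /// # use sqlparser::dialect::GenericDialect;
    /// # use sqlparser::tokenizer::{Token, Tokenizer, TokenWithLocation};
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT a")
    ///     .tokenize_with_location()
    ///     .unwrap();
    /// // The SELECT keyword starts at line 1, column 1.
    /// assert_eq!(
    ///     tokens[0],
    ///     TokenWithLocation::new(Token::make_keyword("SELECT"), 1, 1)
    /// );
    /// ```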
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut tokens: Vec<TokenWithLocation> = vec![];

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state)? {
            tokens.push(TokenWithLocation { token, location });

            location = state.location();
        }
        Ok(tokens)
    }

    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // A "word" made up entirely of digits and dots is really a number;
        // re-tokenize it as such rather than emitting a Word token.
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }

    /// Get the next token, or return `None` at end of input.
    fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // BigQuery uses b or B for byte string literals
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            let s = self.tokenize_quoted_string(chars, '\"')?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "b" or "B"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // BigQuery uses r or R for raw string literals
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::RawStringLiteral(s)))
                        }
                        Some('\"') => {
                            let s = self.tokenize_quoted_string(chars, '\"')?;
                            Ok(Some(Token::RawStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "r" or "R"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // N'...' - a national string literal
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "N" or "n"
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // PostgreSQL-style "escape" string literal, e.g. E'...'
                x @ 'e' | x @ 'E' => {
                    let starting_loc = chars.location();
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "e" or "E"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // X'...' - a hexadecimal string literal
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s = self.tokenize_quoted_string(chars, '\'')?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "x" or "X"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // single-quoted string literal
                '\'' => {
                    let s = self.tokenize_quoted_string(chars, '\'')?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // double-quoted string in dialects where '"' is not an identifier quote
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    let s = self.tokenize_quoted_string(chars, '"')?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start
                    if self.dialect.is_delimited_identifier_start(ch)
                        && self
                            .dialect
                            .is_proper_identifier_inside_quotes(chars.peekable.clone()) =>
                {
                    let error_loc = chars.location();
                    chars.next(); // consume the opening quote
                    let quote_end = Word::matching_end_quote(quote_start);
                    let (s, last_char) = self.parse_quoted_ident(chars, quote_end);

                    if last_char == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        )
                    }
                }
                // numbers and period
                '0'..='9' | '.' => {
                    let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // match hex literals that start with 0x
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_take_while(chars, |ch| {
                            matches!(ch, '0'..='9' | 'A'..='F' | 'a'..='f')
                        });
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }
                    s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // No number -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // Parse an optional exponent
                    let mut exponent_part = String::new();
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        // Optional sign
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // Definitely an exponent; bring the original
                            // iterator up to speed and use it
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            // Not an exponent; discard the work done
                            _ => (),
                        }
                    }

                    // MySQL and Hive support identifiers that start with a
                    // numeric prefix, as long as they aren't exponent numbers.
                    if dialect_of!(self is MySqlDialect | HiveDialect) && exponent_part.is_empty() {
                        let word =
                            peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                        if !word.is_empty() {
                            s += word.as_str();
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }

                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'
                    match chars.peek() {
                        Some('-') => {
                            chars.next(); // consume the second '-', starting a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "--".to_owned(),
                                comment,
                            })))
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    chars.next();
                                    Ok(Some(Token::LongArrow))
                                }
                                _ => Ok(Some(Token::Arrow)),
                            }
                        }
                        // a regular '-' operator
                        _ => Ok(Some(Token::Minus)),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => {
                    chars.next(); // advance past '%'
                    match chars.peek() {
                        Some(' ') => Ok(Some(Token::Mod)),
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::Mod)),
                    }
                }
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_and_return(chars, Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => self.consume_and_return(chars, Token::PGCubeRoot),
                                _ => Ok(Some(Token::StringConcat)),
                            }
                        }
                        // a regular '|' operator
                        _ => Ok(Some(Token::Pipe)),
                    }
                }
                '=' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_and_return(chars, Token::Spaceship),
                                _ => Ok(Some(Token::LtEq)),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::Neq),
                        Some('<') => self.consume_and_return(chars, Token::ShiftLeft),
                        Some('@') => self.consume_and_return(chars, Token::ArrowAt),
                        _ => Ok(Some(Token::Lt)),
                    }
                }
                '>' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::GtEq),
                        Some('>') => self.consume_and_return(chars, Token::ShiftRight),
                        _ => Ok(Some(Token::Gt)),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::DuckAssignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => {
                    chars.next(); // consume the '&'
                    match chars.peek() {
                        Some('&') => self.consume_and_return(chars, Token::Overlap),
                        // a regular '&' operator
                        _ => Ok(Some(Token::Ampersand)),
                    }
                }
                '^' => self.consume_and_return(chars, Token::Caret),
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect) => {
                    chars.next(); // consume the '#', starting a single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('*') => self.consume_and_return(chars, Token::TildeAsterisk),
                        _ => Ok(Some(Token::Tilde)),
                    }
                }
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_and_return(chars, Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    chars.next();
                                    Ok(Some(Token::HashLongArrow))
                                }
                                _ => Ok(Some(Token::HashArrow)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::Sharp)),
                        Some(sch) if self.dialect.is_identifier_start('#') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::Sharp)),
                    }
                }
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => {
                            chars.next();
                            match chars.peek() {
                                Some(' ') => Ok(Some(Token::AtAt)),
                                Some(tch) if self.dialect.is_identifier_start('@') => {
                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
                                }
                                _ => Ok(Some(Token::AtAt)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::AtSign)),
                        Some(sch) if self.dialect.is_identifier_start('@') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                // `?` or `?n` - a positional placeholder
                '?' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
                }

                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    self.tokenize_identifier_or_keyword([ch], chars)
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                // whitespace check (including unicode chars) should be last
                // as it covers some of the chars above
                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

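    /// Tokenize dollar-quoted strings and dollar placeholders, with the
    /// leading `$` about to be consumed. Illustrative inputs and outputs
    /// (derived from the branches below):
    ///
    /// ```text
    /// $$hello$$        -> DollarQuotedString { value: "hello", tag: None }
    /// $tag$hello$tag$  -> DollarQuotedString { value: "hello", tag: Some("tag") }
    /// $1               -> Placeholder("$1")
    /// ```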
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        chars.next(); // consume the leading '$'

        if let Some('$') = chars.peek() {
            // Untagged form: $$ ... $$
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Tagged form ($tag$ ... $tag$) or a placeholder like $1
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric() || ch == '_'
            }));

            if let Some('$') = chars.peek() {
                chars.next();
                s.push_str(&peeking_take_while(chars, |ch| ch != '$'));

                match chars.peek() {
                    Some('$') => {
                        chars.next();
                        // The closing tag must match the opening one
                        for c in value.chars() {
                            let next_char = chars.next();
                            if Some(c) != next_char {
                                return self.tokenizer_error(
                                    chars.location(),
                                    format!(
                                        "Unterminated dollar-quoted string at or near \"{value}\""
                                    ),
                                );
                            }
                        }

                        if let Some('$') = chars.peek() {
                            chars.next();
                        } else {
                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted string, expected $",
                            );
                        }
                    }
                    _ => {
                        return self.tokenizer_error(
                            chars.location(),
                            "Unterminated dollar-quoted, expected $",
                        );
                    }
                }
            } else {
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }

    fn tokenizer_error<R>(
        &self,
        loc: Location,
        message: impl Into<String>,
    ) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            location: loc,
        })
    }

    // Consume characters until newline
    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
        let mut comment = peeking_take_while(chars, |ch| ch != '\n');
        if let Some(ch) = chars.next() {
            assert_eq!(ch, '\n');
            comment.push(ch);
        }
        comment
    }

    /// Tokenize an identifier or keyword, after the first char is already consumed.
    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
        let mut s = first_chars.into();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

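    /// Read a single-quoted string with backslash escapes (the E'...' form);
    /// the caller has consumed the `E` and this function consumes the opening
    /// quote itself. Illustrative behavior, derived from the match arms below:
    ///
    /// ```text
    /// E'a\tb'   -> "a<TAB>b"   (\t, \n, \r are translated)
    /// E'it''s'  -> "it's"      (a doubled quote yields one quote)
    /// ```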
    fn tokenize_escaped_single_quoted_string(
        &self,
        starting_loc: Location,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();

        chars.next(); // consume the opening quote

        let mut is_escaped = false;
        while let Some(&ch) = chars.peek() {
            macro_rules! escape_control_character {
                ($ESCAPED:expr) => {{
                    if is_escaped {
                        s.push($ESCAPED);
                        is_escaped = false;
                    } else {
                        s.push(ch);
                    }

                    chars.next();
                }};
            }

            match ch {
                '\'' => {
                    chars.next(); // consume
                    if is_escaped {
                        s.push(ch);
                        is_escaped = false;
                    } else if chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                        s.push(ch);
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' => {
                    if is_escaped {
                        s.push('\\');
                        is_escaped = false;
                    } else {
                        is_escaped = true;
                    }

                    chars.next();
                }
                'r' => escape_control_character!('\r'),
                'n' => escape_control_character!('\n'),
                't' => escape_control_character!('\t'),
                _ => {
                    is_escaped = false;
                    chars.next(); // consume
                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
    }

    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        quote_style: char,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        chars.next(); // consume the opening quote

        while let Some(&ch) = chars.peek() {
            match ch {
                char if char == quote_style => {
                    chars.next(); // consume
                    if chars.peek().map(|c| *c == quote_style).unwrap_or(false) {
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' => {
                    // consume
                    chars.next();
                    // backslash escaping is specific to the MySQL dialect
                    if dialect_of!(self is MySqlDialect) {
                        if let Some(next) = chars.peek() {
                            if !self.unescape {
                                // In no-escape mode, the given query has to be
                                // saved completely, including backslashes.
                                s.push(ch);
                                s.push(*next);
                                chars.next(); // consume next
                            } else {
                                // See https://dev.mysql.com/doc/refman/8.0/en/string-literals.html#character-escape-sequences
                                let n = match next {
                                    '\'' | '\"' | '\\' | '%' | '_' => *next,
                                    '0' => '\0',
                                    'b' => '\u{8}',
                                    'n' => '\n',
                                    'r' => '\r',
                                    't' => '\t',
                                    'Z' => '\u{1a}',
                                    _ => *next,
                                };
                                s.push(n);
                                chars.next(); // consume next
                            }
                        }
                    } else {
                        s.push(ch);
                    }
                }
                _ => {
                    chars.next(); // consume
                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }

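    /// Consume a multi-line comment, with the opening `/*` already consumed.
    /// A depth counter tracks nesting, so the following input is a single
    /// comment token (see the `tokenize_nested_multiline_comment` test):
    ///
    /// ```text
    /// /* outer /* inner */ still outer */
    /// ```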
    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
        let mut last_ch = ' ';

        loop {
            match chars.next() {
                Some(ch) => {
                    if last_ch == '/' && ch == '*' {
                        nested += 1;
                    } else if last_ch == '*' && ch == '/' {
                        nested -= 1;
                        if nested == 0 {
                            s.pop();
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        }
                    }
                    s.push(ch);
                    last_ch = ch;
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    )
                }
            }
        }
    }

    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
        let mut last_char = None;
        let mut s = String::new();
        while let Some(ch) = chars.next() {
            if ch == quote_end {
                if chars.peek() == Some(&quote_end) {
                    chars.next();
                    s.push(ch);
                    if !self.unescape {
                        // In no-escape mode, the given query has to be saved completely
                        s.push(ch);
                    }
                } else {
                    last_char = Some(quote_end);
                    break;
                }
            } else {
                s.push(ch);
            }
        }
        (s, last_char)
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut State,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

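/// Read from `chars` until `predicate` returns false or EOF is hit; only
/// matching characters are consumed, so the first non-matching character is
/// left in the stream. Illustrative behavior:
///
/// ```text
/// input: "123abc", predicate: is_ascii_digit  ->  returns "123", leaves "abc"
/// ```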
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dialect::{ClickHouseDialect, GenericDialect, MsSqlDialect};

    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column 1");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_clickhouse_double_equal() {
        let sql = String::from("SELECT foo=='1'");
        let dialect = ClickHouseDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "foo".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::DoubleEq,
            Token::SingleQuotedString("1".to_string()),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }

    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "this is a comment\n".to_string(),
            }),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_nested_multiline_comment() {
        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                location: Location { line: 1, column: 1 },
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_snowflake_div() {
        let sql = r#"field/1000"#;
        let dialect = SnowflakeDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word(r#"field"#, None),
            Token::Div,
            Token::Number("1000".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithLocation::new(Token::make_keyword("SELECT"), 1, 1),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 1, 7),
            TokenWithLocation::new(Token::make_word("a", None), 1, 8),
            TokenWithLocation::new(Token::Comma, 1, 9),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Newline), 1, 10),
            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 2, 1),
            TokenWithLocation::new(Token::make_word("b", None), 2, 2),
        ];
        compare(expected, tokens);
    }

    fn compare<T: PartialEq + std::fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        assert_eq!(expected, actual);
    }
}