sqltk_parser/
tokenizer.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! SQL Tokenizer
19//!
20//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
21//!
22//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
23
24#[cfg(not(feature = "std"))]
25use alloc::{
26    borrow::ToOwned,
27    format,
28    string::{String, ToString},
29    vec,
30    vec::Vec,
31};
32use core::fmt;
33use core::iter::Peekable;
34use core::num::NonZeroU8;
35use core::str::Chars;
36
37#[cfg(feature = "serde")]
38use serde::{Deserialize, Serialize};
39
40#[cfg(feature = "visitor")]
41use sqltk_parser_derive::{Visit, VisitMut};
42
43use crate::ast::DollarQuotedString;
44use crate::dialect::Dialect;
45use crate::dialect::{
46    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
47    SnowflakeDialect,
48};
49use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
50
/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal. The boolean flag is true when the number
    /// carries a trailing `L` ("long") suffix, which `Display` re-emits.
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted strings: Example '''abc'''
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedString(String),
    /// Triple double quoted strings: Example """abc"""
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string' (note that some backends, such as
    /// PostgreSQL, may treat this syntax as a bit string literal instead, i.e: b'10010101')
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedByteStringLiteral(String),
    /// Single quoted literal with raw string prefix. Example `R'abc'`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    SingleQuotedRawStringLiteral(String),
    /// Double quoted literal with raw string prefix. Example `R"abc"`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedRawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
    EscapedStringLiteral(String),
    /// Unicode string literal: i.e: U&'first \000A second'
    UnicodeStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Spaceship operator <=>
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Integer division operator `//` in DuckDB
    DuckIntDiv,
    /// Modulo Operator `%`
    Mod,
    /// String concatenation `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Assignment `:=` (used for keyword argument in DuckDB macros and some functions, and for variable declarations in DuckDB and Snowflake)
    Assignment,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
    /// Sharp `#` used for PostgreSQL Bitwise XOR operator
    Sharp,
    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
    Tilde,
    /// `~*` , a case insensitive match regular expression operator in PostgreSQL
    TildeAsterisk,
    /// `!~` , a case sensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*` , a case insensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `~~`, a case sensitive match pattern operator in PostgreSQL
    DoubleTilde,
    /// `~~*`, a case insensitive match pattern operator in PostgreSQL
    DoubleTildeAsterisk,
    /// `!~~`, a case sensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTilde,
    /// `!~~*`, a case insensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTildeAsterisk,
    /// `<<`, a bitwise shift left operator in PostgreSQL
    ShiftLeft,
    /// `>>`, a bitwise shift right operator in PostgreSQL
    ShiftRight,
    /// `&&`, an overlap operator in PostgreSQL
    Overlap,
    /// Exclamation Mark `!` used for PostgreSQL factorial operator
    ExclamationMark,
    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// AtSign `@` used for PostgreSQL abs operator
    AtSign,
    /// `^@`, a "starts with" string operator in PostgreSQL
    CaretAt,
    /// `|/`, a square root math operator in PostgreSQL
    PGSquareRoot,
    /// `||/`, a cube root math operator in PostgreSQL
    PGCubeRoot,
    /// `?` or `$` , a prepared statement arg placeholder
    Placeholder(String),
    /// `->`, used as a operator to extract json field in PostgreSQL
    Arrow,
    /// `->>`, used as a operator to extract json field as text in PostgreSQL
    LongArrow,
    /// `#>`, extracts JSON sub-object at the specified path
    HashArrow,
    /// `#>>`, extracts JSON sub-object at the specified path as text
    HashLongArrow,
    /// jsonb @> jsonb -> boolean: Test whether left json contains the right json
    AtArrow,
    /// jsonb <@ jsonb -> boolean: Test whether right json contains the left json
    ArrowAt,
    /// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified
    /// path, where path elements can be either field keys or array indexes.
    HashMinus,
    /// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified
    /// JSON value?
    AtQuestion,
    /// jsonb @@ jsonpath → boolean: Returns the result of a JSON path predicate check
    /// for the specified JSON value. Only the first item of the result is taken into
    /// account. If the result is not Boolean, then NULL is returned.
    AtAt,
    /// jsonb ? text -> boolean: Checks whether the string exists as a top-level key within the
    /// jsonb object
    Question,
    /// jsonb ?& text[] -> boolean: Check whether all members of the text array exist as top-level
    /// keys within the jsonb object
    QuestionAnd,
    /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level
    /// keys within the jsonb object
    QuestionPipe,
    /// Custom binary operator
    /// This is used to represent any custom binary operator that is not part of the SQL standard.
    /// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
    CustomBinaryOperator(String),
}
247
impl fmt::Display for Token {
    // Renders the token back as SQL text. Quoted/prefixed literals get their
    // delimiters re-attached; the stored inner value is written verbatim
    // between the delimiters (no re-escaping is performed here).
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            // The `bool` flag marks a long literal, rendered with an `L` suffix.
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::Assignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
            Token::Question => write!(f, "?"),
            Token::QuestionAnd => write!(f, "?&"),
            Token::QuestionPipe => write!(f, "?|"),
            Token::CustomBinaryOperator(s) => f.write_str(s),
        }
    }
}
340
341impl Token {
342    pub fn make_keyword(keyword: &str) -> Self {
343        Token::make_word(keyword, None)
344    }
345
346    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
347        let word_uppercase = word.to_uppercase();
348        Token::Word(Word {
349            value: word.to_string(),
350            quote_style,
351            keyword: if quote_style.is_none() {
352                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
353                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
354            } else {
355                Keyword::NoKeyword
356            },
357        })
358    }
359}
360
/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with the
    /// escape sequences (if any) processed (TODO: escapes are not handled)
    pub value: String,
    /// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
    ///
    /// Only `"`, `[` and `` ` `` are accepted by this type's `Display` impl;
    /// other values cause a panic when formatting.
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this will have one of the values from dialect::keywords, otherwise empty
    pub keyword: Keyword,
}
377
378impl fmt::Display for Word {
379    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
380        match self.quote_style {
381            Some(s) if s == '"' || s == '[' || s == '`' => {
382                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
383            }
384            None => f.write_str(&self.value),
385            _ => panic!("Unexpected quote_style!"),
386        }
387    }
388}
389
390impl Word {
391    fn matching_end_quote(ch: char) -> char {
392        match ch {
393            '"' => '"', // ANSI and most dialects
394            '[' => ']', // MS SQL
395            '`' => '`', // MySQL
396            _ => panic!("unexpected quoting style!"),
397        }
398    }
399}
400
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    /// A single space character
    Space,
    /// A newline; also emitted for `\r` and `\r\n` sequences
    Newline,
    /// A single tab character
    Tab,
    /// A single-line comment: `prefix` is the introducer (e.g. `--`) and
    /// `comment` is the remaining text of the line
    SingleLineComment { comment: String, prefix: String },
    /// The body of a multi-line comment, re-wrapped in `/* ... */` on display
    MultiLineComment(String),
}
411
412impl fmt::Display for Whitespace {
413    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
414        match self {
415            Whitespace::Space => f.write_str(" "),
416            Whitespace::Newline => f.write_str("\n"),
417            Whitespace::Tab => f.write_str("\t"),
418            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
419            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
420        }
421    }
422}
423
/// Location in input string
///
/// A `line` of 0 is used as a sentinel for "no location" (see
/// `TokenWithLocation::wrap`) and is rendered as an empty string.
#[derive(Debug, Eq, PartialEq, Clone, Copy)]
pub struct Location {
    /// Line number, starting from 1
    pub line: u64,
    /// Line column, starting from 1
    pub column: u64,
}
432
433impl fmt::Display for Location {
434    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
435        if self.line == 0 {
436            return Ok(());
437        }
438        write!(
439            f,
440            // TODO: use standard compiler location syntax (<path>:<line>:<col>)
441            " at Line: {}, Column: {}",
442            self.line, self.column,
443        )
444    }
445}
446
/// A [Token] with [Location] attached to it
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct TokenWithLocation {
    /// The token itself
    pub token: Token,
    /// Position in the input where the token starts
    pub location: Location,
}
453
454impl TokenWithLocation {
455    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
456        TokenWithLocation {
457            token,
458            location: Location { line, column },
459        }
460    }
461
462    pub fn wrap(token: Token) -> TokenWithLocation {
463        TokenWithLocation::new(token, 0, 0)
464    }
465}
466
467impl PartialEq<Token> for TokenWithLocation {
468    fn eq(&self, other: &Token) -> bool {
469        &self.token == other
470    }
471}
472
473impl PartialEq<TokenWithLocation> for Token {
474    fn eq(&self, other: &TokenWithLocation) -> bool {
475        self == &other.token
476    }
477}
478
479impl fmt::Display for TokenWithLocation {
480    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
481        self.token.fmt(f)
482    }
483}
484
/// Tokenizer error
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    /// Human-readable description of what went wrong
    pub message: String,
    /// Position in the input where tokenization failed
    pub location: Location,
}
491
492impl fmt::Display for TokenizerError {
493    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
494        write!(f, "{}{}", self.message, self.location,)
495    }
496}
497
// `std::error::Error` is only available with the standard library; the crate
// also supports `no_std` builds (see the `alloc` imports at the top of this file).
#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}
500
/// Tokenizer input stream: a peekable character iterator plus the current
/// line/column position (both 1-based; see `tokenize_with_location_into_buf`).
struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    /// Current line number, starting from 1
    pub line: u64,
    /// Current column number, starting from 1
    pub col: u64,
}
506
507impl<'a> State<'a> {
508    /// return the next character and advance the stream
509    pub fn next(&mut self) -> Option<char> {
510        match self.peekable.next() {
511            None => None,
512            Some(s) => {
513                if s == '\n' {
514                    self.line += 1;
515                    self.col = 1;
516                } else {
517                    self.col += 1;
518                }
519                Some(s)
520            }
521        }
522    }
523
524    /// return the next character but do not advance the stream
525    pub fn peek(&mut self) -> Option<&char> {
526        self.peekable.peek()
527    }
528
529    pub fn location(&self) -> Location {
530        Location {
531            line: self.line,
532            column: self.col,
533        }
534    }
535}
536
/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// e.g. `"abc"`, `'abc'`, `r'abc'`
    One,
    /// e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''` — the value is the exact
    /// number of quote characters on each side (3 in these examples)
    Many(NonZeroU8),
}
545
/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
    /// The character used to quote the string.
    quote_style: char,
    /// Represents how many quotes characters enclose the string literal.
    num_quote_chars: NumStringQuoteChars,
    /// The number of opening quotes left to consume, before parsing
    /// the remaining string literal.
    /// For example: given initial string `"""abc"""`. If the caller has
    /// already parsed the first quote for some reason, then this value
    /// is set to 1, flagging to look to consume only 2 leading quotes.
    num_opening_quotes_to_consume: u8,
    /// True if the string uses backslash escaping of special characters
    /// e.g `'abc\ndef\'ghi'`
    backslash_escape: bool,
}
562
/// SQL Tokenizer
pub struct Tokenizer<'a> {
    /// Dialect that controls dialect-specific token forms (triple-quoted
    /// strings, byte/raw literals, etc.)
    dialect: &'a dyn Dialect,
    /// The SQL text being tokenized
    query: &'a str,
    /// If true (the default), the tokenizer will un-escape literal
    /// SQL strings See [`Tokenizer::with_unescape`] for more details.
    unescape: bool,
}
571
572impl<'a> Tokenizer<'a> {
573    /// Create a new SQL tokenizer for the specified SQL statement
574    ///
575    /// ```
576    /// # use sqltk_parser::tokenizer::{Token, Whitespace, Tokenizer};
577    /// # use sqltk_parser::dialect::GenericDialect;
578    /// # let dialect = GenericDialect{};
579    /// let query = r#"SELECT 'foo'"#;
580    ///
581    /// // Parsing the query
582    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
583    ///
584    /// assert_eq!(tokens, vec![
585    ///   Token::make_word("SELECT", None),
586    ///   Token::Whitespace(Whitespace::Space),
587    ///   Token::SingleQuotedString("foo".to_string()),
588    /// ]);
589    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
590        Self {
591            dialect,
592            query,
593            unescape: true,
594        }
595    }
596
597    /// Set unescape mode
598    ///
599    /// When true (default) the tokenizer unescapes literal values
600    /// (for example, `""` in SQL is unescaped to the literal `"`).
601    ///
602    /// When false, the tokenizer provides the raw strings as provided
603    /// in the query.  This can be helpful for programs that wish to
604    /// recover the *exact* original query text without normalizing
605    /// the escaping
606    ///
607    /// # Example
608    ///
609    /// ```
610    /// # use sqltk_parser::tokenizer::{Token, Tokenizer};
611    /// # use sqltk_parser::dialect::GenericDialect;
612    /// # let dialect = GenericDialect{};
613    /// let query = r#""Foo "" Bar""#;
614    /// let unescaped = Token::make_word(r#"Foo " Bar"#, Some('"'));
615    /// let original  = Token::make_word(r#"Foo "" Bar"#, Some('"'));
616    ///
617    /// // Parsing with unescaping (default)
618    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
619    /// assert_eq!(tokens, vec![unescaped]);
620    ///
621    /// // Parsing with unescape = false
622    /// let tokens = Tokenizer::new(&dialect, &query)
623    ///    .with_unescape(false)
624    ///    .tokenize().unwrap();
625    /// assert_eq!(tokens, vec![original]);
626    /// ```
627    pub fn with_unescape(mut self, unescape: bool) -> Self {
628        self.unescape = unescape;
629        self
630    }
631
632    /// Tokenize the statement and produce a vector of tokens
633    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
634        let twl = self.tokenize_with_location()?;
635        Ok(twl.into_iter().map(|t| t.token).collect())
636    }
637
638    /// Tokenize the statement and produce a vector of tokens with location information
639    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
640        let mut tokens: Vec<TokenWithLocation> = vec![];
641        self.tokenize_with_location_into_buf(&mut tokens)
642            .map(|_| tokens)
643    }
644
645    /// Tokenize the statement and append tokens with location information into the provided buffer.
646    /// If an error is thrown, the buffer will contain all tokens that were successfully parsed before the error.
647    pub fn tokenize_with_location_into_buf(
648        &mut self,
649        buf: &mut Vec<TokenWithLocation>,
650    ) -> Result<(), TokenizerError> {
651        let mut state = State {
652            peekable: self.query.chars().peekable(),
653            line: 1,
654            col: 1,
655        };
656
657        let mut location = state.location();
658        while let Some(token) = self.next_token(&mut state)? {
659            buf.push(TokenWithLocation { token, location });
660
661            location = state.location();
662        }
663        Ok(())
664    }
665
    /// Tokenize an identifier or keyword whose first character(s) have already
    /// been peeked by the caller and are supplied in `ch`.
    ///
    /// If the collected word consists solely of ASCII digits and periods, it
    /// is emitted as a [`Token::Number`] instead of a word.
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // TODO: implement parsing of exponent here
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            // Re-scan the collected word through a throwaway `State`
            // (line/col are irrelevant here, hence 0/0)...
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            // ...then keep consuming any digits/periods still in the input.
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }
691
692    /// Get the next token or return None
693    fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
694        match chars.peek() {
695            Some(&ch) => match ch {
696                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
697                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
698                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
699                '\r' => {
700                    // Emit a single Whitespace::Newline token for \r and \r\n
701                    chars.next();
702                    if let Some('\n') = chars.peek() {
703                        chars.next();
704                    }
705                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
706                }
707                // BigQuery uses b or B for byte string literal
708                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
709                    chars.next(); // consume
710                    match chars.peek() {
711                        Some('\'') => {
712                            if self.dialect.supports_triple_quoted_string() {
713                                return self
714                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
715                                        chars,
716                                        '\'',
717                                        false,
718                                        Token::SingleQuotedByteStringLiteral,
719                                        Token::TripleSingleQuotedByteStringLiteral,
720                                    );
721                            }
722                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
723                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
724                        }
725                        Some('\"') => {
726                            if self.dialect.supports_triple_quoted_string() {
727                                return self
728                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
729                                        chars,
730                                        '"',
731                                        false,
732                                        Token::DoubleQuotedByteStringLiteral,
733                                        Token::TripleDoubleQuotedByteStringLiteral,
734                                    );
735                            }
736                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
737                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
738                        }
739                        _ => {
740                            // regular identifier starting with an "b" or "B"
741                            let s = self.tokenize_word(b, chars);
742                            Ok(Some(Token::make_word(&s, None)))
743                        }
744                    }
745                }
746                // BigQuery uses r or R for raw string literal
747                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
748                    chars.next(); // consume
749                    match chars.peek() {
750                        Some('\'') => self
751                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
752                                chars,
753                                '\'',
754                                false,
755                                Token::SingleQuotedRawStringLiteral,
756                                Token::TripleSingleQuotedRawStringLiteral,
757                            ),
758                        Some('\"') => self
759                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
760                                chars,
761                                '"',
762                                false,
763                                Token::DoubleQuotedRawStringLiteral,
764                                Token::TripleDoubleQuotedRawStringLiteral,
765                            ),
766                        _ => {
767                            // regular identifier starting with an "r" or "R"
768                            let s = self.tokenize_word(b, chars);
769                            Ok(Some(Token::make_word(&s, None)))
770                        }
771                    }
772                }
773                // Redshift uses lower case n for national string literal
774                n @ 'N' | n @ 'n' => {
775                    chars.next(); // consume, to check the next char
776                    match chars.peek() {
777                        Some('\'') => {
778                            // N'...' - a <national character string literal>
779                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
780                            Ok(Some(Token::NationalStringLiteral(s)))
781                        }
782                        _ => {
783                            // regular identifier starting with an "N"
784                            let s = self.tokenize_word(n, chars);
785                            Ok(Some(Token::make_word(&s, None)))
786                        }
787                    }
788                }
789                // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
790                x @ 'e' | x @ 'E' => {
791                    let starting_loc = chars.location();
792                    chars.next(); // consume, to check the next char
793                    match chars.peek() {
794                        Some('\'') => {
795                            let s =
796                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
797                            Ok(Some(Token::EscapedStringLiteral(s)))
798                        }
799                        _ => {
800                            // regular identifier starting with an "E" or "e"
801                            let s = self.tokenize_word(x, chars);
802                            Ok(Some(Token::make_word(&s, None)))
803                        }
804                    }
805                }
806                // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
807                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
808                    chars.next(); // consume, to check the next char
809                    if chars.peek() == Some(&'&') {
810                        // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
811                        let mut chars_clone = chars.peekable.clone();
812                        chars_clone.next(); // consume the '&' in the clone
813                        if chars_clone.peek() == Some(&'\'') {
814                            chars.next(); // consume the '&' in the original iterator
815                            let s = unescape_unicode_single_quoted_string(chars)?;
816                            return Ok(Some(Token::UnicodeStringLiteral(s)));
817                        }
818                    }
819                    // regular identifier starting with an "U" or "u"
820                    let s = self.tokenize_word(x, chars);
821                    Ok(Some(Token::make_word(&s, None)))
822                }
823                // The spec only allows an uppercase 'X' to introduce a hex
824                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
825                x @ 'x' | x @ 'X' => {
826                    chars.next(); // consume, to check the next char
827                    match chars.peek() {
828                        Some('\'') => {
829                            // X'...' - a <binary string literal>
830                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
831                            Ok(Some(Token::HexStringLiteral(s)))
832                        }
833                        _ => {
834                            // regular identifier starting with an "X"
835                            let s = self.tokenize_word(x, chars);
836                            Ok(Some(Token::make_word(&s, None)))
837                        }
838                    }
839                }
840                // single quoted string
841                '\'' => {
842                    if self.dialect.supports_triple_quoted_string() {
843                        return self
844                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
845                                chars,
846                                '\'',
847                                self.dialect.supports_string_literal_backslash_escape(),
848                                Token::SingleQuotedString,
849                                Token::TripleSingleQuotedString,
850                            );
851                    }
852                    let s = self.tokenize_single_quoted_string(
853                        chars,
854                        '\'',
855                        self.dialect.supports_string_literal_backslash_escape(),
856                    )?;
857
858                    Ok(Some(Token::SingleQuotedString(s)))
859                }
860                // double quoted string
861                '\"' if !self.dialect.is_delimited_identifier_start(ch)
862                    && !self.dialect.is_identifier_start(ch) =>
863                {
864                    if self.dialect.supports_triple_quoted_string() {
865                        return self
866                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
867                                chars,
868                                '"',
869                                self.dialect.supports_string_literal_backslash_escape(),
870                                Token::DoubleQuotedString,
871                                Token::TripleDoubleQuotedString,
872                            );
873                    }
874                    let s = self.tokenize_single_quoted_string(
875                        chars,
876                        '"',
877                        self.dialect.supports_string_literal_backslash_escape(),
878                    )?;
879
880                    Ok(Some(Token::DoubleQuotedString(s)))
881                }
882                // delimited (quoted) identifier
883                quote_start
884                    if self.dialect.is_delimited_identifier_start(ch)
885                        && self
886                            .dialect
887                            .is_proper_identifier_inside_quotes(chars.peekable.clone()) =>
888                {
889                    let error_loc = chars.location();
890                    chars.next(); // consume the opening quote
891                    let quote_end = Word::matching_end_quote(quote_start);
892                    let (s, last_char) = self.parse_quoted_ident(chars, quote_end);
893
894                    if last_char == Some(quote_end) {
895                        Ok(Some(Token::make_word(&s, Some(quote_start))))
896                    } else {
897                        self.tokenizer_error(
898                            error_loc,
899                            format!("Expected close delimiter '{quote_end}' before EOF."),
900                        )
901                    }
902                }
903                // numbers and period
904                '0'..='9' | '.' => {
905                    let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit());
906
907                    // match binary literal that starts with 0x
908                    if s == "0" && chars.peek() == Some(&'x') {
909                        chars.next();
910                        let s2 = peeking_take_while(chars, |ch| ch.is_ascii_hexdigit());
911                        return Ok(Some(Token::HexStringLiteral(s2)));
912                    }
913
914                    // match one period
915                    if let Some('.') = chars.peek() {
916                        s.push('.');
917                        chars.next();
918                    }
919                    s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());
920
921                    // No number -> Token::Period
922                    if s == "." {
923                        return Ok(Some(Token::Period));
924                    }
925
926                    let mut exponent_part = String::new();
927                    // Parse exponent as number
928                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
929                        let mut char_clone = chars.peekable.clone();
930                        exponent_part.push(char_clone.next().unwrap());
931
932                        // Optional sign
933                        match char_clone.peek() {
934                            Some(&c) if matches!(c, '+' | '-') => {
935                                exponent_part.push(c);
936                                char_clone.next();
937                            }
938                            _ => (),
939                        }
940
941                        match char_clone.peek() {
942                            // Definitely an exponent, get original iterator up to speed and use it
943                            Some(&c) if c.is_ascii_digit() => {
944                                for _ in 0..exponent_part.len() {
945                                    chars.next();
946                                }
947                                exponent_part +=
948                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
949                                s += exponent_part.as_str();
950                            }
951                            // Not an exponent, discard the work done
952                            _ => (),
953                        }
954                    }
955
956                    // mysql dialect supports identifiers that start with a numeric prefix,
957                    // as long as they aren't an exponent number.
958                    if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
959                        let word =
960                            peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
961
962                        if !word.is_empty() {
963                            s += word.as_str();
964                            return Ok(Some(Token::make_word(s.as_str(), None)));
965                        }
966                    }
967
968                    let long = if chars.peek() == Some(&'L') {
969                        chars.next();
970                        true
971                    } else {
972                        false
973                    };
974                    Ok(Some(Token::Number(s, long)))
975                }
976                // punctuation
977                '(' => self.consume_and_return(chars, Token::LParen),
978                ')' => self.consume_and_return(chars, Token::RParen),
979                ',' => self.consume_and_return(chars, Token::Comma),
980                // operators
981                '-' => {
982                    chars.next(); // consume the '-'
983                    match chars.peek() {
984                        Some('-') => {
985                            chars.next(); // consume the second '-', starting a single-line comment
986                            let comment = self.tokenize_single_line_comment(chars);
987                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
988                                prefix: "--".to_owned(),
989                                comment,
990                            })))
991                        }
992                        Some('>') => {
993                            chars.next();
994                            match chars.peek() {
995                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
996                                _ => self.start_binop(chars, "->", Token::Arrow),
997                            }
998                        }
999                        // a regular '-' operator
1000                        _ => self.start_binop(chars, "-", Token::Minus),
1001                    }
1002                }
1003                '/' => {
1004                    chars.next(); // consume the '/'
1005                    match chars.peek() {
1006                        Some('*') => {
1007                            chars.next(); // consume the '*', starting a multi-line comment
1008                            self.tokenize_multiline_comment(chars)
1009                        }
1010                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
1011                            chars.next(); // consume the second '/', starting a snowflake single-line comment
1012                            let comment = self.tokenize_single_line_comment(chars);
1013                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1014                                prefix: "//".to_owned(),
1015                                comment,
1016                            })))
1017                        }
1018                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
1019                            self.consume_and_return(chars, Token::DuckIntDiv)
1020                        }
1021                        // a regular '/' operator
1022                        _ => Ok(Some(Token::Div)),
1023                    }
1024                }
1025                '+' => self.consume_and_return(chars, Token::Plus),
1026                '*' => self.consume_and_return(chars, Token::Mul),
1027                '%' => {
1028                    chars.next(); // advance past '%'
1029                    match chars.peek() {
1030                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
1031                        Some(sch) if self.dialect.is_identifier_start('%') => {
1032                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1033                        }
1034                        _ => self.start_binop(chars, "%", Token::Mod),
1035                    }
1036                }
1037                '|' => {
1038                    chars.next(); // consume the '|'
1039                    match chars.peek() {
1040                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
1041                        Some('|') => {
1042                            chars.next(); // consume the second '|'
1043                            match chars.peek() {
1044                                Some('/') => {
1045                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
1046                                }
1047                                _ => self.start_binop(chars, "||", Token::StringConcat),
1048                            }
1049                        }
1050                        // Bitshift '|' operator
1051                        _ => self.start_binop(chars, "|", Token::Pipe),
1052                    }
1053                }
1054                '=' => {
1055                    chars.next(); // consume
1056                    match chars.peek() {
1057                        Some('>') => self.consume_and_return(chars, Token::RArrow),
1058                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
1059                        _ => Ok(Some(Token::Eq)),
1060                    }
1061                }
1062                '!' => {
1063                    chars.next(); // consume
1064                    match chars.peek() {
1065                        Some('=') => self.consume_and_return(chars, Token::Neq),
1066                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
1067                        Some('~') => {
1068                            chars.next();
1069                            match chars.peek() {
1070                                Some('*') => self
1071                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
1072                                Some('~') => {
1073                                    chars.next();
1074                                    match chars.peek() {
1075                                        Some('*') => self.consume_and_return(
1076                                            chars,
1077                                            Token::ExclamationMarkDoubleTildeAsterisk,
1078                                        ),
1079                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
1080                                    }
1081                                }
1082                                _ => Ok(Some(Token::ExclamationMarkTilde)),
1083                            }
1084                        }
1085                        _ => Ok(Some(Token::ExclamationMark)),
1086                    }
1087                }
1088                '<' => {
1089                    chars.next(); // consume
1090                    match chars.peek() {
1091                        Some('=') => {
1092                            chars.next();
1093                            match chars.peek() {
1094                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
1095                                _ => self.start_binop(chars, "<=", Token::LtEq),
1096                            }
1097                        }
1098                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
1099                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
1100                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
1101                        _ => self.start_binop(chars, "<", Token::Lt),
1102                    }
1103                }
1104                '>' => {
1105                    chars.next(); // consume
1106                    match chars.peek() {
1107                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
1108                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
1109                        _ => self.start_binop(chars, ">", Token::Gt),
1110                    }
1111                }
1112                ':' => {
1113                    chars.next();
1114                    match chars.peek() {
1115                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
1116                        Some('=') => self.consume_and_return(chars, Token::Assignment),
1117                        _ => Ok(Some(Token::Colon)),
1118                    }
1119                }
1120                ';' => self.consume_and_return(chars, Token::SemiColon),
1121                '\\' => self.consume_and_return(chars, Token::Backslash),
1122                '[' => self.consume_and_return(chars, Token::LBracket),
1123                ']' => self.consume_and_return(chars, Token::RBracket),
1124                '&' => {
1125                    chars.next(); // consume the '&'
1126                    match chars.peek() {
1127                        Some('&') => {
1128                            chars.next(); // consume the second '&'
1129                            self.start_binop(chars, "&&", Token::Overlap)
1130                        }
1131                        // Bitshift '&' operator
1132                        _ => self.start_binop(chars, "&", Token::Ampersand),
1133                    }
1134                }
1135                '^' => {
1136                    chars.next(); // consume the '^'
1137                    match chars.peek() {
1138                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
1139                        _ => Ok(Some(Token::Caret)),
1140                    }
1141                }
1142                '{' => self.consume_and_return(chars, Token::LBrace),
1143                '}' => self.consume_and_return(chars, Token::RBrace),
1144                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect) => {
1145                    chars.next(); // consume the '#', starting a snowflake single-line comment
1146                    let comment = self.tokenize_single_line_comment(chars);
1147                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1148                        prefix: "#".to_owned(),
1149                        comment,
1150                    })))
1151                }
1152                '~' => {
1153                    chars.next(); // consume
1154                    match chars.peek() {
1155                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
1156                        Some('~') => {
1157                            chars.next();
1158                            match chars.peek() {
1159                                Some('*') => {
1160                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
1161                                }
1162                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
1163                            }
1164                        }
1165                        _ => self.start_binop(chars, "~", Token::Tilde),
1166                    }
1167                }
1168                '#' => {
1169                    chars.next();
1170                    match chars.peek() {
1171                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
1172                        Some('>') => {
1173                            chars.next();
1174                            match chars.peek() {
1175                                Some('>') => {
1176                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
1177                                }
1178                                _ => self.start_binop(chars, "#>", Token::HashArrow),
1179                            }
1180                        }
1181                        Some(' ') => Ok(Some(Token::Sharp)),
1182                        Some(sch) if self.dialect.is_identifier_start('#') => {
1183                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1184                        }
1185                        _ => self.start_binop(chars, "#", Token::Sharp),
1186                    }
1187                }
1188                '@' => {
1189                    chars.next();
1190                    match chars.peek() {
1191                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
1192                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
1193                        Some('@') => {
1194                            chars.next();
1195                            match chars.peek() {
1196                                Some(' ') => Ok(Some(Token::AtAt)),
1197                                Some(tch) if self.dialect.is_identifier_start('@') => {
1198                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
1199                                }
1200                                _ => Ok(Some(Token::AtAt)),
1201                            }
1202                        }
1203                        Some(' ') => Ok(Some(Token::AtSign)),
1204                        Some(sch) if self.dialect.is_identifier_start('@') => {
1205                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1206                        }
1207                        _ => Ok(Some(Token::AtSign)),
1208                    }
1209                }
1210                // Postgres uses ? for jsonb operators, not prepared statements
1211                '?' if dialect_of!(self is PostgreSqlDialect) => {
1212                    chars.next();
1213                    match chars.peek() {
1214                        Some('|') => self.consume_and_return(chars, Token::QuestionPipe),
1215                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
1216                        _ => self.consume_and_return(chars, Token::Question),
1217                    }
1218                }
1219                '?' => {
1220                    chars.next();
1221                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
1222                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
1223                }
1224
1225                // identifier or keyword
1226                ch if self.dialect.is_identifier_start(ch) => {
1227                    self.tokenize_identifier_or_keyword([ch], chars)
1228                }
1229                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
1230
1231                //whitespace check (including unicode chars) should be last as it covers some of the chars above
1232                ch if ch.is_whitespace() => {
1233                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
1234                }
1235                other => self.consume_and_return(chars, Token::Char(other)),
1236            },
1237            None => Ok(None),
1238        }
1239    }
1240
1241    /// Consume the next character, then parse a custom binary operator. The next character should be included in the prefix
1242    fn consume_for_binop(
1243        &self,
1244        chars: &mut State,
1245        prefix: &str,
1246        default: Token,
1247    ) -> Result<Option<Token>, TokenizerError> {
1248        chars.next(); // consume the first char
1249        self.start_binop(chars, prefix, default)
1250    }
1251
1252    /// parse a custom binary operator
1253    fn start_binop(
1254        &self,
1255        chars: &mut State,
1256        prefix: &str,
1257        default: Token,
1258    ) -> Result<Option<Token>, TokenizerError> {
1259        let mut custom = None;
1260        while let Some(&ch) = chars.peek() {
1261            if !self.dialect.is_custom_operator_part(ch) {
1262                break;
1263            }
1264
1265            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
1266            chars.next();
1267        }
1268
1269        Ok(Some(
1270            custom.map(Token::CustomBinaryOperator).unwrap_or(default),
1271        ))
1272    }
1273
    /// Tokenize dollar preceded value (i.e: a string/placeholder)
    ///
    /// Handles the three `$`-introduced forms visible here:
    /// * `$$ ... $$`        — an anonymous (untagged) dollar-quoted string,
    /// * `$tag$ ... $tag$`  — a tagged dollar-quoted string,
    /// * `$name` / `$1`     — a placeholder (no `$` follows the tag chars).
    ///
    /// Returns a `TokenizerError` when a dollar-quoted string is left
    /// unterminated at EOF.
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        // `s` accumulates the quoted string body; `value` accumulates the
        // tag (or the placeholder name).
        let mut s = String::new();
        let mut value = String::new();

        chars.next(); // consume the leading '$'

        if let Some('$') = chars.peek() {
            // Untagged form: `$$ ... $$`.
            chars.next(); // consume the second '$'

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            // Scan for the closing `$$`. A lone '$' inside the body is
            // withheld for one iteration (via the `prev` lookbehind) until we
            // know whether the following char makes it a terminator.
            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        // Second consecutive '$': that's the terminator.
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        // The withheld '$' was ordinary content; emit it
                        // together with the current char.
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }
                // Note: a '$' whose predecessor was not '$' is intentionally
                // not pushed here — it is either emitted above on the next
                // iteration or consumed as part of the terminator.

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Read the tag (or placeholder name): alphanumerics and '_'.
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric() || ch == '_'
            }));

            if let Some('$') = chars.peek() {
                // Tagged form: `$tag$ ... $tag$`.
                chars.next(); // consume the '$' that closes the opening tag

                'searching_for_end: loop {
                    // Everything up to the next '$' is body; that '$' may
                    // start the closing delimiter.
                    s.push_str(&peeking_take_while(chars, |ch| ch != '$'));
                    match chars.peek() {
                        Some('$') => {
                            chars.next();
                            // Tentatively match "$" + tag + "$" against the
                            // input, buffering what we consume in `maybe_s`
                            // so it can be restored to `s` on a mismatch.
                            let mut maybe_s = String::from("$");
                            for c in value.chars() {
                                if let Some(next_char) = chars.next() {
                                    maybe_s.push(next_char);
                                    if next_char != c {
                                        // This doesn't match the dollar quote delimiter so this
                                        // is not the end of the string.
                                        s.push_str(&maybe_s);
                                        continue 'searching_for_end;
                                    }
                                } else {
                                    return self.tokenizer_error(
                                        chars.location(),
                                        "Unterminated dollar-quoted, expected $",
                                    );
                                }
                            }
                            if chars.peek() == Some(&'$') {
                                chars.next();
                                maybe_s.push('$');
                                // maybe_s matches the end delimiter
                                break 'searching_for_end;
                            } else {
                                // This also doesn't match the dollar quote delimiter as there are
                                // more characters before the second dollar so this is not the end
                                // of the string.
                                s.push_str(&maybe_s);
                                continue 'searching_for_end;
                            }
                        }
                        _ => {
                            // EOF before any closing '$'.
                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted, expected $",
                            )
                        }
                    }
                }
            } else {
                // No '$' after the tag chars: a placeholder like `$1`/`$name`.
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }
1374
1375    fn tokenizer_error<R>(
1376        &self,
1377        loc: Location,
1378        message: impl Into<String>,
1379    ) -> Result<R, TokenizerError> {
1380        Err(TokenizerError {
1381            message: message.into(),
1382            location: loc,
1383        })
1384    }
1385
1386    // Consume characters until newline
1387    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
1388        let mut comment = peeking_take_while(chars, |ch| ch != '\n');
1389        if let Some(ch) = chars.next() {
1390            assert_eq!(ch, '\n');
1391            comment.push(ch);
1392        }
1393        comment
1394    }
1395
1396    /// Tokenize an identifier or keyword, after the first char is already consumed.
1397    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
1398        let mut s = first_chars.into();
1399        s.push_str(&peeking_take_while(chars, |ch| {
1400            self.dialect.is_identifier_part(ch)
1401        }));
1402        s
1403    }
1404
1405    /// Read a single quoted string, starting with the opening quote.
1406    fn tokenize_escaped_single_quoted_string(
1407        &self,
1408        starting_loc: Location,
1409        chars: &mut State,
1410    ) -> Result<String, TokenizerError> {
1411        if let Some(s) = unescape_single_quoted_string(chars) {
1412            return Ok(s);
1413        }
1414
1415        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
1416    }
1417
1418    /// Reads a string literal quoted by a single or triple quote characters.
1419    /// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
1420    fn tokenize_single_or_triple_quoted_string<F>(
1421        &self,
1422        chars: &mut State,
1423        quote_style: char,
1424        backslash_escape: bool,
1425        single_quote_token: F,
1426        triple_quote_token: F,
1427    ) -> Result<Option<Token>, TokenizerError>
1428    where
1429        F: Fn(String) -> Token,
1430    {
1431        let error_loc = chars.location();
1432
1433        let mut num_opening_quotes = 0u8;
1434        for _ in 0..3 {
1435            if Some(&quote_style) == chars.peek() {
1436                chars.next(); // Consume quote.
1437                num_opening_quotes += 1;
1438            } else {
1439                break;
1440            }
1441        }
1442
1443        let (token_fn, num_quote_chars) = match num_opening_quotes {
1444            1 => (single_quote_token, NumStringQuoteChars::One),
1445            2 => {
1446                // If we matched double quotes, then this is an empty string.
1447                return Ok(Some(single_quote_token("".into())));
1448            }
1449            3 => {
1450                let Some(num_quote_chars) = NonZeroU8::new(3) else {
1451                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
1452                };
1453                (
1454                    triple_quote_token,
1455                    NumStringQuoteChars::Many(num_quote_chars),
1456                )
1457            }
1458            _ => {
1459                return self.tokenizer_error(error_loc, "invalid string literal opening");
1460            }
1461        };
1462
1463        let settings = TokenizeQuotedStringSettings {
1464            quote_style,
1465            num_quote_chars,
1466            num_opening_quotes_to_consume: 0,
1467            backslash_escape,
1468        };
1469
1470        self.tokenize_quoted_string(chars, settings)
1471            .map(token_fn)
1472            .map(Some)
1473    }
1474
1475    /// Reads a string literal quoted by a single quote character.
1476    fn tokenize_single_quoted_string(
1477        &self,
1478        chars: &mut State,
1479        quote_style: char,
1480        backslash_escape: bool,
1481    ) -> Result<String, TokenizerError> {
1482        self.tokenize_quoted_string(
1483            chars,
1484            TokenizeQuotedStringSettings {
1485                quote_style,
1486                num_quote_chars: NumStringQuoteChars::One,
1487                num_opening_quotes_to_consume: 1,
1488                backslash_escape,
1489            },
1490        )
1491    }
1492
    /// Read a quoted string.
    ///
    /// `settings` controls the quote character, how many opening quotes are
    /// still pending in `chars`, whether the literal is delimited by one quote
    /// char or several (e.g. triple-quoted strings), and whether backslash
    /// escapes are honored. Returns the string contents without the
    /// surrounding quotes, or an error if the literal is unterminated.
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        // Consume any opening quotes.
        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        // How many quote chars in a row have just been consumed into `s`;
        // used to recognize the closing delimiter of multi-quote strings.
        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            // Some(..) when one more quote char would complete the closing
            // delimiter; None while additional quote chars are still required.
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume

                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // For an initial string like `"""abc"""`, at this point we have
                        // `abc""` in the buffer and have now matched the final `"`.
                        // However, the string to return is simply `abc`, so we strip off
                        // the trailing quotes before returning.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        // A doubled quote ('') is an escaped quote character.
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        // Single closing quote: end of the string literal.
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    // consume backslash
                    chars.next();

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely including backslashes.
                            s.push(ch);
                            s.push(*next);
                            chars.next(); // consume next
                        } else {
                            // Translate recognized escape sequences; any other
                            // escaped character stands for itself.
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next(); // consume next
                        }
                    }
                }
                ch => {
                    chars.next(); // consume ch

                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        // EOF reached before the closing delimiter.
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }
1594
1595    fn tokenize_multiline_comment(
1596        &self,
1597        chars: &mut State,
1598    ) -> Result<Option<Token>, TokenizerError> {
1599        let mut s = String::new();
1600        let mut nested = 1;
1601        let mut last_ch = ' ';
1602
1603        loop {
1604            match chars.next() {
1605                Some(ch) => {
1606                    if last_ch == '/' && ch == '*' {
1607                        nested += 1;
1608                    } else if last_ch == '*' && ch == '/' {
1609                        nested -= 1;
1610                        if nested == 0 {
1611                            s.pop();
1612                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
1613                        }
1614                    }
1615                    s.push(ch);
1616                    last_ch = ch;
1617                }
1618                None => {
1619                    break self.tokenizer_error(
1620                        chars.location(),
1621                        "Unexpected EOF while in a multi-line comment",
1622                    )
1623                }
1624            }
1625        }
1626    }
1627
1628    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
1629        let mut last_char = None;
1630        let mut s = String::new();
1631        while let Some(ch) = chars.next() {
1632            if ch == quote_end {
1633                if chars.peek() == Some(&quote_end) {
1634                    chars.next();
1635                    s.push(ch);
1636                    if !self.unescape {
1637                        // In no-escape mode, the given query has to be saved completely
1638                        s.push(ch);
1639                    }
1640                } else {
1641                    last_char = Some(quote_end);
1642                    break;
1643                }
1644            } else {
1645                s.push(ch);
1646            }
1647        }
1648        (s, last_char)
1649    }
1650
1651    #[allow(clippy::unnecessary_wraps)]
1652    fn consume_and_return(
1653        &self,
1654        chars: &mut State,
1655        t: Token,
1656    ) -> Result<Option<Token>, TokenizerError> {
1657        chars.next();
1658        Ok(Some(t))
1659    }
1660}
1661
1662/// Read from `chars` until `predicate` returns `false` or EOF is hit.
1663/// Return the characters read as String, and keep the first non-matching
1664/// char available as `chars.next()`.
1665fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
1666    let mut s = String::new();
1667    while let Some(&ch) = chars.peek() {
1668        if predicate(ch) {
1669            chars.next(); // consume
1670            s.push(ch);
1671        } else {
1672            break;
1673        }
1674    }
1675    s
1676}
1677
/// Unescapes a single-quoted string literal (the opening quote is still
/// pending in `chars`). Returns `None` if the literal is unterminated or an
/// escape sequence is invalid.
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
    Unescape::new(chars).unescape()
}
1681
/// Helper that consumes characters from the tokenizer state while unescaping
/// a single-quoted string literal.
struct Unescape<'a: 'b, 'b> {
    chars: &'b mut State<'a>,
}
1685
1686impl<'a: 'b, 'b> Unescape<'a, 'b> {
1687    fn new(chars: &'b mut State<'a>) -> Self {
1688        Self { chars }
1689    }
1690    fn unescape(mut self) -> Option<String> {
1691        let mut unescaped = String::new();
1692
1693        self.chars.next();
1694
1695        while let Some(c) = self.chars.next() {
1696            if c == '\'' {
1697                // case: ''''
1698                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
1699                    self.chars.next();
1700                    unescaped.push('\'');
1701                    continue;
1702                }
1703                return Some(unescaped);
1704            }
1705
1706            if c != '\\' {
1707                unescaped.push(c);
1708                continue;
1709            }
1710
1711            let c = match self.chars.next()? {
1712                'b' => '\u{0008}',
1713                'f' => '\u{000C}',
1714                'n' => '\n',
1715                'r' => '\r',
1716                't' => '\t',
1717                'u' => self.unescape_unicode_16()?,
1718                'U' => self.unescape_unicode_32()?,
1719                'x' => self.unescape_hex()?,
1720                c if c.is_digit(8) => self.unescape_octal(c)?,
1721                c => c,
1722            };
1723
1724            unescaped.push(Self::check_null(c)?);
1725        }
1726
1727        None
1728    }
1729
1730    #[inline]
1731    fn check_null(c: char) -> Option<char> {
1732        if c == '\0' {
1733            None
1734        } else {
1735            Some(c)
1736        }
1737    }
1738
1739    #[inline]
1740    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
1741        // u32 is used here because Pg has an overflow operation rather than throwing an exception directly.
1742        match u32::from_str_radix(s, RADIX) {
1743            Err(_) => None,
1744            Ok(n) => {
1745                let n = n & 0xFF;
1746                if n <= 127 {
1747                    char::from_u32(n)
1748                } else {
1749                    None
1750                }
1751            }
1752        }
1753    }
1754
1755    // Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
1756    fn unescape_hex(&mut self) -> Option<char> {
1757        let mut s = String::new();
1758
1759        for _ in 0..2 {
1760            match self.next_hex_digit() {
1761                Some(c) => s.push(c),
1762                None => break,
1763            }
1764        }
1765
1766        if s.is_empty() {
1767            return Some('x');
1768        }
1769
1770        Self::byte_to_char::<16>(&s)
1771    }
1772
1773    #[inline]
1774    fn next_hex_digit(&mut self) -> Option<char> {
1775        match self.chars.peek() {
1776            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
1777            _ => None,
1778        }
1779    }
1780
1781    // Octal byte value. \o, \oo, \ooo (o = 0–7)
1782    fn unescape_octal(&mut self, c: char) -> Option<char> {
1783        let mut s = String::new();
1784
1785        s.push(c);
1786        for _ in 0..2 {
1787            match self.next_octal_digest() {
1788                Some(c) => s.push(c),
1789                None => break,
1790            }
1791        }
1792
1793        Self::byte_to_char::<8>(&s)
1794    }
1795
1796    #[inline]
1797    fn next_octal_digest(&mut self) -> Option<char> {
1798        match self.chars.peek() {
1799            Some(c) if c.is_digit(8) => self.chars.next(),
1800            _ => None,
1801        }
1802    }
1803
1804    // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
1805    fn unescape_unicode_16(&mut self) -> Option<char> {
1806        self.unescape_unicode::<4>()
1807    }
1808
1809    // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
1810    fn unescape_unicode_32(&mut self) -> Option<char> {
1811        self.unescape_unicode::<8>()
1812    }
1813
1814    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
1815        let mut s = String::new();
1816        for _ in 0..NUM {
1817            s.push(self.chars.next()?);
1818        }
1819        match u32::from_str_radix(&s, 16) {
1820            Err(_) => None,
1821            Ok(n) => char::from_u32(n),
1822        }
1823    }
1824}
1825
1826fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
1827    let mut unescaped = String::new();
1828    chars.next(); // consume the opening quote
1829    while let Some(c) = chars.next() {
1830        match c {
1831            '\'' => {
1832                if chars.peek() == Some(&'\'') {
1833                    chars.next();
1834                    unescaped.push('\'');
1835                } else {
1836                    return Ok(unescaped);
1837                }
1838            }
1839            '\\' => match chars.peek() {
1840                Some('\\') => {
1841                    chars.next();
1842                    unescaped.push('\\');
1843                }
1844                Some('+') => {
1845                    chars.next();
1846                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
1847                }
1848                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
1849            },
1850            _ => {
1851                unescaped.push(c);
1852            }
1853        }
1854    }
1855    Err(TokenizerError {
1856        message: "Unterminated unicode encoded string literal".to_string(),
1857        location: chars.location(),
1858    })
1859}
1860
1861fn take_char_from_hex_digits(
1862    chars: &mut State<'_>,
1863    max_digits: usize,
1864) -> Result<char, TokenizerError> {
1865    let mut result = 0u32;
1866    for _ in 0..max_digits {
1867        let next_char = chars.next().ok_or_else(|| TokenizerError {
1868            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
1869                .to_string(),
1870            location: chars.location(),
1871        })?;
1872        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
1873            message: format!("Invalid hex digit in escaped unicode string: {}", next_char),
1874            location: chars.location(),
1875        })?;
1876        result = result * 16 + digit;
1877    }
1878    char::from_u32(result).ok_or_else(|| TokenizerError {
1879        message: format!("Invalid unicode character: {:x}", result),
1880        location: chars.location(),
1881    })
1882}
1883
1884#[cfg(test)]
1885mod tests {
1886    use super::*;
1887    use crate::dialect::{
1888        BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect,
1889    };
1890    use core::fmt::Debug;
1891
    // TokenizerError implements Display (and std::error::Error with no source
    // under the "std" feature).
    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
    }
1905
    // Simplest query: keyword, whitespace, and an integer number token.
    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }
1920
    // A leading-dot float (`.1`) is a single Number token, not Period + digit.
    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }
1935
    // ClickHouse supports `==` as a single DoubleEq token.
    #[test]
    fn tokenize_clickhouse_double_equal() {
        let sql = String::from("SELECT foo=='1'");
        let dialect = ClickHouseDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "foo".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::DoubleEq,
            Token::SingleQuotedString("1".to_string()),
        ];

        compare(expected, tokens);
    }
1957
    // Exponent notation: `1e10`, `1e-10`, `1e+10` are single Number tokens;
    // a non-digit after `e` (as in `1ea`) splits into number + word, and a
    // second `-` after a complete exponent becomes a Minus operator.
    #[test]
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }
1991
    // Function call syntax: word followed by parenthesized argument list.
    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }
2009
    // `||` is tokenized as the StringConcat operator.
    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }
    // Single `|` and `^` are the bitwise Pipe and Caret operators.
    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }
2049
    // XOR, true, and false are all tokenized as keywords.
    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }
2089
    // Full SELECT with WHERE and LIMIT; whitespace tokens are preserved.
    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }
2120
    // EXPLAIN prefix is just another keyword token before the query.
    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }
2149
    // EXPLAIN ANALYZE prefix: two keyword tokens before the query.
    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }
2180
    // `!=` becomes a Neq token; the string literal keeps its inner spaces.
    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }
2207
    // Non-identifier Unicode (emoji) becomes a Char token; the Arabic text
    // plus trailing 'h' is accepted as a single word.
    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }
2222
    // Literal newlines (including \r\n) inside a quoted string are preserved.
    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }
2232
    // An unclosed quote errors, reporting the opening quote's location.
    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }
2247
2248    #[test]
2249    fn tokenize_unterminated_string_literal_utf8() {
2250        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");
2251
2252        let dialect = GenericDialect {};
2253        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2254        assert_eq!(
2255            tokenizer.tokenize(),
2256            Err(TokenizerError {
2257                message: "Unterminated string literal".to_string(),
2258                location: Location {
2259                    line: 1,
2260                    column: 35
2261                }
2262            })
2263        );
2264    }
2265
2266    #[test]
2267    fn tokenize_invalid_string_cols() {
2268        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");
2269
2270        let dialect = GenericDialect {};
2271        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2272        // println!("tokens: {:#?}", tokens);
2273        let expected = vec![
2274            Token::Whitespace(Whitespace::Newline),
2275            Token::Whitespace(Whitespace::Newline),
2276            Token::make_keyword("SELECT"),
2277            Token::Whitespace(Whitespace::Space),
2278            Token::Mul,
2279            Token::Whitespace(Whitespace::Space),
2280            Token::make_keyword("FROM"),
2281            Token::Whitespace(Whitespace::Space),
2282            Token::make_keyword("table"),
2283            Token::Whitespace(Whitespace::Tab),
2284            Token::Char('💝'),
2285            Token::make_word("مصطفىh", None),
2286        ];
2287        compare(expected, tokens);
2288    }
2289
2290    #[test]
2291    fn tokenize_dollar_quoted_string_tagged() {
2292        let sql = String::from(
2293            "SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$",
2294        );
2295        let dialect = GenericDialect {};
2296        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2297        let expected = vec![
2298            Token::make_keyword("SELECT"),
2299            Token::Whitespace(Whitespace::Space),
2300            Token::DollarQuotedString(DollarQuotedString {
2301                value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
2302                tag: Some("tag".into()),
2303            }),
2304        ];
2305        compare(expected, tokens);
2306    }
2307
2308    #[test]
2309    fn tokenize_dollar_quoted_string_tagged_unterminated() {
2310        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
2311        let dialect = GenericDialect {};
2312        assert_eq!(
2313            Tokenizer::new(&dialect, &sql).tokenize(),
2314            Err(TokenizerError {
2315                message: "Unterminated dollar-quoted, expected $".into(),
2316                location: Location {
2317                    line: 1,
2318                    column: 91
2319                }
2320            })
2321        );
2322    }
2323
2324    #[test]
2325    fn tokenize_dollar_quoted_string_untagged() {
2326        let sql =
2327            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
2328        let dialect = GenericDialect {};
2329        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2330        let expected = vec![
2331            Token::make_keyword("SELECT"),
2332            Token::Whitespace(Whitespace::Space),
2333            Token::DollarQuotedString(DollarQuotedString {
2334                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
2335                tag: None,
2336            }),
2337        ];
2338        compare(expected, tokens);
2339    }
2340
2341    #[test]
2342    fn tokenize_dollar_quoted_string_untagged_unterminated() {
2343        let sql = String::from(
2344            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
2345        );
2346        let dialect = GenericDialect {};
2347        assert_eq!(
2348            Tokenizer::new(&dialect, &sql).tokenize(),
2349            Err(TokenizerError {
2350                message: "Unterminated dollar-quoted string".into(),
2351                location: Location {
2352                    line: 1,
2353                    column: 86
2354                }
2355            })
2356        );
2357    }
2358
2359    #[test]
2360    fn tokenize_right_arrow() {
2361        let sql = String::from("FUNCTION(key=>value)");
2362        let dialect = GenericDialect {};
2363        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2364        let expected = vec![
2365            Token::make_word("FUNCTION", None),
2366            Token::LParen,
2367            Token::make_word("key", None),
2368            Token::RArrow,
2369            Token::make_word("value", None),
2370            Token::RParen,
2371        ];
2372        compare(expected, tokens);
2373    }
2374
2375    #[test]
2376    fn tokenize_is_null() {
2377        let sql = String::from("a IS NULL");
2378        let dialect = GenericDialect {};
2379        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2380
2381        let expected = vec![
2382            Token::make_word("a", None),
2383            Token::Whitespace(Whitespace::Space),
2384            Token::make_keyword("IS"),
2385            Token::Whitespace(Whitespace::Space),
2386            Token::make_keyword("NULL"),
2387        ];
2388
2389        compare(expected, tokens);
2390    }
2391
2392    #[test]
2393    fn tokenize_comment() {
2394        let sql = String::from("0--this is a comment\n1");
2395
2396        let dialect = GenericDialect {};
2397        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2398        let expected = vec![
2399            Token::Number("0".to_string(), false),
2400            Token::Whitespace(Whitespace::SingleLineComment {
2401                prefix: "--".to_string(),
2402                comment: "this is a comment\n".to_string(),
2403            }),
2404            Token::Number("1".to_string(), false),
2405        ];
2406        compare(expected, tokens);
2407    }
2408
2409    #[test]
2410    fn tokenize_comment_at_eof() {
2411        let sql = String::from("--this is a comment");
2412
2413        let dialect = GenericDialect {};
2414        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2415        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
2416            prefix: "--".to_string(),
2417            comment: "this is a comment".to_string(),
2418        })];
2419        compare(expected, tokens);
2420    }
2421
2422    #[test]
2423    fn tokenize_multiline_comment() {
2424        let sql = String::from("0/*multi-line\n* /comment*/1");
2425
2426        let dialect = GenericDialect {};
2427        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2428        let expected = vec![
2429            Token::Number("0".to_string(), false),
2430            Token::Whitespace(Whitespace::MultiLineComment(
2431                "multi-line\n* /comment".to_string(),
2432            )),
2433            Token::Number("1".to_string(), false),
2434        ];
2435        compare(expected, tokens);
2436    }
2437
2438    #[test]
2439    fn tokenize_nested_multiline_comment() {
2440        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
2441
2442        let dialect = GenericDialect {};
2443        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2444        let expected = vec![
2445            Token::Number("0".to_string(), false),
2446            Token::Whitespace(Whitespace::MultiLineComment(
2447                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
2448            )),
2449            Token::Number("1".to_string(), false),
2450        ];
2451        compare(expected, tokens);
2452    }
2453
2454    #[test]
2455    fn tokenize_multiline_comment_with_even_asterisks() {
2456        let sql = String::from("\n/** Comment **/\n");
2457
2458        let dialect = GenericDialect {};
2459        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2460        let expected = vec![
2461            Token::Whitespace(Whitespace::Newline),
2462            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
2463            Token::Whitespace(Whitespace::Newline),
2464        ];
2465        compare(expected, tokens);
2466    }
2467
2468    #[test]
2469    fn tokenize_unicode_whitespace() {
2470        let sql = String::from(" \u{2003}\n");
2471
2472        let dialect = GenericDialect {};
2473        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2474        let expected = vec![
2475            Token::Whitespace(Whitespace::Space),
2476            Token::Whitespace(Whitespace::Space),
2477            Token::Whitespace(Whitespace::Newline),
2478        ];
2479        compare(expected, tokens);
2480    }
2481
2482    #[test]
2483    fn tokenize_mismatched_quotes() {
2484        let sql = String::from("\"foo");
2485
2486        let dialect = GenericDialect {};
2487        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2488        assert_eq!(
2489            tokenizer.tokenize(),
2490            Err(TokenizerError {
2491                message: "Expected close delimiter '\"' before EOF.".to_string(),
2492                location: Location { line: 1, column: 1 },
2493            })
2494        );
2495    }
2496
2497    #[test]
2498    fn tokenize_newlines() {
2499        let sql = String::from("line1\nline2\rline3\r\nline4\r");
2500
2501        let dialect = GenericDialect {};
2502        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2503        let expected = vec![
2504            Token::make_word("line1", None),
2505            Token::Whitespace(Whitespace::Newline),
2506            Token::make_word("line2", None),
2507            Token::Whitespace(Whitespace::Newline),
2508            Token::make_word("line3", None),
2509            Token::Whitespace(Whitespace::Newline),
2510            Token::make_word("line4", None),
2511            Token::Whitespace(Whitespace::Newline),
2512        ];
2513        compare(expected, tokens);
2514    }
2515
2516    #[test]
2517    fn tokenize_mssql_top() {
2518        let sql = "SELECT TOP 5 [bar] FROM foo";
2519        let dialect = MsSqlDialect {};
2520        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2521        let expected = vec![
2522            Token::make_keyword("SELECT"),
2523            Token::Whitespace(Whitespace::Space),
2524            Token::make_keyword("TOP"),
2525            Token::Whitespace(Whitespace::Space),
2526            Token::Number(String::from("5"), false),
2527            Token::Whitespace(Whitespace::Space),
2528            Token::make_word("bar", Some('[')),
2529            Token::Whitespace(Whitespace::Space),
2530            Token::make_keyword("FROM"),
2531            Token::Whitespace(Whitespace::Space),
2532            Token::make_word("foo", None),
2533        ];
2534        compare(expected, tokens);
2535    }
2536
2537    #[test]
2538    fn tokenize_pg_regex_match() {
2539        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
2540        let dialect = GenericDialect {};
2541        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2542        let expected = vec![
2543            Token::make_keyword("SELECT"),
2544            Token::Whitespace(Whitespace::Space),
2545            Token::make_word("col", None),
2546            Token::Whitespace(Whitespace::Space),
2547            Token::Tilde,
2548            Token::Whitespace(Whitespace::Space),
2549            Token::SingleQuotedString("^a".into()),
2550            Token::Comma,
2551            Token::Whitespace(Whitespace::Space),
2552            Token::make_word("col", None),
2553            Token::Whitespace(Whitespace::Space),
2554            Token::TildeAsterisk,
2555            Token::Whitespace(Whitespace::Space),
2556            Token::SingleQuotedString("^a".into()),
2557            Token::Comma,
2558            Token::Whitespace(Whitespace::Space),
2559            Token::make_word("col", None),
2560            Token::Whitespace(Whitespace::Space),
2561            Token::ExclamationMarkTilde,
2562            Token::Whitespace(Whitespace::Space),
2563            Token::SingleQuotedString("^a".into()),
2564            Token::Comma,
2565            Token::Whitespace(Whitespace::Space),
2566            Token::make_word("col", None),
2567            Token::Whitespace(Whitespace::Space),
2568            Token::ExclamationMarkTildeAsterisk,
2569            Token::Whitespace(Whitespace::Space),
2570            Token::SingleQuotedString("^a".into()),
2571        ];
2572        compare(expected, tokens);
2573    }
2574
2575    #[test]
2576    fn tokenize_pg_like_match() {
2577        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
2578        let dialect = GenericDialect {};
2579        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2580        let expected = vec![
2581            Token::make_keyword("SELECT"),
2582            Token::Whitespace(Whitespace::Space),
2583            Token::make_word("col", None),
2584            Token::Whitespace(Whitespace::Space),
2585            Token::DoubleTilde,
2586            Token::Whitespace(Whitespace::Space),
2587            Token::SingleQuotedString("_a%".into()),
2588            Token::Comma,
2589            Token::Whitespace(Whitespace::Space),
2590            Token::make_word("col", None),
2591            Token::Whitespace(Whitespace::Space),
2592            Token::DoubleTildeAsterisk,
2593            Token::Whitespace(Whitespace::Space),
2594            Token::SingleQuotedString("_a%".into()),
2595            Token::Comma,
2596            Token::Whitespace(Whitespace::Space),
2597            Token::make_word("col", None),
2598            Token::Whitespace(Whitespace::Space),
2599            Token::ExclamationMarkDoubleTilde,
2600            Token::Whitespace(Whitespace::Space),
2601            Token::SingleQuotedString("_a%".into()),
2602            Token::Comma,
2603            Token::Whitespace(Whitespace::Space),
2604            Token::make_word("col", None),
2605            Token::Whitespace(Whitespace::Space),
2606            Token::ExclamationMarkDoubleTildeAsterisk,
2607            Token::Whitespace(Whitespace::Space),
2608            Token::SingleQuotedString("_a%".into()),
2609        ];
2610        compare(expected, tokens);
2611    }
2612
2613    #[test]
2614    fn tokenize_quoted_identifier() {
2615        let sql = r#" "a "" b" "a """ "c """"" "#;
2616        let dialect = GenericDialect {};
2617        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2618        let expected = vec![
2619            Token::Whitespace(Whitespace::Space),
2620            Token::make_word(r#"a " b"#, Some('"')),
2621            Token::Whitespace(Whitespace::Space),
2622            Token::make_word(r#"a ""#, Some('"')),
2623            Token::Whitespace(Whitespace::Space),
2624            Token::make_word(r#"c """#, Some('"')),
2625            Token::Whitespace(Whitespace::Space),
2626        ];
2627        compare(expected, tokens);
2628    }
2629
2630    #[test]
2631    fn tokenize_snowflake_div() {
2632        let sql = r#"field/1000"#;
2633        let dialect = SnowflakeDialect {};
2634        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2635        let expected = vec![
2636            Token::make_word(r#"field"#, None),
2637            Token::Div,
2638            Token::Number("1000".to_string(), false),
2639        ];
2640        compare(expected, tokens);
2641    }
2642
2643    #[test]
2644    fn tokenize_quoted_identifier_with_no_escape() {
2645        let sql = r#" "a "" b" "a """ "c """"" "#;
2646        let dialect = GenericDialect {};
2647        let tokens = Tokenizer::new(&dialect, sql)
2648            .with_unescape(false)
2649            .tokenize()
2650            .unwrap();
2651        let expected = vec![
2652            Token::Whitespace(Whitespace::Space),
2653            Token::make_word(r#"a "" b"#, Some('"')),
2654            Token::Whitespace(Whitespace::Space),
2655            Token::make_word(r#"a """#, Some('"')),
2656            Token::Whitespace(Whitespace::Space),
2657            Token::make_word(r#"c """""#, Some('"')),
2658            Token::Whitespace(Whitespace::Space),
2659        ];
2660        compare(expected, tokens);
2661    }
2662
2663    #[test]
2664    fn tokenize_with_location() {
2665        let sql = "SELECT a,\n b";
2666        let dialect = GenericDialect {};
2667        let tokens = Tokenizer::new(&dialect, sql)
2668            .tokenize_with_location()
2669            .unwrap();
2670        let expected = vec![
2671            TokenWithLocation::new(Token::make_keyword("SELECT"), 1, 1),
2672            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 1, 7),
2673            TokenWithLocation::new(Token::make_word("a", None), 1, 8),
2674            TokenWithLocation::new(Token::Comma, 1, 9),
2675            TokenWithLocation::new(Token::Whitespace(Whitespace::Newline), 1, 10),
2676            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 2, 1),
2677            TokenWithLocation::new(Token::make_word("b", None), 2, 2),
2678        ];
2679        compare(expected, tokens);
2680    }
2681
2682    fn compare<T: PartialEq + std::fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
2683        //println!("------------------------------");
2684        //println!("tokens   = {:?}", actual);
2685        //println!("expected = {:?}", expected);
2686        //println!("------------------------------");
2687        assert_eq!(expected, actual);
2688    }
2689
2690    fn check_unescape(s: &str, expected: Option<&str>) {
2691        let s = format!("'{}'", s);
2692        let mut state = State {
2693            peekable: s.chars().peekable(),
2694            line: 0,
2695            col: 0,
2696        };
2697
2698        assert_eq!(
2699            unescape_single_quoted_string(&mut state),
2700            expected.map(|s| s.to_string())
2701        );
2702    }
2703
    /// Exercises `unescape_single_quoted_string` over every escape family it
    /// handles: single-char C-style escapes, `\u`/`\U` Unicode code points,
    /// `\x` hexadecimal bytes, and octal byte values. An expectation of `None`
    /// means the unescaper rejects the whole string.
    #[test]
    fn test_unescape() {
        // Single-character escapes; an unrecognized escape like `\/` or a bare
        // `/` passes the character through unchanged.
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

        // 16 and 32-bit hexadecimal Unicode character value
        // NOTE(review): `\u0000`, `\U00110000` and `\U00000000` are rejected —
        // presumably NUL and values beyond U+10FFFF are disallowed; confirm in
        // the unescaper implementation.
        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None);
        check_unescape(r"\u0000", None);
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
        check_unescape(r"\U00110000", None);
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

        // hexadecimal byte value
        // A lone `\x` (no hex digit following) falls back to the literal `x`.
        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}"));
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x"));
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

        // octal byte value
        // At most three octal digits are consumed; a fourth digit is literal.
        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}"));
        check_unescape(r"\1232", Some("\u{0053}2"));
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None);
        check_unescape(r"\080", None);

        // others
        check_unescape(r"\9", Some("9"));
        check_unescape(r"''", Some("'"));
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }
2759
2760    #[test]
2761    fn tokenize_numeric_prefix_trait() {
2762        #[derive(Debug)]
2763        struct NumericPrefixDialect;
2764
2765        impl Dialect for NumericPrefixDialect {
2766            fn is_identifier_start(&self, ch: char) -> bool {
2767                ch.is_ascii_lowercase()
2768                    || ch.is_ascii_uppercase()
2769                    || ch.is_ascii_digit()
2770                    || ch == '$'
2771            }
2772
2773            fn is_identifier_part(&self, ch: char) -> bool {
2774                ch.is_ascii_lowercase()
2775                    || ch.is_ascii_uppercase()
2776                    || ch.is_ascii_digit()
2777                    || ch == '_'
2778                    || ch == '$'
2779                    || ch == '{'
2780                    || ch == '}'
2781            }
2782
2783            fn supports_numeric_prefix(&self) -> bool {
2784                true
2785            }
2786        }
2787
2788        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
2789        tokenize_numeric_prefix_inner(&HiveDialect {});
2790        tokenize_numeric_prefix_inner(&MySqlDialect {});
2791    }
2792
2793    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
2794        let sql = r#"SELECT * FROM 1"#;
2795        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
2796        let expected = vec![
2797            Token::make_keyword("SELECT"),
2798            Token::Whitespace(Whitespace::Space),
2799            Token::Mul,
2800            Token::Whitespace(Whitespace::Space),
2801            Token::make_keyword("FROM"),
2802            Token::Whitespace(Whitespace::Space),
2803            Token::Number(String::from("1"), false),
2804        ];
2805        compare(expected, tokens);
2806    }
2807
    /// Backslash escapes in single-quoted strings under the Snowflake dialect:
    /// each case is checked twice — raw (`with_unescape(false)`) and unescaped
    /// (`with_unescape(true)`). Also covers a trailing `\'` (which leaves the
    /// string unterminated) and a dialect that does not process backslashes.
    #[test]
    fn tokenize_quoted_string_escape() {
        let dialect = SnowflakeDialect {};
        // (input SQL, expected raw token value, expected unescaped value)
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
            (r#"'\\'"#, r#"\\"#, r#"\"#),
            (
                r#"'\0\a\b\f\n\r\t\Z'"#,
                r#"\0\a\b\f\n\r\t\Z"#,
                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
            ),
            (r#"'\"'"#, r#"\""#, "\""),
            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
        ] {
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected.to_string())];
            compare(expected, tokens);

            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

        // `\'` escapes the closing quote, so these literals never terminate.
        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }

        // Non-escape dialect
        // Here `\` is an ordinary character, so the same inputs tokenize fine.
        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
            let dialect = GenericDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }
    }
2858
    /// BigQuery triple-quoted strings (`'''...'''` / `"""..."""`): the same
    /// matrix of cases is run for both quote characters via `check`, with the
    /// *other* quote character used as benign content. Also verifies that
    /// dialects without triple-quote support see `''''''` as two-then-closed
    /// single-quoted content.
    #[test]
    fn tokenize_triple_quoted_string() {
        fn check<F>(
            q: char, // The quote character to test
            r: char, // An alternate quote character.
            quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            // (input SQL, expected raw value, expected unescaped value)
            for (sql, expected, expected_unescaped) in [
                // Empty string
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                // Should not count escaped quote as end of string.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                // Simple string
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                // Mix single-double quotes unescaped.
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                // Escaped quote.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                // backslash-escaped quote characters.
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                // backslash-escaped characters
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            // All of these open a triple quote but never close it (escaped
            // final quote, too few closing quotes, or bare EOF).
            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        // Adjacent empty strings of different quote kinds must not be
        // mistaken for the start of a triple-quoted string.
        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Non-triple quoted string dialect
        let dialect = SnowflakeDialect {};
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }
2978}