// sqlparser/src/tokenizer.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! SQL Tokenizer
19//!
20//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
21//!
22//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
23
24#[cfg(not(feature = "std"))]
25use alloc::{
26    borrow::ToOwned,
27    format,
28    string::{String, ToString},
29    vec,
30    vec::Vec,
31};
32use core::num::NonZeroU8;
33use core::str::Chars;
34use core::{cmp, fmt};
35use core::{iter::Peekable, str};
36
37#[cfg(feature = "serde")]
38use serde::{Deserialize, Serialize};
39
40#[cfg(feature = "visitor")]
41use sqlparser_derive::{Visit, VisitMut};
42
43use crate::dialect::Dialect;
44use crate::dialect::{
45    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
46    SnowflakeDialect,
47};
48use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
49use crate::{
50    ast::{DollarQuotedString, QuoteDelimitedString},
51    dialect::HiveDialect,
52};
53
/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the `bool` flag is `true` when the
    /// literal carried a trailing `L` (long) suffix
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted strings: Example '''abc'''
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedString(String),
    /// Triple double quoted strings: Example """abc"""
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string' (note that some backends, such as
    /// PostgreSQL, may treat this syntax as a bit string literal instead, i.e: b'10010101')
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedByteStringLiteral(String),
    /// Single quoted literal with raw string prefix. Example `R'abc'`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    SingleQuotedRawStringLiteral(String),
    /// Double quoted literal with raw string prefix. Example `R"abc"`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedRawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// Quote delimited literal. Examples `Q'{ab'c}'`, `Q'|ab'c|'`, `Q'|ab|c|'`
    /// [Oracle](https://docs.oracle.com/en/database/oracle/oracle-database/21/sqlrf/Literals.html#GUID-1824CBAA-6E16-4921-B2A6-112FB02248DA)
    QuoteDelimitedStringLiteral(QuoteDelimitedString),
    /// "National" quote delimited literal. Examples `NQ'{ab'c}'`, `NQ'|ab'c|'`, `NQ'|ab|c|'`
    /// [Oracle](https://docs.oracle.com/en/database/oracle/oracle-database/21/sqlrf/Literals.html#GUID-1824CBAA-6E16-4921-B2A6-112FB02248DA)
    NationalQuoteDelimitedStringLiteral(QuoteDelimitedString),
    /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
    EscapedStringLiteral(String),
    /// Unicode string literal: i.e: U&'first \000A second'
    UnicodeStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Spaceship operator `<=>`
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Integer division operator `//` in DuckDB
    DuckIntDiv,
    /// Modulo Operator `%`
    Mod,
    /// String concatenation `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Assignment `:=` (used for keyword argument in DuckDB macros and some functions, and for variable declarations in DuckDB and Snowflake)
    Assignment,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
    /// Sharp `#` used for PostgreSQL Bitwise XOR operator, also PostgreSQL/Redshift geometrical unary/binary operator (Number of points in path or polygon/Intersection)
    Sharp,
    /// `##` PostgreSQL/Redshift geometrical binary operator (Point of closest proximity)
    DoubleSharp,
    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
    Tilde,
    /// `~*` , a case insensitive match regular expression operator in PostgreSQL
    TildeAsterisk,
    /// `!~` , a case sensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*` , a case insensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `~~`, a case sensitive match pattern operator in PostgreSQL
    DoubleTilde,
    /// `~~*`, a case insensitive match pattern operator in PostgreSQL
    DoubleTildeAsterisk,
    /// `!~~`, a case sensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTilde,
    /// `!~~*`, a case insensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTildeAsterisk,
    /// `<<`, a bitwise shift left operator in PostgreSQL
    ShiftLeft,
    /// `>>`, a bitwise shift right operator in PostgreSQL
    ShiftRight,
    /// `&&`, an overlap operator in PostgreSQL
    Overlap,
    /// Exclamation Mark `!` used for PostgreSQL factorial operator
    ExclamationMark,
    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// AtSign `@` used for PostgreSQL abs operator, also PostgreSQL/Redshift geometrical unary/binary operator (Center, Contained or on)
    AtSign,
    /// `^@`, a "starts with" string operator in PostgreSQL
    CaretAt,
    /// `|/`, a square root math operator in PostgreSQL
    PGSquareRoot,
    /// `||/`, a cube root math operator in PostgreSQL
    PGCubeRoot,
    /// `?` or `$` , a prepared statement arg placeholder
    Placeholder(String),
    /// `->`, used as a operator to extract json field in PostgreSQL
    Arrow,
    /// `->>`, used as a operator to extract json field as text in PostgreSQL
    LongArrow,
    /// `#>`, extracts JSON sub-object at the specified path
    HashArrow,
    /// `@-@` PostgreSQL/Redshift geometrical unary operator (Length or circumference)
    AtDashAt,
    /// `?-` PostgreSQL/Redshift geometrical unary/binary operator (Is horizontal?/Are horizontally aligned?)
    QuestionMarkDash,
    /// `&<` PostgreSQL/Redshift geometrical binary operator (Overlaps to left?)
    AmpersandLeftAngleBracket,
    /// `&>` PostgreSQL/Redshift geometrical binary operator (Overlaps to right?)
    AmpersandRightAngleBracket,
    /// `&<|` PostgreSQL/Redshift geometrical binary operator (Does not extend above?)
    AmpersandLeftAngleBracketVerticalBar,
    /// `|&>` PostgreSQL/Redshift geometrical binary operator (Does not extend below?)
    VerticalBarAmpersandRightAngleBracket,
    /// `<->` PostgreSQL/Redshift geometrical binary operator (Distance between)
    TwoWayArrow,
    /// `<^` PostgreSQL/Redshift geometrical binary operator (Is below?)
    LeftAngleBracketCaret,
    /// `>^` PostgreSQL/Redshift geometrical binary operator (Is above?)
    RightAngleBracketCaret,
    /// `?#` PostgreSQL/Redshift geometrical binary operator (Intersects or overlaps)
    QuestionMarkSharp,
    /// `?-|` PostgreSQL/Redshift geometrical binary operator (Is perpendicular?)
    QuestionMarkDashVerticalBar,
    /// `?||` PostgreSQL/Redshift geometrical binary operator (Are parallel?)
    QuestionMarkDoubleVerticalBar,
    /// `~=` PostgreSQL/Redshift geometrical binary operator (Same as)
    TildeEqual,
    /// `<<|` PostgreSQL/Redshift geometrical binary operator (Is strictly below?)
    ShiftLeftVerticalBar,
    /// `|>>` PostgreSQL/Redshift geometrical binary operator (Is strictly above?)
    VerticalBarShiftRight,
    /// `|>` BigQuery pipe operator
    VerticalBarRightAngleBracket,
    /// `#>>`, extracts JSON sub-object at the specified path as text
    HashLongArrow,
    /// jsonb @> jsonb -> boolean: Test whether left json contains the right json
    AtArrow,
    /// jsonb <@ jsonb -> boolean: Test whether right json contains the left json
    ArrowAt,
    /// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified
    /// path, where path elements can be either field keys or array indexes.
    HashMinus,
    /// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified
    /// JSON value?
    AtQuestion,
    /// jsonb @@ jsonpath → boolean: Returns the result of a JSON path predicate check
    /// for the specified JSON value. Only the first item of the result is taken into
    /// account. If the result is not Boolean, then NULL is returned.
    AtAt,
    /// jsonb ? text -> boolean: Checks whether the string exists as a top-level key within the
    /// jsonb object
    Question,
    /// jsonb ?& text[] -> boolean: Check whether all members of the text array exist as top-level
    /// keys within the jsonb object
    QuestionAnd,
    /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level
    /// keys within the jsonb object
    QuestionPipe,
    /// Custom binary operator
    /// This is used to represent any custom binary operator that is not part of the SQL standard.
    /// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
    CustomBinaryOperator(String),
}
290
291impl fmt::Display for Token {
292    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
293        match self {
294            Token::EOF => f.write_str("EOF"),
295            Token::Word(ref w) => write!(f, "{w}"),
296            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
297            Token::Char(ref c) => write!(f, "{c}"),
298            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
299            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
300            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
301            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
302            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
303            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
304            Token::QuoteDelimitedStringLiteral(ref s) => s.fmt(f),
305            Token::NationalQuoteDelimitedStringLiteral(ref s) => write!(f, "N{s}"),
306            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
307            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
308            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
309            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
310            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
311            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
312            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
313            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
314            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
315            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
316            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
317            Token::Comma => f.write_str(","),
318            Token::Whitespace(ws) => write!(f, "{ws}"),
319            Token::DoubleEq => f.write_str("=="),
320            Token::Spaceship => f.write_str("<=>"),
321            Token::Eq => f.write_str("="),
322            Token::Neq => f.write_str("<>"),
323            Token::Lt => f.write_str("<"),
324            Token::Gt => f.write_str(">"),
325            Token::LtEq => f.write_str("<="),
326            Token::GtEq => f.write_str(">="),
327            Token::Plus => f.write_str("+"),
328            Token::Minus => f.write_str("-"),
329            Token::Mul => f.write_str("*"),
330            Token::Div => f.write_str("/"),
331            Token::DuckIntDiv => f.write_str("//"),
332            Token::StringConcat => f.write_str("||"),
333            Token::Mod => f.write_str("%"),
334            Token::LParen => f.write_str("("),
335            Token::RParen => f.write_str(")"),
336            Token::Period => f.write_str("."),
337            Token::Colon => f.write_str(":"),
338            Token::DoubleColon => f.write_str("::"),
339            Token::Assignment => f.write_str(":="),
340            Token::SemiColon => f.write_str(";"),
341            Token::Backslash => f.write_str("\\"),
342            Token::LBracket => f.write_str("["),
343            Token::RBracket => f.write_str("]"),
344            Token::Ampersand => f.write_str("&"),
345            Token::Caret => f.write_str("^"),
346            Token::Pipe => f.write_str("|"),
347            Token::LBrace => f.write_str("{"),
348            Token::RBrace => f.write_str("}"),
349            Token::RArrow => f.write_str("=>"),
350            Token::Sharp => f.write_str("#"),
351            Token::DoubleSharp => f.write_str("##"),
352            Token::ExclamationMark => f.write_str("!"),
353            Token::DoubleExclamationMark => f.write_str("!!"),
354            Token::Tilde => f.write_str("~"),
355            Token::TildeAsterisk => f.write_str("~*"),
356            Token::ExclamationMarkTilde => f.write_str("!~"),
357            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
358            Token::DoubleTilde => f.write_str("~~"),
359            Token::DoubleTildeAsterisk => f.write_str("~~*"),
360            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
361            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
362            Token::AtSign => f.write_str("@"),
363            Token::CaretAt => f.write_str("^@"),
364            Token::ShiftLeft => f.write_str("<<"),
365            Token::ShiftRight => f.write_str(">>"),
366            Token::Overlap => f.write_str("&&"),
367            Token::PGSquareRoot => f.write_str("|/"),
368            Token::PGCubeRoot => f.write_str("||/"),
369            Token::AtDashAt => f.write_str("@-@"),
370            Token::QuestionMarkDash => f.write_str("?-"),
371            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
372            Token::AmpersandRightAngleBracket => f.write_str("&>"),
373            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
374            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
375            Token::VerticalBarRightAngleBracket => f.write_str("|>"),
376            Token::TwoWayArrow => f.write_str("<->"),
377            Token::LeftAngleBracketCaret => f.write_str("<^"),
378            Token::RightAngleBracketCaret => f.write_str(">^"),
379            Token::QuestionMarkSharp => f.write_str("?#"),
380            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
381            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
382            Token::TildeEqual => f.write_str("~="),
383            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
384            Token::VerticalBarShiftRight => f.write_str("|>>"),
385            Token::Placeholder(ref s) => write!(f, "{s}"),
386            Token::Arrow => write!(f, "->"),
387            Token::LongArrow => write!(f, "->>"),
388            Token::HashArrow => write!(f, "#>"),
389            Token::HashLongArrow => write!(f, "#>>"),
390            Token::AtArrow => write!(f, "@>"),
391            Token::ArrowAt => write!(f, "<@"),
392            Token::HashMinus => write!(f, "#-"),
393            Token::AtQuestion => write!(f, "@?"),
394            Token::AtAt => write!(f, "@@"),
395            Token::Question => write!(f, "?"),
396            Token::QuestionAnd => write!(f, "?&"),
397            Token::QuestionPipe => write!(f, "?|"),
398            Token::CustomBinaryOperator(s) => f.write_str(s),
399        }
400    }
401}
402
403impl Token {
404    /// Create a `Token::Word` from an unquoted `keyword`.
405    ///
406    /// The lookup is case-insensitive; unknown values become `Keyword::NoKeyword`.
407    pub fn make_keyword(keyword: &str) -> Self {
408        Token::make_word(keyword, None)
409    }
410
411    /// Create a `Token::Word` from `word` with an optional `quote_style`.
412    ///
413    /// When `quote_style` is `None`, the parser attempts a case-insensitive keyword
414    /// lookup and sets the `Word::keyword` accordingly.
415    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
416        Token::Word(Word {
417            keyword: keyword_lookup(word, quote_style),
418            value: word.to_string(),
419            quote_style,
420        })
421    }
422
423    /// Like [`Self::make_word`] but takes ownership of the word `String`,
424    /// avoiding an extra allocation when the caller already has an owned value.
425    fn make_word_owned(word: String, quote_style: Option<char>) -> Self {
426        Token::Word(Word {
427            keyword: keyword_lookup(&word, quote_style),
428            value: word,
429            quote_style,
430        })
431    }
432}
433
434/// Case-insensitive keyword lookup using binary search over [`ALL_KEYWORDS`].
435fn keyword_lookup(word: &str, quote_style: Option<char>) -> Keyword {
436    if quote_style.is_some() {
437        return Keyword::NoKeyword;
438    }
439    ALL_KEYWORDS
440        .binary_search_by(|probe| {
441            let probe = probe.as_bytes();
442            let word = word.as_bytes();
443            for (p, w) in probe.iter().zip(word.iter()) {
444                let cmp = p.cmp(&w.to_ascii_uppercase());
445                if cmp != core::cmp::Ordering::Equal {
446                    return cmp;
447                }
448            }
449            probe.len().cmp(&word.len())
450        })
451        .map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
452}
453
/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with the
    /// escape sequences (if any) processed (TODO: escapes are not handled)
    pub value: String,
    /// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this holds the corresponding `Keyword` value; otherwise it is
    /// `Keyword::NoKeyword`.
    pub keyword: Keyword,
}
470
471impl fmt::Display for Word {
472    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
473        match self.quote_style {
474            Some(s) if s == '"' || s == '[' || s == '`' => {
475                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
476            }
477            None => f.write_str(&self.value),
478            _ => panic!("Unexpected quote_style!"),
479        }
480    }
481}
482
483impl Word {
484    fn matching_end_quote(ch: char) -> char {
485        match ch {
486            '"' => '"', // ANSI and most dialects
487            '[' => ']', // MS SQL
488            '`' => '`', // MySQL
489            _ => panic!("unexpected quoting style!"),
490        }
491    }
492}
493
/// Represents whitespace in the input: spaces, newlines, tabs and comments.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    /// A single space character.
    Space,
    /// A newline character.
    Newline,
    /// A tab character.
    Tab,
    /// A single-line comment (e.g. `-- comment` or `# comment`).
    /// The `comment` field contains the text, and `prefix` contains the comment prefix;
    /// `Display` reproduces the original text by concatenating `prefix` + `comment`.
    SingleLineComment {
        /// The content of the comment (without the prefix).
        comment: String,
        /// The prefix used for the comment (for example `--` or `#`).
        prefix: String,
    },

    /// A multi-line comment body (without the `/* ... */` delimiters;
    /// `Display` re-adds them).
    MultiLineComment(String),
}
517
518impl fmt::Display for Whitespace {
519    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
520        match self {
521            Whitespace::Space => f.write_str(" "),
522            Whitespace::Newline => f.write_str("\n"),
523            Whitespace::Tab => f.write_str("\t"),
524            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
525            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
526        }
527    }
528}
529
/// Location in input string
///
/// # Create an "empty" (unknown) `Location`
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::empty();
/// ```
///
/// # Create a `Location` from a line and column
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::new(1, 1);
/// ```
///
/// # Create a `Location` from a pair
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::from((1, 1));
/// ```
#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1.
    ///
    /// Note: Line 0 is used for empty spans (see [`Location::empty`])
    pub line: u64,
    /// Line column, starting from 1.
    ///
    /// Note: Column 0 is used for empty spans (see [`Location::empty`])
    pub column: u64,
}
562
563impl fmt::Display for Location {
564    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
565        if self.line == 0 {
566            return Ok(());
567        }
568        write!(f, " at Line: {}, Column: {}", self.line, self.column)
569    }
570}
571
572impl fmt::Debug for Location {
573    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
574        write!(f, "Location({},{})", self.line, self.column)
575    }
576}
577
578impl Location {
579    /// Return an "empty" / unknown location
580    pub fn empty() -> Self {
581        Self { line: 0, column: 0 }
582    }
583
584    /// Create a new `Location` for a given line and column
585    pub fn new(line: u64, column: u64) -> Self {
586        Self { line, column }
587    }
588
589    /// Create a new location for a given line and column
590    ///
591    /// Alias for [`Self::new`]
592    // TODO: remove / deprecate in favor of` `new` for consistency?
593    pub fn of(line: u64, column: u64) -> Self {
594        Self::new(line, column)
595    }
596
597    /// Combine self and `end` into a new `Span`
598    pub fn span_to(self, end: Self) -> Span {
599        Span { start: self, end }
600    }
601}
602
603impl From<(u64, u64)> for Location {
604    fn from((line, column): (u64, u64)) -> Self {
605        Self { line, column }
606    }
607}
608
/// A span represents a linear portion of the input string (start, end)
///
/// See [Spanned](crate::ast::Spanned) for more information.
#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    /// Start `Location` (inclusive).
    pub start: Location,
    /// End `Location` (inclusive).
    pub end: Location,
}
621
622impl fmt::Debug for Span {
623    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
624        write!(f, "Span({:?}..{:?})", self.start, self.end)
625    }
626}
627
impl Span {
    // An empty span (0, 0) -> (0, 0)
    // We need a const instance for pattern matching
    const EMPTY: Span = Self::empty();

    /// Create a new span from a start and end [`Location`]
    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    /// Returns an empty span `(0, 0) -> (0, 0)`
    ///
    /// Empty spans represent no knowledge of source location
    /// See [Spanned](crate::ast::Spanned) for more information.
    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }

    /// Returns the smallest Span that contains both `self` and `other`
    /// If either span is [Span::empty], the other span is returned
    ///
    /// # Examples
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// // line 1, column1 -> line 2, column 5
    /// let span1 = Span::new(Location::new(1, 1), Location::new(2, 5));
    /// // line 2, column 3 -> line 3, column 7
    /// let span2 = Span::new(Location::new(2, 3), Location::new(3, 7));
    /// // Union of the two is the min/max of the two spans
    /// // line 1, column 1 -> line 3, column 7
    /// let union = span1.union(&span2);
    /// assert_eq!(union, Span::new(Location::new(1, 1), Location::new(3, 7)));
    /// ```
    pub fn union(&self, other: &Span) -> Span {
        // If either span is empty, return the other
        // this prevents propagating (0, 0) through the tree
        match (self, other) {
            (&Span::EMPTY, _) => *other,
            (_, &Span::EMPTY) => *self,
            _ => Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            },
        }
    }

    /// Same as [Span::union] for `Option<Span>`
    ///
    /// If `other` is `None`, `self` is returned
    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        match other {
            Some(other) => self.union(other),
            None => *self,
        }
    }

    /// Return the [Span::union] of all spans in the iterator
    ///
    /// If the iterator is empty, an empty span is returned
    ///
    /// # Example
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// let spans = vec![
    ///     Span::new(Location::new(1, 1), Location::new(2, 5)),
    ///     Span::new(Location::new(2, 3), Location::new(3, 7)),
    ///     Span::new(Location::new(3, 1), Location::new(4, 2)),
    /// ];
    /// // line 1, column 1 -> line 4, column 2
    /// assert_eq!(
    ///   Span::union_iter(spans),
    ///   Span::new(Location::new(1, 1), Location::new(4, 2))
    /// );
    /// ```
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        iter.into_iter()
            .reduce(|acc, item| acc.union(&item))
            .unwrap_or(Span::empty())
    }
}
710
/// Backwards compatibility alias for [`TokenWithSpan`]
#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;
714
/// A [Token] with [Span] attached to it
///
/// This is used to track the location of a token in the input string
///
/// # Examples
/// ```
/// # use sqlparser::tokenizer::{Location, Span, Token, TokenWithSpan};
/// // commas @ line 1, column 10
/// let tok1 = TokenWithSpan::new(
///   Token::Comma,
///   Span::new(Location::new(1, 10), Location::new(1, 11)),
/// );
/// assert_eq!(tok1, Token::Comma); // can compare the token
///
/// // commas @ line 2, column 20
/// let tok2 = TokenWithSpan::new(
///   Token::Comma,
///   Span::new(Location::new(2, 20), Location::new(2, 21)),
/// );
/// // same token but different locations are not equal
/// assert_ne!(tok1, tok2);
/// ```
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    /// The token value.
    pub token: Token,
    /// The span covering the token in the input.
    pub span: Span,
}
747
748impl TokenWithSpan {
749    /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`]
750    pub fn new(token: Token, span: Span) -> Self {
751        Self { token, span }
752    }
753
754    /// Wrap a token with an empty span
755    pub fn wrap(token: Token) -> Self {
756        Self::new(token, Span::empty())
757    }
758
759    /// Wrap a token with a location from `start` to `end`
760    pub fn at(token: Token, start: Location, end: Location) -> Self {
761        Self::new(token, Span::new(start, end))
762    }
763
764    /// Return an EOF token with no location
765    pub fn new_eof() -> Self {
766        Self::wrap(Token::EOF)
767    }
768}
769
770impl PartialEq<Token> for TokenWithSpan {
771    fn eq(&self, other: &Token) -> bool {
772        &self.token == other
773    }
774}
775
776impl PartialEq<TokenWithSpan> for Token {
777    fn eq(&self, other: &TokenWithSpan) -> bool {
778        self == &other.token
779    }
780}
781
782impl fmt::Display for TokenWithSpan {
783    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
784        self.token.fmt(f)
785    }
786}
787
/// An error reported by the tokenizer, with a human-readable `message` and a `location`.
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    /// A descriptive error message.
    pub message: String,
    /// The `Location` (line/column) where the error was detected.
    pub location: Location,
}
796
797impl fmt::Display for TokenizerError {
798    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
799        write!(f, "{}{}", self.message, self.location,)
800    }
801}
802
// Empty impl: all context lives in `message`/`location`, so the default
// `source()` (returning `None`) is sufficient.
impl core::error::Error for TokenizerError {}
804
/// Tokenizer input state: a peekable character stream plus the current
/// position in the input (line/column, both starting at 1).
struct State<'a> {
    /// Remaining characters of the query being tokenized.
    peekable: Peekable<Chars<'a>>,
    /// Current line number; incremented each time a `'\n'` is consumed.
    line: u64,
    /// Current column number; reset to 1 after a newline.
    col: u64,
}
810
811impl State<'_> {
812    /// return the next character and advance the stream
813    pub fn next(&mut self) -> Option<char> {
814        match self.peekable.next() {
815            None => None,
816            Some(s) => {
817                if s == '\n' {
818                    self.line += 1;
819                    self.col = 1;
820                } else {
821                    self.col += 1;
822                }
823                Some(s)
824            }
825        }
826    }
827
828    /// return the next character but do not advance the stream
829    pub fn peek(&mut self) -> Option<&char> {
830        self.peekable.peek()
831    }
832
833    /// Return the current `Location` (line and column)
834    pub fn location(&self) -> Location {
835        Location {
836            line: self.line,
837            column: self.col,
838        }
839    }
840}
841
/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// A single quote character on each side, e.g. `"abc"`, `'abc'`, `r'abc'`
    One,
    /// Multiple quote characters on each side, e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''`
    Many(NonZeroU8),
}
850
/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
    /// The character used to quote the string.
    quote_style: char,
    /// Represents how many quote characters enclose the string literal.
    num_quote_chars: NumStringQuoteChars,
    /// The number of opening quotes left to consume, before parsing
    /// the remaining string literal.
    /// For example: given initial string `"""abc"""`. If the caller has
    /// already parsed the first quote for some reason, then this value
    /// is set to 1, flagging to look to consume only 2 leading quotes.
    num_opening_quotes_to_consume: u8,
    /// True if the string uses backslash escaping of special characters,
    /// e.g. `'abc\ndef\'ghi'`
    backslash_escape: bool,
}
867
/// SQL Tokenizer
pub struct Tokenizer<'a> {
    /// The dialect whose rules drive tokenization decisions.
    dialect: &'a dyn Dialect,
    /// The SQL text to tokenize.
    query: &'a str,
    /// If true (the default), the tokenizer will un-escape literal
    /// SQL strings. See [`Tokenizer::with_unescape`] for more details.
    unescape: bool,
}
876
877impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement
    ///
    /// # Example
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let query = r#"SELECT 'foo'"#;
    ///
    /// // Parsing the query
    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
    ///
    /// assert_eq!(tokens, vec![
    ///   Token::make_word("SELECT", None),
    ///   Token::Whitespace(Whitespace::Space),
    ///   Token::SingleQuotedString("foo".to_string()),
    /// ]);
    /// ```
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }
901
902    /// Set unescape mode
903    ///
904    /// When true (default) the tokenizer unescapes literal values
905    /// (for example, `""` in SQL is unescaped to the literal `"`).
906    ///
907    /// When false, the tokenizer provides the raw strings as provided
908    /// in the query.  This can be helpful for programs that wish to
909    /// recover the *exact* original query text without normalizing
910    /// the escaping
911    ///
912    /// # Example
913    ///
914    /// ```
915    /// # use sqlparser::tokenizer::{Token, Tokenizer};
916    /// # use sqlparser::dialect::GenericDialect;
917    /// # let dialect = GenericDialect{};
918    /// let query = r#""Foo "" Bar""#;
919    /// let unescaped = Token::make_word(r#"Foo " Bar"#, Some('"'));
920    /// let original  = Token::make_word(r#"Foo "" Bar"#, Some('"'));
921    ///
922    /// // Parsing with unescaping (default)
923    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
924    /// assert_eq!(tokens, vec![unescaped]);
925    ///
926    /// // Parsing with unescape = false
927    /// let tokens = Tokenizer::new(&dialect, &query)
928    ///    .with_unescape(false)
929    ///    .tokenize().unwrap();
930    /// assert_eq!(tokens, vec![original]);
931    /// ```
932    pub fn with_unescape(mut self, unescape: bool) -> Self {
933        self.unescape = unescape;
934        self
935    }
936
937    /// Tokenize the statement and produce a vector of tokens
938    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
939        let twl = self.tokenize_with_location()?;
940        Ok(twl.into_iter().map(|t| t.token).collect())
941    }
942
943    /// Tokenize the statement and produce a vector of tokens with location information
944    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
945        let mut tokens: Vec<TokenWithSpan> = vec![];
946        self.tokenize_with_location_into_buf(&mut tokens)
947            .map(|_| tokens)
948    }
949
950    /// Tokenize the statement and append tokens with location information into the provided buffer.
951    /// If an error is thrown, the buffer will contain all tokens that were successfully parsed before the error.
952    pub fn tokenize_with_location_into_buf(
953        &mut self,
954        buf: &mut Vec<TokenWithSpan>,
955    ) -> Result<(), TokenizerError> {
956        self.tokenize_with_location_into_buf_with_mapper(buf, |token| token)
957    }
958
    /// Tokenize the statement and produce a vector of tokens, mapping each token
    /// with provided `mapper`
    ///
    /// Tokens are appended to `buf`; on error, `buf` contains every token
    /// successfully parsed before the failure.
    pub fn tokenize_with_location_into_buf_with_mapper(
        &mut self,
        buf: &mut Vec<TokenWithSpan>,
        mut mapper: impl FnMut(TokenWithSpan) -> TokenWithSpan,
    ) -> Result<(), TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        // `location` holds the position *before* a token is consumed;
        // after `next_token` returns, `state.location()` is just past it,
        // so the two bracket the token's span.
        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let span = location.span_to(state.location());

            // Check if this is a multiline comment hint that should be expanded
            match &token {
                Token::Whitespace(Whitespace::MultiLineComment(comment))
                    if self.dialect.supports_multiline_comment_hints()
                        && comment.starts_with('!') =>
                {
                    // Re-tokenize the hints and add them to the buffer
                    // (e.g. comments of the form `/*!50110 ... */`).
                    self.tokenize_comment_hints(comment, span, buf, &mut mapper)?;
                }
                _ => {
                    buf.push(mapper(TokenWithSpan { token, span }));
                }
            }

            location = state.location();
        }
        Ok(())
    }
994
    /// Re-tokenize optimizer hints from a multiline comment and add them to the buffer.
    /// For example, `/*!50110 KEY_BLOCK_SIZE = 1024*/` becomes tokens for `KEY_BLOCK_SIZE = 1024`
    fn tokenize_comment_hints(
        &self,
        comment: &str,
        span: Span,
        buf: &mut Vec<TokenWithSpan>,
        mut mapper: impl FnMut(TokenWithSpan) -> TokenWithSpan,
    ) -> Result<(), TokenizerError> {
        // Strip the leading '!' and any version digits (e.g., "50110")
        let hint_content = comment
            .strip_prefix('!')
            .unwrap_or(comment)
            .trim_start_matches(|c: char| c.is_ascii_digit());

        // If there's no content after stripping, nothing to tokenize
        if hint_content.is_empty() {
            return Ok(());
        }

        // Create a new tokenizer for the hint content, inheriting this
        // tokenizer's dialect and unescape setting
        let inner = Tokenizer::new(self.dialect, hint_content).with_unescape(self.unescape);

        // Create a state for tracking position within the hint.
        // NOTE(review): positions start at the comment's span start, not at
        // the hint text itself (the `/*!NNNNN` prefix width is not added),
        // so inner spans may be slightly offset — confirm this is intended.
        let mut state = State {
            peekable: hint_content.chars().peekable(),
            line: span.start.line,
            col: span.start.column,
        };

        // Tokenize the hint content and add tokens to the buffer
        let mut location = state.location();
        while let Some(token) = inner.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let token_span = location.span_to(state.location());
            buf.push(mapper(TokenWithSpan {
                token,
                span: token_span,
            }));
            location = state.location();
        }

        Ok(())
    }
1038
    // Tokenize the identifier or keywords in `ch`
    //
    // `ch` supplies the already-peeked leading character(s); the first
    // character is consumed from `chars` here, then the remainder of the
    // word is read from the stream.
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // TODO: implement parsing of exponent here
        // If the "word" consists solely of digits and dots it is actually a
        // number: re-read it (plus any following digits/dots) as one token.
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            // Throwaway state used only to iterate `word`; line/col are unused.
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word_owned(word, None)))
    }
1064
1065    /// Get the next token or return None
1066    fn next_token(
1067        &self,
1068        chars: &mut State,
1069        prev_token: Option<&Token>,
1070    ) -> Result<Option<Token>, TokenizerError> {
1071        match chars.peek() {
1072            Some(&ch) => match ch {
1073                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
1074                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
1075                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
1076                '\r' => {
1077                    // Emit a single Whitespace::Newline token for \r and \r\n
1078                    chars.next();
1079                    if let Some('\n') = chars.peek() {
1080                        chars.next();
1081                    }
1082                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
1083                }
1084                // BigQuery and MySQL use b or B for byte string literal, Postgres for bit strings
1085                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
1086                {
1087                    chars.next(); // consume
1088                    match chars.peek() {
1089                        Some('\'') => {
1090                            if self.dialect.supports_triple_quoted_string() {
1091                                return self
1092                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1093                                        chars,
1094                                        '\'',
1095                                        false,
1096                                        Token::SingleQuotedByteStringLiteral,
1097                                        Token::TripleSingleQuotedByteStringLiteral,
1098                                    );
1099                            }
1100                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
1101                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
1102                        }
1103                        Some('\"') => {
1104                            if self.dialect.supports_triple_quoted_string() {
1105                                return self
1106                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1107                                        chars,
1108                                        '"',
1109                                        false,
1110                                        Token::DoubleQuotedByteStringLiteral,
1111                                        Token::TripleDoubleQuotedByteStringLiteral,
1112                                    );
1113                            }
1114                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
1115                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
1116                        }
1117                        _ => {
1118                            // regular identifier starting with an "b" or "B"
1119                            let s = self.tokenize_word(b, chars);
1120                            Ok(Some(Token::make_word_owned(s, None)))
1121                        }
1122                    }
1123                }
1124                // BigQuery uses r or R for raw string literal
1125                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
1126                    chars.next(); // consume
1127                    match chars.peek() {
1128                        Some('\'') => self
1129                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1130                                chars,
1131                                '\'',
1132                                false,
1133                                Token::SingleQuotedRawStringLiteral,
1134                                Token::TripleSingleQuotedRawStringLiteral,
1135                            ),
1136                        Some('\"') => self
1137                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1138                                chars,
1139                                '"',
1140                                false,
1141                                Token::DoubleQuotedRawStringLiteral,
1142                                Token::TripleDoubleQuotedRawStringLiteral,
1143                            ),
1144                        _ => {
1145                            // regular identifier starting with an "r" or "R"
1146                            let s = self.tokenize_word(b, chars);
1147                            Ok(Some(Token::make_word_owned(s, None)))
1148                        }
1149                    }
1150                }
1151                // Redshift uses lower case n for national string literal
1152                n @ 'N' | n @ 'n' => {
1153                    chars.next(); // consume, to check the next char
1154                    match chars.peek() {
1155                        Some('\'') => {
1156                            // N'...' - a <national character string literal>
1157                            let backslash_escape =
1158                                self.dialect.supports_string_literal_backslash_escape();
1159                            let s =
1160                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
1161                            Ok(Some(Token::NationalStringLiteral(s)))
1162                        }
1163                        Some(&q @ 'q') | Some(&q @ 'Q')
1164                            if self.dialect.supports_quote_delimited_string() =>
1165                        {
1166                            chars.next(); // consume and check the next char
1167                            if let Some('\'') = chars.peek() {
1168                                self.tokenize_quote_delimited_string(chars, &[n, q])
1169                                    .map(|s| Some(Token::NationalQuoteDelimitedStringLiteral(s)))
1170                            } else {
1171                                let s = self.tokenize_word(String::from_iter([n, q]), chars);
1172                                Ok(Some(Token::make_word_owned(s, None)))
1173                            }
1174                        }
1175                        _ => {
1176                            // regular identifier starting with an "N"
1177                            let s = self.tokenize_word(n, chars);
1178                            Ok(Some(Token::make_word_owned(s, None)))
1179                        }
1180                    }
1181                }
1182                q @ 'Q' | q @ 'q' if self.dialect.supports_quote_delimited_string() => {
1183                    chars.next(); // consume and check the next char
1184                    if let Some('\'') = chars.peek() {
1185                        self.tokenize_quote_delimited_string(chars, &[q])
1186                            .map(|s| Some(Token::QuoteDelimitedStringLiteral(s)))
1187                    } else {
1188                        let s = self.tokenize_word(q, chars);
1189                        Ok(Some(Token::make_word_owned(s, None)))
1190                    }
1191                }
1192                // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
1193                x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
1194                    let starting_loc = chars.location();
1195                    chars.next(); // consume, to check the next char
1196                    match chars.peek() {
1197                        Some('\'') => {
1198                            let s =
1199                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
1200                            Ok(Some(Token::EscapedStringLiteral(s)))
1201                        }
1202                        _ => {
1203                            // regular identifier starting with an "E" or "e"
1204                            let s = self.tokenize_word(x, chars);
1205                            Ok(Some(Token::make_word_owned(s, None)))
1206                        }
1207                    }
1208                }
1209                // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
1210                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
1211                    chars.next(); // consume, to check the next char
1212                    if chars.peek() == Some(&'&') {
1213                        // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
1214                        let mut chars_clone = chars.peekable.clone();
1215                        chars_clone.next(); // consume the '&' in the clone
1216                        if chars_clone.peek() == Some(&'\'') {
1217                            chars.next(); // consume the '&' in the original iterator
1218                            let s = unescape_unicode_single_quoted_string(chars)?;
1219                            return Ok(Some(Token::UnicodeStringLiteral(s)));
1220                        }
1221                    }
1222                    // regular identifier starting with an "U" or "u"
1223                    let s = self.tokenize_word(x, chars);
1224                    Ok(Some(Token::make_word_owned(s, None)))
1225                }
1226                // The spec only allows an uppercase 'X' to introduce a hex
1227                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
1228                x @ 'x' | x @ 'X' => {
1229                    chars.next(); // consume, to check the next char
1230                    match chars.peek() {
1231                        Some('\'') => {
1232                            // X'...' - a <binary string literal>
1233                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
1234                            Ok(Some(Token::HexStringLiteral(s)))
1235                        }
1236                        _ => {
1237                            // regular identifier starting with an "X"
1238                            let s = self.tokenize_word(x, chars);
1239                            Ok(Some(Token::make_word_owned(s, None)))
1240                        }
1241                    }
1242                }
1243                // single quoted string
1244                '\'' => {
1245                    if self.dialect.supports_triple_quoted_string() {
1246                        return self
1247                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1248                                chars,
1249                                '\'',
1250                                self.dialect.supports_string_literal_backslash_escape(),
1251                                Token::SingleQuotedString,
1252                                Token::TripleSingleQuotedString,
1253                            );
1254                    }
1255                    let s = self.tokenize_single_quoted_string(
1256                        chars,
1257                        '\'',
1258                        self.dialect.supports_string_literal_backslash_escape(),
1259                    )?;
1260
1261                    Ok(Some(Token::SingleQuotedString(s)))
1262                }
1263                // double quoted string
1264                '\"' if !self.dialect.is_delimited_identifier_start(ch)
1265                    && !self.dialect.is_identifier_start(ch) =>
1266                {
1267                    if self.dialect.supports_triple_quoted_string() {
1268                        return self
1269                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1270                                chars,
1271                                '"',
1272                                self.dialect.supports_string_literal_backslash_escape(),
1273                                Token::DoubleQuotedString,
1274                                Token::TripleDoubleQuotedString,
1275                            );
1276                    }
1277                    let s = self.tokenize_single_quoted_string(
1278                        chars,
1279                        '"',
1280                        self.dialect.supports_string_literal_backslash_escape(),
1281                    )?;
1282
1283                    Ok(Some(Token::DoubleQuotedString(s)))
1284                }
1285                // delimited (quoted) identifier
1286                quote_start if self.dialect.is_delimited_identifier_start(ch) => {
1287                    let word = self.tokenize_quoted_identifier(quote_start, chars)?;
1288                    Ok(Some(Token::make_word_owned(word, Some(quote_start))))
1289                }
1290                // Potentially nested delimited (quoted) identifier
1291                quote_start
1292                    if self
1293                        .dialect
1294                        .is_nested_delimited_identifier_start(quote_start)
1295                        && self
1296                            .dialect
1297                            .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
1298                            .is_some() =>
1299                {
1300                    let Some((quote_start, nested_quote_start)) = self
1301                        .dialect
1302                        .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
1303                    else {
1304                        return self.tokenizer_error(
1305                            chars.location(),
1306                            format!("Expected nested delimiter '{quote_start}' before EOF."),
1307                        );
1308                    };
1309
1310                    let Some(nested_quote_start) = nested_quote_start else {
1311                        let word = self.tokenize_quoted_identifier(quote_start, chars)?;
1312                        return Ok(Some(Token::make_word_owned(word, Some(quote_start))));
1313                    };
1314
1315                    let mut word = vec![];
1316                    let quote_end = Word::matching_end_quote(quote_start);
1317                    let nested_quote_end = Word::matching_end_quote(nested_quote_start);
1318                    let error_loc = chars.location();
1319
1320                    chars.next(); // skip the first delimiter
1321                    peeking_take_while(chars, |ch| ch.is_whitespace());
1322                    if chars.peek() != Some(&nested_quote_start) {
1323                        return self.tokenizer_error(
1324                            error_loc,
1325                            format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
1326                        );
1327                    }
1328                    word.push(nested_quote_start.into());
1329                    word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
1330                    word.push(nested_quote_end.into());
1331                    peeking_take_while(chars, |ch| ch.is_whitespace());
1332                    if chars.peek() != Some(&quote_end) {
1333                        return self.tokenizer_error(
1334                            error_loc,
1335                            format!("Expected close delimiter '{quote_end}' before EOF."),
1336                        );
1337                    }
1338                    chars.next(); // skip close delimiter
1339
1340                    Ok(Some(Token::make_word_owned(
1341                        word.concat(),
1342                        Some(quote_start),
1343                    )))
1344                }
1345                // numbers and period
1346                '0'..='9' | '.' => {
1347                    // special case where if ._ is encountered after a word then that word
1348                    // is a table and the _ is the start of the col name.
1349                    // if the prev token is not a word, then this is not a valid sql
1350                    // word or number.
1351                    if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
1352                        if let Some(Token::Word(_)) = prev_token {
1353                            chars.next();
1354                            return Ok(Some(Token::Period));
1355                        }
1356
1357                        return self.tokenizer_error(
1358                            chars.location(),
1359                            "Unexpected character '_'".to_string(),
1360                        );
1361                    }
1362
1363                    // Some dialects support underscore as number separator
1364                    // There can only be one at a time and it must be followed by another digit
1365                    let is_number_separator = |ch: char, next_char: Option<char>| {
1366                        self.dialect.supports_numeric_literal_underscores()
1367                            && ch == '_'
1368                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
1369                    };
1370
1371                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
1372                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1373                    });
1374
1375                    // match binary literal that starts with 0x
1376                    if s == "0" && chars.peek() == Some(&'x') {
1377                        chars.next();
1378                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
1379                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
1380                        });
1381                        return Ok(Some(Token::HexStringLiteral(s2)));
1382                    }
1383
1384                    // match one period
1385                    if let Some('.') = chars.peek() {
1386                        s.push('.');
1387                        chars.next();
1388                    }
1389
1390                    // If the dialect supports identifiers that start with a numeric prefix
1391                    // and we have now consumed a dot, check if the previous token was a Word.
1392                    // If so, what follows is definitely not part of a decimal number and
1393                    // we should yield the dot as a dedicated token so compound identifiers
1394                    // starting with digits can be parsed correctly.
1395                    if s == "." && self.dialect.supports_numeric_prefix() {
1396                        if let Some(Token::Word(_)) = prev_token {
1397                            return Ok(Some(Token::Period));
1398                        }
1399                    }
1400
1401                    // Consume fractional digits.
1402                    s += &peeking_next_take_while(chars, |ch, next_ch| {
1403                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1404                    });
1405
1406                    // No fraction -> Token::Period
1407                    if s == "." {
1408                        return Ok(Some(Token::Period));
1409                    }
1410
1411                    // Parse exponent as number
1412                    let mut exponent_part = String::new();
1413                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
1414                        let mut char_clone = chars.peekable.clone();
1415                        exponent_part.push(char_clone.next().unwrap());
1416
1417                        // Optional sign
1418                        match char_clone.peek() {
1419                            Some(&c) if matches!(c, '+' | '-') => {
1420                                exponent_part.push(c);
1421                                char_clone.next();
1422                            }
1423                            _ => (),
1424                        }
1425
1426                        match char_clone.peek() {
1427                            // Definitely an exponent, get original iterator up to speed and use it
1428                            Some(&c) if c.is_ascii_digit() => {
1429                                for _ in 0..exponent_part.len() {
1430                                    chars.next();
1431                                }
1432                                exponent_part +=
1433                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
1434                                s += exponent_part.as_str();
1435                            }
1436                            // Not an exponent, discard the work done
1437                            _ => (),
1438                        }
1439                    }
1440
1441                    // If the dialect supports identifiers that start with a numeric prefix,
1442                    // we need to check if the value is in fact an identifier and must thus
1443                    // be tokenized as a word.
1444                    if self.dialect.supports_numeric_prefix() {
1445                        if exponent_part.is_empty() {
1446                            // If it is not a number with an exponent, it may be
1447                            // an identifier starting with digits.
1448                            let word =
1449                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
1450
1451                            if !word.is_empty() {
1452                                s += word.as_str();
1453                                return Ok(Some(Token::make_word_owned(s, None)));
1454                            }
1455                        } else if prev_token == Some(&Token::Period) {
1456                            // If the previous token was a period, thus not belonging to a number,
1457                            // the value we have is part of an identifier.
1458                            return Ok(Some(Token::make_word_owned(s, None)));
1459                        }
1460                    }
1461
1462                    let long = if chars.peek() == Some(&'L') {
1463                        chars.next();
1464                        true
1465                    } else {
1466                        false
1467                    };
1468                    Ok(Some(Token::Number(s, long)))
1469                }
1470                // punctuation
1471                '(' => self.consume_and_return(chars, Token::LParen),
1472                ')' => self.consume_and_return(chars, Token::RParen),
1473                ',' => self.consume_and_return(chars, Token::Comma),
1474                // operators
1475                '-' => {
1476                    chars.next(); // consume the '-'
1477
1478                    match chars.peek() {
1479                        Some('-') => {
1480                            let mut is_comment = true;
1481                            if self.dialect.requires_single_line_comment_whitespace() {
1482                                is_comment = chars
1483                                    .peekable
1484                                    .clone()
1485                                    .nth(1)
1486                                    .is_some_and(char::is_whitespace);
1487                            }
1488
1489                            if is_comment {
1490                                chars.next(); // consume second '-'
1491                                let comment = self.tokenize_single_line_comment(chars);
1492                                return Ok(Some(Token::Whitespace(
1493                                    Whitespace::SingleLineComment {
1494                                        prefix: "--".to_owned(),
1495                                        comment,
1496                                    },
1497                                )));
1498                            }
1499
1500                            self.start_binop(chars, "-", Token::Minus)
1501                        }
1502                        Some('>') => {
1503                            chars.next();
1504                            match chars.peek() {
1505                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
1506                                _ => self.start_binop(chars, "->", Token::Arrow),
1507                            }
1508                        }
1509                        // a regular '-' operator
1510                        _ => self.start_binop(chars, "-", Token::Minus),
1511                    }
1512                }
1513                '/' => {
1514                    chars.next(); // consume the '/'
1515                    match chars.peek() {
1516                        Some('*') => {
1517                            chars.next(); // consume the '*', starting a multi-line comment
1518                            self.tokenize_multiline_comment(chars)
1519                        }
1520                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
1521                            chars.next(); // consume the second '/', starting a snowflake single-line comment
1522                            let comment = self.tokenize_single_line_comment(chars);
1523                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1524                                prefix: "//".to_owned(),
1525                                comment,
1526                            })))
1527                        }
1528                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
1529                            self.consume_and_return(chars, Token::DuckIntDiv)
1530                        }
1531                        // a regular '/' operator
1532                        _ => Ok(Some(Token::Div)),
1533                    }
1534                }
1535                '+' => self.consume_and_return(chars, Token::Plus),
1536                '*' => self.consume_and_return(chars, Token::Mul),
1537                '%' => {
1538                    chars.next(); // advance past '%'
1539                    match chars.peek() {
1540                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
1541                        Some(sch) if self.dialect.is_identifier_start('%') => {
1542                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1543                        }
1544                        _ => self.start_binop(chars, "%", Token::Mod),
1545                    }
1546                }
1547                '|' => {
1548                    chars.next(); // consume the '|'
1549                    match chars.peek() {
1550                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
1551                        Some('|') => {
1552                            chars.next(); // consume the second '|'
1553                            match chars.peek() {
1554                                Some('/') => {
1555                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
1556                                }
1557                                _ => self.start_binop(chars, "||", Token::StringConcat),
1558                            }
1559                        }
1560                        Some('&') if self.dialect.supports_geometric_types() => {
1561                            chars.next(); // consume
1562                            match chars.peek() {
1563                                Some('>') => self.consume_for_binop(
1564                                    chars,
1565                                    "|&>",
1566                                    Token::VerticalBarAmpersandRightAngleBracket,
1567                                ),
1568                                _ => self.start_binop_opt(chars, "|&", None),
1569                            }
1570                        }
1571                        Some('>') if self.dialect.supports_geometric_types() => {
1572                            chars.next(); // consume
1573                            match chars.peek() {
1574                                Some('>') => self.consume_for_binop(
1575                                    chars,
1576                                    "|>>",
1577                                    Token::VerticalBarShiftRight,
1578                                ),
1579                                _ => self.start_binop_opt(chars, "|>", None),
1580                            }
1581                        }
1582                        Some('>') if self.dialect.supports_pipe_operator() => {
1583                            self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
1584                        }
1585                        // Bitshift '|' operator
1586                        _ => self.start_binop(chars, "|", Token::Pipe),
1587                    }
1588                }
1589                '=' => {
1590                    chars.next(); // consume
1591                    match chars.peek() {
1592                        Some('>') => self.consume_and_return(chars, Token::RArrow),
1593                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
1594                        _ => Ok(Some(Token::Eq)),
1595                    }
1596                }
1597                '!' => {
1598                    chars.next(); // consume
1599                    match chars.peek() {
1600                        Some('=') => self.consume_and_return(chars, Token::Neq),
1601                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
1602                        Some('~') => {
1603                            chars.next();
1604                            match chars.peek() {
1605                                Some('*') => self
1606                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
1607                                Some('~') => {
1608                                    chars.next();
1609                                    match chars.peek() {
1610                                        Some('*') => self.consume_and_return(
1611                                            chars,
1612                                            Token::ExclamationMarkDoubleTildeAsterisk,
1613                                        ),
1614                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
1615                                    }
1616                                }
1617                                _ => Ok(Some(Token::ExclamationMarkTilde)),
1618                            }
1619                        }
1620                        _ => Ok(Some(Token::ExclamationMark)),
1621                    }
1622                }
1623                '<' => {
1624                    chars.next(); // consume
1625                    match chars.peek() {
1626                        Some('=') => {
1627                            chars.next();
1628                            match chars.peek() {
1629                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
1630                                // `<=+` and `<=-` are not valid combined operators; treat `<=` as
1631                                // the operator and leave `+`/`-` to be tokenized separately.
1632                                Some('+') | Some('-') => Ok(Some(Token::LtEq)),
1633                                _ => self.start_binop(chars, "<=", Token::LtEq),
1634                            }
1635                        }
1636                        Some('|') if self.dialect.supports_geometric_types() => {
1637                            self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
1638                        }
1639                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
1640                        Some('<') if self.dialect.supports_geometric_types() => {
1641                            chars.next(); // consume
1642                            match chars.peek() {
1643                                Some('|') => self.consume_for_binop(
1644                                    chars,
1645                                    "<<|",
1646                                    Token::ShiftLeftVerticalBar,
1647                                ),
1648                                _ => self.start_binop(chars, "<<", Token::ShiftLeft),
1649                            }
1650                        }
1651                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
1652                        // `<+` is not a valid combined operator; treat `<` as the operator
1653                        // and leave `+` to be tokenized separately.
1654                        Some('+') => Ok(Some(Token::Lt)),
1655                        Some('-') if self.dialect.supports_geometric_types() => {
1656                            if chars.peekable.clone().nth(1) == Some('>') {
1657                                chars.next(); // consume `-`
1658                                self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
1659                            } else {
1660                                Ok(Some(Token::Lt))
1661                            }
1662                        }
1663                        Some('^') if self.dialect.supports_geometric_types() => {
1664                            self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
1665                        }
1666                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
1667                        _ => self.start_binop(chars, "<", Token::Lt),
1668                    }
1669                }
1670                '>' => {
1671                    chars.next(); // consume
1672                    match chars.peek() {
1673                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
1674                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
1675                        Some('^') if self.dialect.supports_geometric_types() => {
1676                            self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
1677                        }
1678                        _ => self.start_binop(chars, ">", Token::Gt),
1679                    }
1680                }
1681                ':' => {
1682                    chars.next();
1683                    match chars.peek() {
1684                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
1685                        Some('=') => self.consume_and_return(chars, Token::Assignment),
1686                        _ => Ok(Some(Token::Colon)),
1687                    }
1688                }
1689                ';' => self.consume_and_return(chars, Token::SemiColon),
1690                '\\' => self.consume_and_return(chars, Token::Backslash),
1691                '[' => self.consume_and_return(chars, Token::LBracket),
1692                ']' => self.consume_and_return(chars, Token::RBracket),
1693                '&' => {
1694                    chars.next(); // consume the '&'
1695                    match chars.peek() {
1696                        Some('>') if self.dialect.supports_geometric_types() => {
1697                            chars.next();
1698                            self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
1699                        }
1700                        Some('<') if self.dialect.supports_geometric_types() => {
1701                            chars.next(); // consume
1702                            match chars.peek() {
1703                                Some('|') => self.consume_and_return(
1704                                    chars,
1705                                    Token::AmpersandLeftAngleBracketVerticalBar,
1706                                ),
1707                                _ => {
1708                                    self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
1709                                }
1710                            }
1711                        }
1712                        Some('&') => {
1713                            chars.next(); // consume the second '&'
1714                            self.start_binop(chars, "&&", Token::Overlap)
1715                        }
1716                        // Bitshift '&' operator
1717                        _ => self.start_binop(chars, "&", Token::Ampersand),
1718                    }
1719                }
1720                '^' => {
1721                    chars.next(); // consume the '^'
1722                    match chars.peek() {
1723                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
1724                        _ => Ok(Some(Token::Caret)),
1725                    }
1726                }
1727                '{' => self.consume_and_return(chars, Token::LBrace),
1728                '}' => self.consume_and_return(chars, Token::RBrace),
1729                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
1730                {
1731                    chars.next(); // consume the '#', starting a snowflake single-line comment
1732                    let comment = self.tokenize_single_line_comment(chars);
1733                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1734                        prefix: "#".to_owned(),
1735                        comment,
1736                    })))
1737                }
1738                '~' => {
1739                    chars.next(); // consume
1740                    match chars.peek() {
1741                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
1742                        Some('=') if self.dialect.supports_geometric_types() => {
1743                            self.consume_for_binop(chars, "~=", Token::TildeEqual)
1744                        }
1745                        Some('~') => {
1746                            chars.next();
1747                            match chars.peek() {
1748                                Some('*') => {
1749                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
1750                                }
1751                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
1752                            }
1753                        }
1754                        _ => self.start_binop(chars, "~", Token::Tilde),
1755                    }
1756                }
1757                '#' => {
1758                    chars.next();
1759                    match chars.peek() {
1760                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
1761                        Some('>') => {
1762                            chars.next();
1763                            match chars.peek() {
1764                                Some('>') => {
1765                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
1766                                }
1767                                _ => self.start_binop(chars, "#>", Token::HashArrow),
1768                            }
1769                        }
1770                        Some(' ') => Ok(Some(Token::Sharp)),
1771                        Some('#') if self.dialect.supports_geometric_types() => {
1772                            self.consume_for_binop(chars, "##", Token::DoubleSharp)
1773                        }
1774                        Some(sch) if self.dialect.is_identifier_start('#') => {
1775                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1776                        }
1777                        _ => self.start_binop(chars, "#", Token::Sharp),
1778                    }
1779                }
1780                '@' => {
1781                    chars.next();
1782                    match chars.peek() {
1783                        Some('@') if self.dialect.supports_geometric_types() => {
1784                            self.consume_and_return(chars, Token::AtAt)
1785                        }
1786                        Some('-') if self.dialect.supports_geometric_types() => {
1787                            chars.next();
1788                            match chars.peek() {
1789                                Some('@') => self.consume_and_return(chars, Token::AtDashAt),
1790                                _ => self.start_binop_opt(chars, "@-", None),
1791                            }
1792                        }
1793                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
1794                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
1795                        Some('@') => {
1796                            chars.next();
1797                            match chars.peek() {
1798                                Some(' ') => Ok(Some(Token::AtAt)),
1799                                Some(tch) if self.dialect.is_identifier_start('@') => {
1800                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
1801                                }
1802                                _ => Ok(Some(Token::AtAt)),
1803                            }
1804                        }
1805                        Some(' ') => Ok(Some(Token::AtSign)),
1806                        // We break on quotes here, because no dialect allows identifiers starting
1807                        // with @ and containing quotation marks (e.g. `@'foo'`) unless they are
1808                        // quoted, which is tokenized as a quoted string, not here (e.g.
1809                        // `"@'foo'"`). Further, at least two dialects parse `@` followed by a
1810                        // quoted string as two separate tokens, which this allows. For example,
1811                        // Postgres parses `@'1'` as the absolute value of '1' which is implicitly
1812                        // cast to a numeric type. And when parsing MySQL-style grantees (e.g.
1813                        // `GRANT ALL ON *.* to 'root'@'localhost'`), we also want separate tokens
1814                        // for the user, the `@`, and the host.
1815                        Some('\'') => Ok(Some(Token::AtSign)),
1816                        Some('\"') => Ok(Some(Token::AtSign)),
1817                        Some('`') => Ok(Some(Token::AtSign)),
1818                        Some(sch) if self.dialect.is_identifier_start('@') => {
1819                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1820                        }
1821                        _ => Ok(Some(Token::AtSign)),
1822                    }
1823                }
1824                // Postgres uses ? for jsonb operators, not prepared statements
1825                '?' if self.dialect.supports_geometric_types() => {
1826                    chars.next(); // consume
1827                    match chars.peek() {
1828                        Some('|') => {
1829                            chars.next();
1830                            match chars.peek() {
1831                                Some('|') => self.consume_and_return(
1832                                    chars,
1833                                    Token::QuestionMarkDoubleVerticalBar,
1834                                ),
1835                                _ => Ok(Some(Token::QuestionPipe)),
1836                            }
1837                        }
1838
1839                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
1840                        Some('-') => {
1841                            chars.next(); // consume
1842                            match chars.peek() {
1843                                Some('|') => self
1844                                    .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
1845                                _ => Ok(Some(Token::QuestionMarkDash)),
1846                            }
1847                        }
1848                        Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
1849                        _ => Ok(Some(Token::Question)),
1850                    }
1851                }
1852                '?' => {
1853                    chars.next();
1854                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
1855                    Ok(Some(Token::Placeholder(format!("?{s}"))))
1856                }
1857
1858                // identifier or keyword
1859                ch if self.dialect.is_identifier_start(ch) => {
1860                    self.tokenize_identifier_or_keyword([ch], chars)
1861                }
1862                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
1863
1864                // whitespace check (including unicode chars) should be last as it covers some of the chars above
1865                ch if ch.is_whitespace() => {
1866                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
1867                }
1868                other => self.consume_and_return(chars, Token::Char(other)),
1869            },
1870            None => Ok(None),
1871        }
1872    }
1873
    /// Consume the next character, then parse a custom binary operator. The next character should be included in the prefix
    ///
    /// `prefix` is the operator text matched so far (including the char being
    /// consumed here); `default` is the token to yield when the dialect does
    /// not extend the operator with further custom-operator characters.
    fn consume_for_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        self.start_binop_opt(chars, prefix, Some(default))
    }
1884
    /// parse a custom binary operator
    ///
    /// Like [`Tokenizer::start_binop_opt`], but with a required fallback
    /// token: if the dialect does not extend `prefix` into a custom operator,
    /// `default` is returned.
    fn start_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        self.start_binop_opt(chars, prefix, Some(default))
    }
1894
1895    /// parse a custom binary operator
1896    fn start_binop_opt(
1897        &self,
1898        chars: &mut State,
1899        prefix: &str,
1900        default: Option<Token>,
1901    ) -> Result<Option<Token>, TokenizerError> {
1902        let mut custom = None;
1903        while let Some(&ch) = chars.peek() {
1904            if !self.dialect.is_custom_operator_part(ch) {
1905                break;
1906            }
1907
1908            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
1909            chars.next();
1910        }
1911        match (custom, default) {
1912            (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
1913            (None, Some(tok)) => Ok(Some(tok)),
1914            (None, None) => self.tokenizer_error(
1915                chars.location(),
1916                format!("Expected a valid binary operator after '{prefix}'"),
1917            ),
1918        }
1919    }
1920
    /// Tokenize dollar preceded value (i.e: a string/placeholder)
    ///
    /// Called with `chars` positioned on the leading `$`. Depending on the
    /// dialect and what follows, this produces one of:
    /// - `Token::DollarQuotedString` for `$$...$$` or tagged `$tag$...$tag$`,
    /// - `Token::Placeholder` for `$name`, `$1`, or (money-prefix dialects)
    ///   `$123.45`.
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        // `s` accumulates the quoted string body; `value` accumulates the
        // tag / placeholder name.
        let mut s = String::new();
        let mut value = String::new();

        chars.next();

        // If the dialect does not support dollar-quoted strings, then `$$` is rather a placeholder.
        // Here we handle the untagged `$$...$$` form for dialects that DO
        // treat it as a quoted string.
        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
            chars.next();

            let mut is_terminated = false;
            // Previous character, used to detect the closing `$$` pair.
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        // Second `$` of the closing delimiter: done.
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        // The previous `$` was not a delimiter after all;
                        // emit it (it was withheld below) plus this char.
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }
                // Note: a lone `$` is withheld here until the next char
                // shows whether it starts the closing `$$`.

                prev = Some(ch);
                chars.next();
            }

            // Input ran out before we saw the closing `$$`.
            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Read the tag (for `$tag$...$tag$`) or the placeholder name.
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric()
                    || ch == '_'
                    // Allow $ as a placeholder character if the dialect supports it
                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
            }));

            // If the dialect supports a dollar sign as a money prefix (e.g. SQL Server),
            // and the value so far is all digits, check for a decimal part, e.g. `$123.45`
            if matches!(chars.peek(), Some('.'))
                && self.dialect.supports_dollar_as_money_prefix()
                && !value.is_empty()
                && value.chars().all(|c| c.is_ascii_digit())
            {
                value.push('.');
                chars.next();
                value.push_str(&peeking_take_while(chars, |ch| ch.is_ascii_digit()));
                return Ok(Token::Placeholder(format!("${value}")));
            }

            // If the dialect does not support dollar-quoted strings, don't look for the end delimiter.
            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
                chars.next();

                // Scan forward until the body ends with `$tag$`.
                let mut temp = String::new();
                let end_delimiter = format!("${value}$");

                loop {
                    match chars.next() {
                        Some(ch) => {
                            temp.push(ch);

                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }
                        }
                        None => {
                            // End of input: accept only if the delimiter was
                            // the very last thing read; otherwise error.
                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }

                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted, expected $",
                            );
                        }
                    }
                }
            } else {
                // No opening delimiter followed the name: plain placeholder.
                return Ok(Token::Placeholder(format!("${value}")));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }
2026
2027    fn tokenizer_error<R>(
2028        &self,
2029        loc: Location,
2030        message: impl Into<String>,
2031    ) -> Result<R, TokenizerError> {
2032        Err(TokenizerError {
2033            message: message.into(),
2034            location: loc,
2035        })
2036    }
2037
2038    // Consume characters until newline
2039    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
2040        let mut comment = peeking_take_while(chars, |ch| match ch {
2041            '\n' => false,                                           // Always stop at \n
2042            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres
2043            _ => true, // Keep consuming for other characters
2044        });
2045
2046        if let Some(ch) = chars.next() {
2047            assert!(ch == '\n' || ch == '\r');
2048            comment.push(ch);
2049        }
2050
2051        comment
2052    }
2053
2054    /// Tokenize an identifier or keyword, after the first char is already consumed.
2055    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
2056        let mut s = first_chars.into();
2057        s.push_str(&peeking_take_while(chars, |ch| {
2058            self.dialect.is_identifier_part(ch)
2059        }));
2060        s
2061    }
2062
2063    /// Read a quoted identifier
2064    fn tokenize_quoted_identifier(
2065        &self,
2066        quote_start: char,
2067        chars: &mut State,
2068    ) -> Result<String, TokenizerError> {
2069        let error_loc = chars.location();
2070        chars.next(); // consume the opening quote
2071        let quote_end = Word::matching_end_quote(quote_start);
2072        let (s, last_char) = self.parse_quoted_ident(chars, quote_end);
2073
2074        if last_char == Some(quote_end) {
2075            Ok(s)
2076        } else {
2077            self.tokenizer_error(
2078                error_loc,
2079                format!("Expected close delimiter '{quote_end}' before EOF."),
2080            )
2081        }
2082    }
2083
2084    /// Read a single quoted string, starting with the opening quote.
2085    fn tokenize_escaped_single_quoted_string(
2086        &self,
2087        starting_loc: Location,
2088        chars: &mut State,
2089    ) -> Result<String, TokenizerError> {
2090        if let Some(s) = unescape_single_quoted_string(chars) {
2091            return Ok(s);
2092        }
2093
2094        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
2095    }
2096
    /// Reads a string literal quoted by a single or triple quote characters.
    /// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
    ///
    /// `single_quote_token`/`triple_quote_token` build the resulting token
    /// from the literal's content, depending on how many opening quotes
    /// were found.
    fn tokenize_single_or_triple_quoted_string<F>(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
        single_quote_token: F,
        triple_quote_token: F,
    ) -> Result<Option<Token>, TokenizerError>
    where
        F: Fn(String) -> Token,
    {
        let error_loc = chars.location();

        // Count (and consume) up to three consecutive opening quote chars.
        let mut num_opening_quotes = 0u8;
        for _ in 0..3 {
            if Some(&quote_style) == chars.peek() {
                chars.next(); // Consume quote.
                num_opening_quotes += 1;
            } else {
                break;
            }
        }

        let (token_fn, num_quote_chars) = match num_opening_quotes {
            1 => (single_quote_token, NumStringQuoteChars::One),
            2 => {
                // If we matched double quotes, then this is an empty string.
                return Ok(Some(single_quote_token("".into())));
            }
            3 => {
                // NonZeroU8::new(3) can never fail; the else branch is defensive.
                let Some(num_quote_chars) = NonZeroU8::new(3) else {
                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
                };
                (
                    triple_quote_token,
                    NumStringQuoteChars::Many(num_quote_chars),
                )
            }
            _ => {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        };

        // Opening quotes have already been consumed above, so ask
        // tokenize_quoted_string to consume none itself.
        let settings = TokenizeQuotedStringSettings {
            quote_style,
            num_quote_chars,
            num_opening_quotes_to_consume: 0,
            backslash_escape,
        };

        self.tokenize_quoted_string(chars, settings)
            .map(token_fn)
            .map(Some)
    }
2153
2154    /// Reads a string literal quoted by a single quote character.
2155    fn tokenize_single_quoted_string(
2156        &self,
2157        chars: &mut State,
2158        quote_style: char,
2159        backslash_escape: bool,
2160    ) -> Result<String, TokenizerError> {
2161        self.tokenize_quoted_string(
2162            chars,
2163            TokenizeQuotedStringSettings {
2164                quote_style,
2165                num_quote_chars: NumStringQuoteChars::One,
2166                num_opening_quotes_to_consume: 1,
2167                backslash_escape,
2168            },
2169        )
2170    }
2171
    /// Reads a quote delimited string expecting `chars.next()` to deliver a quote.
    ///
    /// See <https://docs.oracle.com/en/database/oracle/oracle-database/21/sqlrf/Literals.html#GUID-1824CBAA-6E16-4921-B2A6-112FB02248DA>
    fn tokenize_quote_delimited_string(
        &self,
        chars: &mut State,
        // the prefix that introduced the possible literal or word,
        // e.g. "Q" or "nq"
        literal_prefix: &[char],
    ) -> Result<QuoteDelimitedString, TokenizerError> {
        let literal_start_loc = chars.location();
        chars.next();

        // The char after the `'` picks the delimiter pair: brackets map to
        // their closing counterpart, anything else closes with itself.
        // Whitespace or EOF in that position is invalid.
        let start_quote_loc = chars.location();
        let (start_quote, end_quote) = match chars.next() {
            None | Some(' ') | Some('\t') | Some('\r') | Some('\n') => {
                return self.tokenizer_error(
                    start_quote_loc,
                    format!(
                        "Invalid space, tab, newline, or EOF after '{}''",
                        String::from_iter(literal_prefix)
                    ),
                );
            }
            Some(c) => (
                c,
                match c {
                    '[' => ']',
                    '{' => '}',
                    '<' => '>',
                    '(' => ')',
                    c => c,
                },
            ),
        };

        // read the string literal until the "quote character" following a by literal quote
        // (i.e. the literal ends at the sequence `end_quote` + `'`; a lone
        // end_quote char is treated as content and pushed into the value).
        let mut value = String::new();
        while let Some(ch) = chars.next() {
            if ch == end_quote {
                if let Some('\'') = chars.peek() {
                    chars.next(); // ~ consume the quote
                    return Ok(QuoteDelimitedString {
                        start_quote,
                        value,
                        end_quote,
                    });
                }
            }
            value.push(ch);
        }

        self.tokenizer_error(literal_start_loc, "Unterminated string literal")
    }
2226
    /// Read a quoted string.
    ///
    /// Handles both single-quote-char (`'abc'`) and multi-quote-char
    /// (`'''abc'''`) literals according to `settings`, including doubled
    /// quote escapes and, when enabled, backslash escapes. Returns the
    /// literal's content without the surrounding quotes.
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        // Consume any opening quotes.
        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        // Tracks how many quote chars were seen in a row; used to detect the
        // closing delimiter of multi-quote (e.g. triple-quoted) literals.
        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            // A quote seen now would be "final" if either the literal uses a
            // single quote char, or it completes the full run of quote chars.
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume

                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // For an initial string like `"""abc"""`, at this point we have
                        // `abc""` in the buffer and have now matched the final `"`.
                        // However, the string to return is simply `abc`, so we strip off
                        // the trailing quotes before returning.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        // Doubled quote: an escaped quote character, not the end.
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        // Single closing quote: literal complete.
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    // consume backslash
                    chars.next();

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape
                            || (self.dialect.ignores_wildcard_escapes()
                                && (*next == '%' || *next == '_'))
                        {
                            // In no-escape mode, the given query has to be saved completely
                            // including backslashes. Similarly, with ignore_like_wildcard_escapes,
                            // the backslash is not stripped.
                            s.push(ch);
                            s.push(*next);
                            chars.next(); // consume next
                        } else {
                            // Translate recognized escape sequences; any other
                            // escaped char is kept as-is (the backslash is dropped).
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next(); // consume next
                        }
                    }
                }
                ch => {
                    chars.next(); // consume ch

                    // Maintain the consecutive-quote count for multi-quote endings.
                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }
2333
2334    fn tokenize_multiline_comment(
2335        &self,
2336        chars: &mut State,
2337    ) -> Result<Option<Token>, TokenizerError> {
2338        let mut s = String::new();
2339        let mut nested = 1;
2340        let supports_nested_comments = self.dialect.supports_nested_comments();
2341        loop {
2342            match chars.next() {
2343                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
2344                    chars.next(); // consume the '*'
2345                    s.push('/');
2346                    s.push('*');
2347                    nested += 1;
2348                }
2349                Some('*') if matches!(chars.peek(), Some('/')) => {
2350                    chars.next(); // consume the '/'
2351                    nested -= 1;
2352                    if nested == 0 {
2353                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
2354                    }
2355                    s.push('*');
2356                    s.push('/');
2357                }
2358                Some(ch) => {
2359                    s.push(ch);
2360                }
2361                None => {
2362                    break self.tokenizer_error(
2363                        chars.location(),
2364                        "Unexpected EOF while in a multi-line comment",
2365                    );
2366                }
2367            }
2368        }
2369    }
2370
2371    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
2372        let mut last_char = None;
2373        let mut s = String::new();
2374        while let Some(ch) = chars.next() {
2375            if ch == quote_end {
2376                if chars.peek() == Some(&quote_end) {
2377                    chars.next();
2378                    s.push(ch);
2379                    if !self.unescape {
2380                        // In no-escape mode, the given query has to be saved completely
2381                        s.push(ch);
2382                    }
2383                } else {
2384                    last_char = Some(quote_end);
2385                    break;
2386                }
2387            } else {
2388                s.push(ch);
2389            }
2390        }
2391        (s, last_char)
2392    }
2393
2394    #[allow(clippy::unnecessary_wraps)]
2395    fn consume_and_return(
2396        &self,
2397        chars: &mut State,
2398        t: Token,
2399    ) -> Result<Option<Token>, TokenizerError> {
2400        chars.next();
2401        Ok(Some(t))
2402    }
2403}
2404
2405/// Read from `chars` until `predicate` returns `false` or EOF is hit.
2406/// Return the characters read as String, and keep the first non-matching
2407/// char available as `chars.next()`.
2408fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
2409    let mut s = String::new();
2410    while let Some(&ch) = chars.peek() {
2411        if predicate(ch) {
2412            chars.next(); // consume
2413            s.push(ch);
2414        } else {
2415            break;
2416        }
2417    }
2418    s
2419}
2420
2421/// Same as peeking_take_while, but also passes the next character to the predicate.
2422fn peeking_next_take_while(
2423    chars: &mut State,
2424    mut predicate: impl FnMut(char, Option<char>) -> bool,
2425) -> String {
2426    let mut s = String::new();
2427    while let Some(&ch) = chars.peek() {
2428        let next_char = chars.peekable.clone().nth(1);
2429        if predicate(ch, next_char) {
2430            chars.next(); // consume
2431            s.push(ch);
2432        } else {
2433            break;
2434        }
2435    }
2436    s
2437}
2438
/// Unescape a single-quoted string literal positioned at the opening quote.
///
/// Returns `None` when the literal is unterminated or contains an escape
/// sequence that fails to decode.
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
    Unescape::new(chars).unescape()
}
2442
/// Helper that decodes backslash escape sequences (`\n`, `\xNN`, `\ooo`,
/// `\uXXXX`, `\UXXXXXXXX`, ...) while consuming a single-quoted string
/// literal from the tokenizer state.
struct Unescape<'a: 'b, 'b> {
    chars: &'b mut State<'a>,
}
2446
impl<'a: 'b, 'b> Unescape<'a, 'b> {
    fn new(chars: &'b mut State<'a>) -> Self {
        Self { chars }
    }
    /// Consume the literal (starting at its opening quote) and return the
    /// unescaped content, or `None` on an unterminated literal, an invalid
    /// escape, or a decoded NUL character.
    fn unescape(mut self) -> Option<String> {
        let mut unescaped = String::new();

        // Consume the opening quote.
        self.chars.next();

        while let Some(c) = self.chars.next() {
            if c == '\'' {
                // case: ''''
                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                    self.chars.next();
                    unescaped.push('\'');
                    continue;
                }
                // Closing quote reached.
                return Some(unescaped);
            }

            if c != '\\' {
                unescaped.push(c);
                continue;
            }

            // Decode the character following the backslash.
            let c = match self.chars.next()? {
                'b' => '\u{0008}',
                'f' => '\u{000C}',
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                'u' => self.unescape_unicode_16()?,
                'U' => self.unescape_unicode_32()?,
                'x' => self.unescape_hex()?,
                c if c.is_digit(8) => self.unescape_octal(c)?,
                c => c,
            };

            unescaped.push(Self::check_null(c)?);
        }

        // EOF before the closing quote: unterminated literal.
        None
    }

    // Reject a decoded NUL: returning None aborts the whole unescape.
    #[inline]
    fn check_null(c: char) -> Option<char> {
        if c == '\0' {
            None
        } else {
            Some(c)
        }
    }

    // Parse `s` in the given radix and map it to an ASCII char; values
    // above 127 (after masking to a byte) are rejected.
    #[inline]
    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
        // u32 is used here because Pg has an overflow operation rather than throwing an exception directly.
        match u32::from_str_radix(s, RADIX) {
            Err(_) => None,
            Ok(n) => {
                let n = n & 0xFF;
                if n <= 127 {
                    char::from_u32(n)
                } else {
                    None
                }
            }
        }
    }

    // Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
    fn unescape_hex(&mut self) -> Option<char> {
        let mut s = String::new();

        for _ in 0..2 {
            match self.next_hex_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        // `\x` followed by no hex digit is taken as a literal 'x'.
        if s.is_empty() {
            return Some('x');
        }

        Self::byte_to_char::<16>(&s)
    }

    // Consume one hex digit, if the next char is one.
    #[inline]
    fn next_hex_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
            _ => None,
        }
    }

    // Octal byte value. \o, \oo, \ooo (o = 0–7)
    fn unescape_octal(&mut self, c: char) -> Option<char> {
        let mut s = String::new();

        // `c` is the first octal digit, already consumed by the caller.
        s.push(c);
        for _ in 0..2 {
            match self.next_octal_digest() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        Self::byte_to_char::<8>(&s)
    }

    // Consume one octal digit, if the next char is one.
    #[inline]
    fn next_octal_digest(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_digit(8) => self.chars.next(),
            _ => None,
        }
    }

    // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
    fn unescape_unicode_16(&mut self) -> Option<char> {
        self.unescape_unicode::<4>()
    }

    // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
    fn unescape_unicode_32(&mut self) -> Option<char> {
        self.unescape_unicode::<8>()
    }

    // Read exactly NUM hex digits and convert to a char; `None` on EOF,
    // bad digits, or an invalid code point.
    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
        let mut s = String::new();
        for _ in 0..NUM {
            s.push(self.chars.next()?);
        }
        match u32::from_str_radix(&s, 16) {
            Err(_) => None,
            Ok(n) => char::from_u32(n),
        }
    }
}
2586
2587fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
2588    let mut unescaped = String::new();
2589    chars.next(); // consume the opening quote
2590    while let Some(c) = chars.next() {
2591        match c {
2592            '\'' => {
2593                if chars.peek() == Some(&'\'') {
2594                    chars.next();
2595                    unescaped.push('\'');
2596                } else {
2597                    return Ok(unescaped);
2598                }
2599            }
2600            '\\' => match chars.peek() {
2601                Some('\\') => {
2602                    chars.next();
2603                    unescaped.push('\\');
2604                }
2605                Some('+') => {
2606                    chars.next();
2607                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
2608                }
2609                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
2610            },
2611            _ => {
2612                unescaped.push(c);
2613            }
2614        }
2615    }
2616    Err(TokenizerError {
2617        message: "Unterminated unicode encoded string literal".to_string(),
2618        location: chars.location(),
2619    })
2620}
2621
2622fn take_char_from_hex_digits(
2623    chars: &mut State<'_>,
2624    max_digits: usize,
2625) -> Result<char, TokenizerError> {
2626    let mut result = 0u32;
2627    for _ in 0..max_digits {
2628        let next_char = chars.next().ok_or_else(|| TokenizerError {
2629            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
2630                .to_string(),
2631            location: chars.location(),
2632        })?;
2633        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
2634            message: format!("Invalid hex digit in escaped unicode string: {next_char}"),
2635            location: chars.location(),
2636        })?;
2637        result = result * 16 + digit;
2638    }
2639    char::from_u32(result).ok_or_else(|| TokenizerError {
2640        message: format!("Invalid unicode character: {result:x}"),
2641        location: chars.location(),
2642    })
2643}
2644
2645#[cfg(test)]
2646mod tests {
2647    use super::*;
2648    use crate::dialect::{
2649        BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect,
2650        PostgreSqlDialect, SQLiteDialect,
2651    };
2652    use crate::test_utils::{all_dialects, all_dialects_except, all_dialects_where};
2653    use core::fmt::Debug;
2654
    #[test]
    fn tokenizer_error_impl() {
        // TokenizerError implements Error (with no source) and Display.
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        {
            use core::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
    }
2667
2668    #[test]
2669    fn tokenize_select_1() {
2670        let sql = String::from("SELECT 1");
2671        let dialect = GenericDialect {};
2672        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2673
2674        let expected = vec![
2675            Token::make_keyword("SELECT"),
2676            Token::Whitespace(Whitespace::Space),
2677            Token::Number(String::from("1"), false),
2678        ];
2679
2680        compare(expected, tokens);
2681    }
2682
2683    #[test]
2684    fn tokenize_select_float() {
2685        let sql = String::from("SELECT .1");
2686        let dialect = GenericDialect {};
2687        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2688
2689        let expected = vec![
2690            Token::make_keyword("SELECT"),
2691            Token::Whitespace(Whitespace::Space),
2692            Token::Number(String::from(".1"), false),
2693        ];
2694
2695        compare(expected, tokens);
2696    }
2697
    #[test]
    fn tokenize_with_mapper() {
        // The mapper hook rewrites tokens during tokenization; here every
        // `?` placeholder is renumbered to `$1`, `$2`, ...
        let sql = String::from("SELECT ?");
        let dialect = GenericDialect {};
        let mut param_num = 1;

        let mut tokens = vec![];
        Tokenizer::new(&dialect, &sql)
            .tokenize_with_location_into_buf_with_mapper(&mut tokens, |mut token_span| {
                token_span.token = match token_span.token {
                    Token::Placeholder(n) => Token::Placeholder(if n == "?" {
                        let ret = format!("${}", param_num);
                        param_num += 1;
                        ret
                    } else {
                        n
                    }),
                    token => token,
                };
                token_span
            })
            .unwrap();
        let actual = tokens.into_iter().map(|t| t.token).collect();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Placeholder("$1".to_string()),
        ];

        compare(expected, actual);
    }
2729
    #[test]
    fn tokenize_clickhouse_double_equal() {
        // ClickHouse accepts `==` as a single DoubleEq token.
        let sql = String::from("SELECT foo=='1'");
        let dialect = ClickHouseDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "foo".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::DoubleEq,
            Token::SingleQuotedString("1".to_string()),
        ];

        compare(expected, tokens);
    }
2751
    #[test]
    fn tokenize_numeric_literal_underscore() {
        // Without the feature, `10_000` splits into a number and a word.
        let dialect = GenericDialect {};
        let sql = String::from("SELECT 10_000");
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("10".to_string(), false),
            Token::make_word("_000", None),
        ];
        compare(expected, tokens);

        // With the feature, underscores are allowed between digits only.
        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
            "SELECT 10_000, _10_000, 10_00_, 10___0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_000".to_string(), false),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::make_word("_10_000", None), // leading underscore tokenizes as a word (parsed as column identifier)
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_00".to_string(), false),
                Token::make_word("_", None), // trailing underscores tokenizes as a word (syntax error in some dialects)
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10".to_string(), false),
                Token::make_word("___0", None), // multiple underscores tokenizes as a word (syntax error in some dialects)
            ],
        );
    }
2786
    #[test]
    fn tokenize_select_exponent() {
        // Exponent notation attaches to the number only when well-formed;
        // trailing letters or a second sign split into separate tokens.
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }
2820
2821    #[test]
2822    fn tokenize_scalar_function() {
2823        let sql = String::from("SELECT sqrt(1)");
2824        let dialect = GenericDialect {};
2825        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2826
2827        let expected = vec![
2828            Token::make_keyword("SELECT"),
2829            Token::Whitespace(Whitespace::Space),
2830            Token::make_word("sqrt", None),
2831            Token::LParen,
2832            Token::Number(String::from("1"), false),
2833            Token::RParen,
2834        ];
2835
2836        compare(expected, tokens);
2837    }
2838
2839    #[test]
2840    fn tokenize_string_string_concat() {
2841        let sql = String::from("SELECT 'a' || 'b'");
2842        let dialect = GenericDialect {};
2843        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2844
2845        let expected = vec![
2846            Token::make_keyword("SELECT"),
2847            Token::Whitespace(Whitespace::Space),
2848            Token::SingleQuotedString(String::from("a")),
2849            Token::Whitespace(Whitespace::Space),
2850            Token::StringConcat,
2851            Token::Whitespace(Whitespace::Space),
2852            Token::SingleQuotedString(String::from("b")),
2853        ];
2854
2855        compare(expected, tokens);
2856    }
    #[test]
    fn tokenize_bitwise_op() {
        // `|` and `^` tokenize as the bitwise Pipe and Caret operators.
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }
2878
2879    #[test]
2880    fn tokenize_logical_xor() {
2881        let sql =
2882            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
2883        let dialect = GenericDialect {};
2884        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2885
2886        let expected = vec![
2887            Token::make_keyword("SELECT"),
2888            Token::Whitespace(Whitespace::Space),
2889            Token::make_keyword("true"),
2890            Token::Whitespace(Whitespace::Space),
2891            Token::make_keyword("XOR"),
2892            Token::Whitespace(Whitespace::Space),
2893            Token::make_keyword("true"),
2894            Token::Comma,
2895            Token::Whitespace(Whitespace::Space),
2896            Token::make_keyword("false"),
2897            Token::Whitespace(Whitespace::Space),
2898            Token::make_keyword("XOR"),
2899            Token::Whitespace(Whitespace::Space),
2900            Token::make_keyword("false"),
2901            Token::Comma,
2902            Token::Whitespace(Whitespace::Space),
2903            Token::make_keyword("true"),
2904            Token::Whitespace(Whitespace::Space),
2905            Token::make_keyword("XOR"),
2906            Token::Whitespace(Whitespace::Space),
2907            Token::make_keyword("false"),
2908            Token::Comma,
2909            Token::Whitespace(Whitespace::Space),
2910            Token::make_keyword("false"),
2911            Token::Whitespace(Whitespace::Space),
2912            Token::make_keyword("XOR"),
2913            Token::Whitespace(Whitespace::Space),
2914            Token::make_keyword("true"),
2915        ];
2916        compare(expected, tokens);
2917    }
2918
2919    #[test]
2920    fn tokenize_simple_select() {
2921        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
2922        let dialect = GenericDialect {};
2923        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2924
2925        let expected = vec![
2926            Token::make_keyword("SELECT"),
2927            Token::Whitespace(Whitespace::Space),
2928            Token::Mul,
2929            Token::Whitespace(Whitespace::Space),
2930            Token::make_keyword("FROM"),
2931            Token::Whitespace(Whitespace::Space),
2932            Token::make_word("customer", None),
2933            Token::Whitespace(Whitespace::Space),
2934            Token::make_keyword("WHERE"),
2935            Token::Whitespace(Whitespace::Space),
2936            Token::make_word("id", None),
2937            Token::Whitespace(Whitespace::Space),
2938            Token::Eq,
2939            Token::Whitespace(Whitespace::Space),
2940            Token::Number(String::from("1"), false),
2941            Token::Whitespace(Whitespace::Space),
2942            Token::make_keyword("LIMIT"),
2943            Token::Whitespace(Whitespace::Space),
2944            Token::Number(String::from("5"), false),
2945        ];
2946
2947        compare(expected, tokens);
2948    }
2949
2950    #[test]
2951    fn tokenize_explain_select() {
2952        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
2953        let dialect = GenericDialect {};
2954        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2955
2956        let expected = vec![
2957            Token::make_keyword("EXPLAIN"),
2958            Token::Whitespace(Whitespace::Space),
2959            Token::make_keyword("SELECT"),
2960            Token::Whitespace(Whitespace::Space),
2961            Token::Mul,
2962            Token::Whitespace(Whitespace::Space),
2963            Token::make_keyword("FROM"),
2964            Token::Whitespace(Whitespace::Space),
2965            Token::make_word("customer", None),
2966            Token::Whitespace(Whitespace::Space),
2967            Token::make_keyword("WHERE"),
2968            Token::Whitespace(Whitespace::Space),
2969            Token::make_word("id", None),
2970            Token::Whitespace(Whitespace::Space),
2971            Token::Eq,
2972            Token::Whitespace(Whitespace::Space),
2973            Token::Number(String::from("1"), false),
2974        ];
2975
2976        compare(expected, tokens);
2977    }
2978
2979    #[test]
2980    fn tokenize_explain_analyze_select() {
2981        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
2982        let dialect = GenericDialect {};
2983        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2984
2985        let expected = vec![
2986            Token::make_keyword("EXPLAIN"),
2987            Token::Whitespace(Whitespace::Space),
2988            Token::make_keyword("ANALYZE"),
2989            Token::Whitespace(Whitespace::Space),
2990            Token::make_keyword("SELECT"),
2991            Token::Whitespace(Whitespace::Space),
2992            Token::Mul,
2993            Token::Whitespace(Whitespace::Space),
2994            Token::make_keyword("FROM"),
2995            Token::Whitespace(Whitespace::Space),
2996            Token::make_word("customer", None),
2997            Token::Whitespace(Whitespace::Space),
2998            Token::make_keyword("WHERE"),
2999            Token::Whitespace(Whitespace::Space),
3000            Token::make_word("id", None),
3001            Token::Whitespace(Whitespace::Space),
3002            Token::Eq,
3003            Token::Whitespace(Whitespace::Space),
3004            Token::Number(String::from("1"), false),
3005        ];
3006
3007        compare(expected, tokens);
3008    }
3009
3010    #[test]
3011    fn tokenize_string_predicate() {
3012        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
3013        let dialect = GenericDialect {};
3014        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3015
3016        let expected = vec![
3017            Token::make_keyword("SELECT"),
3018            Token::Whitespace(Whitespace::Space),
3019            Token::Mul,
3020            Token::Whitespace(Whitespace::Space),
3021            Token::make_keyword("FROM"),
3022            Token::Whitespace(Whitespace::Space),
3023            Token::make_word("customer", None),
3024            Token::Whitespace(Whitespace::Space),
3025            Token::make_keyword("WHERE"),
3026            Token::Whitespace(Whitespace::Space),
3027            Token::make_word("salary", None),
3028            Token::Whitespace(Whitespace::Space),
3029            Token::Neq,
3030            Token::Whitespace(Whitespace::Space),
3031            Token::SingleQuotedString(String::from("Not Provided")),
3032        ];
3033
3034        compare(expected, tokens);
3035    }
3036
3037    #[test]
3038    fn tokenize_invalid_string() {
3039        let sql = String::from("\n💝مصطفىh");
3040
3041        let dialect = GenericDialect {};
3042        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3043        // println!("tokens: {:#?}", tokens);
3044        let expected = vec![
3045            Token::Whitespace(Whitespace::Newline),
3046            Token::Char('💝'),
3047            Token::make_word("مصطفىh", None),
3048        ];
3049        compare(expected, tokens);
3050    }
3051
3052    #[test]
3053    fn tokenize_newline_in_string_literal() {
3054        let sql = String::from("'foo\r\nbar\nbaz'");
3055
3056        let dialect = GenericDialect {};
3057        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3058        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
3059        compare(expected, tokens);
3060    }
3061
3062    #[test]
3063    fn tokenize_unterminated_string_literal() {
3064        let sql = String::from("select 'foo");
3065
3066        let dialect = GenericDialect {};
3067        let mut tokenizer = Tokenizer::new(&dialect, &sql);
3068        assert_eq!(
3069            tokenizer.tokenize(),
3070            Err(TokenizerError {
3071                message: "Unterminated string literal".to_string(),
3072                location: Location { line: 1, column: 8 },
3073            })
3074        );
3075    }
3076
3077    #[test]
3078    fn tokenize_unterminated_string_literal_utf8() {
3079        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");
3080
3081        let dialect = GenericDialect {};
3082        let mut tokenizer = Tokenizer::new(&dialect, &sql);
3083        assert_eq!(
3084            tokenizer.tokenize(),
3085            Err(TokenizerError {
3086                message: "Unterminated string literal".to_string(),
3087                location: Location {
3088                    line: 1,
3089                    column: 35
3090                }
3091            })
3092        );
3093    }
3094
3095    #[test]
3096    fn tokenize_invalid_string_cols() {
3097        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");
3098
3099        let dialect = GenericDialect {};
3100        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3101        // println!("tokens: {:#?}", tokens);
3102        let expected = vec![
3103            Token::Whitespace(Whitespace::Newline),
3104            Token::Whitespace(Whitespace::Newline),
3105            Token::make_keyword("SELECT"),
3106            Token::Whitespace(Whitespace::Space),
3107            Token::Mul,
3108            Token::Whitespace(Whitespace::Space),
3109            Token::make_keyword("FROM"),
3110            Token::Whitespace(Whitespace::Space),
3111            Token::make_keyword("table"),
3112            Token::Whitespace(Whitespace::Tab),
3113            Token::Char('💝'),
3114            Token::make_word("مصطفىh", None),
3115        ];
3116        compare(expected, tokens);
3117    }
3118
    #[test]
    fn tokenize_dollar_quoted_string_tagged() {
        // Tagged dollar quoting: `$tag$ ... $tag$`. The body may contain bare
        // `$`, other `$word$` sequences, and even `$$`; only the exact
        // matching closing tag terminates the string.
        let test_cases = vec![
            // Body containing `$`, `$tags ... this$`, and a trailing `$$` —
            // none of which close the `$tag$ ... $tag$` string.
            (
                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
                        tag: Some("tag".into()),
                    })
                ]
            ),
            // `$ab` is a proper prefix of the tag `abc` and must not close it.
            (
                String::from("SELECT $abc$x$ab$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "x$ab".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            // An immediately-closed tagged string has an empty value.
            (
                String::from("SELECT $abc$$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            // Digits directly adjacent to the delimiters lex as separate
            // Number tokens on either side of the string.
            (
                String::from("0$abc$$abc$1"),
                vec![
                    Token::Number("0".into(), false),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    }),
                    Token::Number("1".into(), false),
                ]
            ),
            // A different `$q$ ... $q$` pair inside the body is plain text.
            (
                String::from("$function$abc$q$data$q$$function$"),
                vec![
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "abc$q$data$q$".into(),
                        tag: Some("function".into()),
                    }),
                ]
            ),
        ];

        let dialect = GenericDialect {};
        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }
3183
3184    #[test]
3185    fn tokenize_dollar_quoted_string_tagged_unterminated() {
3186        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
3187        let dialect = GenericDialect {};
3188        assert_eq!(
3189            Tokenizer::new(&dialect, &sql).tokenize(),
3190            Err(TokenizerError {
3191                message: "Unterminated dollar-quoted, expected $".into(),
3192                location: Location {
3193                    line: 1,
3194                    column: 91
3195                }
3196            })
3197        );
3198    }
3199
3200    #[test]
3201    fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
3202        let sql = String::from("SELECT $abc$abc$");
3203        let dialect = GenericDialect {};
3204        assert_eq!(
3205            Tokenizer::new(&dialect, &sql).tokenize(),
3206            Err(TokenizerError {
3207                message: "Unterminated dollar-quoted, expected $".into(),
3208                location: Location {
3209                    line: 1,
3210                    column: 17
3211                }
3212            })
3213        );
3214    }
3215
3216    #[test]
3217    fn tokenize_dollar_placeholder() {
3218        let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
3219        let dialect = SQLiteDialect {};
3220        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3221        assert_eq!(
3222            tokens,
3223            vec![
3224                Token::make_keyword("SELECT"),
3225                Token::Whitespace(Whitespace::Space),
3226                Token::Placeholder("$$".into()),
3227                Token::Comma,
3228                Token::Whitespace(Whitespace::Space),
3229                Token::Placeholder("$$ABC$$".into()),
3230                Token::Comma,
3231                Token::Whitespace(Whitespace::Space),
3232                Token::Placeholder("$ABC$".into()),
3233                Token::Comma,
3234                Token::Whitespace(Whitespace::Space),
3235                Token::Placeholder("$ABC".into()),
3236            ]
3237        );
3238    }
3239
3240    #[test]
3241    fn tokenize_nested_dollar_quoted_strings() {
3242        let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
3243        let dialect = GenericDialect {};
3244        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3245        let expected = vec![
3246            Token::make_keyword("SELECT"),
3247            Token::Whitespace(Whitespace::Space),
3248            Token::DollarQuotedString(DollarQuotedString {
3249                value: "dollar $nested$ string".into(),
3250                tag: Some("tag".into()),
3251            }),
3252        ];
3253        compare(expected, tokens);
3254    }
3255
3256    #[test]
3257    fn tokenize_dollar_quoted_string_untagged_empty() {
3258        let sql = String::from("SELECT $$$$");
3259        let dialect = GenericDialect {};
3260        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3261        let expected = vec![
3262            Token::make_keyword("SELECT"),
3263            Token::Whitespace(Whitespace::Space),
3264            Token::DollarQuotedString(DollarQuotedString {
3265                value: "".into(),
3266                tag: None,
3267            }),
3268        ];
3269        compare(expected, tokens);
3270    }
3271
3272    #[test]
3273    fn tokenize_dollar_quoted_string_untagged() {
3274        let sql =
3275            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
3276        let dialect = GenericDialect {};
3277        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3278        let expected = vec![
3279            Token::make_keyword("SELECT"),
3280            Token::Whitespace(Whitespace::Space),
3281            Token::DollarQuotedString(DollarQuotedString {
3282                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
3283                tag: None,
3284            }),
3285        ];
3286        compare(expected, tokens);
3287    }
3288
3289    #[test]
3290    fn tokenize_dollar_quoted_string_untagged_unterminated() {
3291        let sql = String::from(
3292            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
3293        );
3294        let dialect = GenericDialect {};
3295        assert_eq!(
3296            Tokenizer::new(&dialect, &sql).tokenize(),
3297            Err(TokenizerError {
3298                message: "Unterminated dollar-quoted string".into(),
3299                location: Location {
3300                    line: 1,
3301                    column: 86
3302                }
3303            })
3304        );
3305    }
3306
3307    #[test]
3308    fn tokenize_right_arrow() {
3309        let sql = String::from("FUNCTION(key=>value)");
3310        let dialect = GenericDialect {};
3311        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3312        let expected = vec![
3313            Token::make_word("FUNCTION", None),
3314            Token::LParen,
3315            Token::make_word("key", None),
3316            Token::RArrow,
3317            Token::make_word("value", None),
3318            Token::RParen,
3319        ];
3320        compare(expected, tokens);
3321    }
3322
3323    #[test]
3324    fn tokenize_is_null() {
3325        let sql = String::from("a IS NULL");
3326        let dialect = GenericDialect {};
3327        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3328
3329        let expected = vec![
3330            Token::make_word("a", None),
3331            Token::Whitespace(Whitespace::Space),
3332            Token::make_keyword("IS"),
3333            Token::Whitespace(Whitespace::Space),
3334            Token::make_keyword("NULL"),
3335        ];
3336
3337        compare(expected, tokens);
3338    }
3339
3340    #[test]
3341    fn tokenize_comment() {
3342        let test_cases = vec![
3343            (
3344                String::from("0--this is a comment\n1"),
3345                vec![
3346                    Token::Number("0".to_string(), false),
3347                    Token::Whitespace(Whitespace::SingleLineComment {
3348                        prefix: "--".to_string(),
3349                        comment: "this is a comment\n".to_string(),
3350                    }),
3351                    Token::Number("1".to_string(), false),
3352                ],
3353            ),
3354            (
3355                String::from("0--this is a comment\r1"),
3356                vec![
3357                    Token::Number("0".to_string(), false),
3358                    Token::Whitespace(Whitespace::SingleLineComment {
3359                        prefix: "--".to_string(),
3360                        comment: "this is a comment\r1".to_string(),
3361                    }),
3362                ],
3363            ),
3364            (
3365                String::from("0--this is a comment\r\n1"),
3366                vec![
3367                    Token::Number("0".to_string(), false),
3368                    Token::Whitespace(Whitespace::SingleLineComment {
3369                        prefix: "--".to_string(),
3370                        comment: "this is a comment\r\n".to_string(),
3371                    }),
3372                    Token::Number("1".to_string(), false),
3373                ],
3374            ),
3375        ];
3376
3377        let dialect = GenericDialect {};
3378
3379        for (sql, expected) in test_cases {
3380            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3381            compare(expected, tokens);
3382        }
3383    }
3384
3385    #[test]
3386    fn tokenize_comment_postgres() {
3387        let sql = String::from("1--\r0");
3388
3389        let dialect = PostgreSqlDialect {};
3390        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3391        let expected = vec![
3392            Token::Number("1".to_string(), false),
3393            Token::Whitespace(Whitespace::SingleLineComment {
3394                prefix: "--".to_string(),
3395                comment: "\r".to_string(),
3396            }),
3397            Token::Number("0".to_string(), false),
3398        ];
3399        compare(expected, tokens);
3400    }
3401
3402    #[test]
3403    fn tokenize_comment_at_eof() {
3404        let sql = String::from("--this is a comment");
3405
3406        let dialect = GenericDialect {};
3407        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3408        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
3409            prefix: "--".to_string(),
3410            comment: "this is a comment".to_string(),
3411        })];
3412        compare(expected, tokens);
3413    }
3414
3415    #[test]
3416    fn tokenize_multiline_comment() {
3417        let sql = String::from("0/*multi-line\n* /comment*/1");
3418
3419        let dialect = GenericDialect {};
3420        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3421        let expected = vec![
3422            Token::Number("0".to_string(), false),
3423            Token::Whitespace(Whitespace::MultiLineComment(
3424                "multi-line\n* /comment".to_string(),
3425            )),
3426            Token::Number("1".to_string(), false),
3427        ];
3428        compare(expected, tokens);
3429    }
3430
    #[test]
    fn tokenize_nested_multiline_comment() {
        // For dialects supporting nested comments, `/*` and `*/` pair up
        // recursively: the comment only ends once every opened `/*` has been
        // matched by a `*/`.
        //
        // Case 1: the nesting balances before the trailing ` */ /comment*/`,
        // so that remainder is lexed as normal operators and a word.
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
                )),
                Token::Whitespace(Whitespace::Space),
                Token::Div,
                Token::Word(Word {
                    value: "comment".to_string(),
                    quote_style: None,
                    keyword: Keyword::COMMENT,
                }),
                Token::Mul,
                Token::Div,
                Token::Number("1".to_string(), false),
            ],
        );

        // Case 2: same input rearranged so the final `*/` is what balances
        // the outermost `/*` — the whole span is one comment token.
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
                )),
                Token::Number("1".to_string(), false),
            ],
        );

        // Case 3: a simple one-level nesting; the inner `/* b */` stays part
        // of the outer comment's text.
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "SELECT 1/* a /* b */ c */0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
                Token::Number("0".to_string(), false),
            ],
        );
    }
3475
3476    #[test]
3477    fn tokenize_nested_multiline_comment_empty() {
3478        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
3479            "select 1/*/**/*/0",
3480            vec![
3481                Token::make_keyword("select"),
3482                Token::Whitespace(Whitespace::Space),
3483                Token::Number("1".to_string(), false),
3484                Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
3485                Token::Number("0".to_string(), false),
3486            ],
3487        );
3488    }
3489
3490    #[test]
3491    fn tokenize_nested_comments_if_not_supported() {
3492        all_dialects_except(|d| d.supports_nested_comments()).tokenizes_to(
3493            "SELECT 1/*/* nested comment */*/0",
3494            vec![
3495                Token::make_keyword("SELECT"),
3496                Token::Whitespace(Whitespace::Space),
3497                Token::Number("1".to_string(), false),
3498                Token::Whitespace(Whitespace::MultiLineComment(
3499                    "/* nested comment ".to_string(),
3500                )),
3501                Token::Mul,
3502                Token::Div,
3503                Token::Number("0".to_string(), false),
3504            ],
3505        );
3506    }
3507
3508    #[test]
3509    fn tokenize_multiline_comment_with_even_asterisks() {
3510        let sql = String::from("\n/** Comment **/\n");
3511
3512        let dialect = GenericDialect {};
3513        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3514        let expected = vec![
3515            Token::Whitespace(Whitespace::Newline),
3516            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
3517            Token::Whitespace(Whitespace::Newline),
3518        ];
3519        compare(expected, tokens);
3520    }
3521
3522    #[test]
3523    fn tokenize_unicode_whitespace() {
3524        let sql = String::from(" \u{2003}\n");
3525
3526        let dialect = GenericDialect {};
3527        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3528        let expected = vec![
3529            Token::Whitespace(Whitespace::Space),
3530            Token::Whitespace(Whitespace::Space),
3531            Token::Whitespace(Whitespace::Newline),
3532        ];
3533        compare(expected, tokens);
3534    }
3535
3536    #[test]
3537    fn tokenize_mismatched_quotes() {
3538        let sql = String::from("\"foo");
3539
3540        let dialect = GenericDialect {};
3541        let mut tokenizer = Tokenizer::new(&dialect, &sql);
3542        assert_eq!(
3543            tokenizer.tokenize(),
3544            Err(TokenizerError {
3545                message: "Expected close delimiter '\"' before EOF.".to_string(),
3546                location: Location { line: 1, column: 1 },
3547            })
3548        );
3549    }
3550
3551    #[test]
3552    fn tokenize_newlines() {
3553        let sql = String::from("line1\nline2\rline3\r\nline4\r");
3554
3555        let dialect = GenericDialect {};
3556        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3557        let expected = vec![
3558            Token::make_word("line1", None),
3559            Token::Whitespace(Whitespace::Newline),
3560            Token::make_word("line2", None),
3561            Token::Whitespace(Whitespace::Newline),
3562            Token::make_word("line3", None),
3563            Token::Whitespace(Whitespace::Newline),
3564            Token::make_word("line4", None),
3565            Token::Whitespace(Whitespace::Newline),
3566        ];
3567        compare(expected, tokens);
3568    }
3569
3570    #[test]
3571    fn tokenize_mssql_top() {
3572        let sql = "SELECT TOP 5 [bar] FROM foo";
3573        let dialect = MsSqlDialect {};
3574        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3575        let expected = vec![
3576            Token::make_keyword("SELECT"),
3577            Token::Whitespace(Whitespace::Space),
3578            Token::make_keyword("TOP"),
3579            Token::Whitespace(Whitespace::Space),
3580            Token::Number(String::from("5"), false),
3581            Token::Whitespace(Whitespace::Space),
3582            Token::make_word("bar", Some('[')),
3583            Token::Whitespace(Whitespace::Space),
3584            Token::make_keyword("FROM"),
3585            Token::Whitespace(Whitespace::Space),
3586            Token::make_word("foo", None),
3587        ];
3588        compare(expected, tokens);
3589    }
3590
3591    #[test]
3592    fn tokenize_pg_regex_match() {
3593        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
3594        let dialect = GenericDialect {};
3595        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3596        let expected = vec![
3597            Token::make_keyword("SELECT"),
3598            Token::Whitespace(Whitespace::Space),
3599            Token::make_word("col", None),
3600            Token::Whitespace(Whitespace::Space),
3601            Token::Tilde,
3602            Token::Whitespace(Whitespace::Space),
3603            Token::SingleQuotedString("^a".into()),
3604            Token::Comma,
3605            Token::Whitespace(Whitespace::Space),
3606            Token::make_word("col", None),
3607            Token::Whitespace(Whitespace::Space),
3608            Token::TildeAsterisk,
3609            Token::Whitespace(Whitespace::Space),
3610            Token::SingleQuotedString("^a".into()),
3611            Token::Comma,
3612            Token::Whitespace(Whitespace::Space),
3613            Token::make_word("col", None),
3614            Token::Whitespace(Whitespace::Space),
3615            Token::ExclamationMarkTilde,
3616            Token::Whitespace(Whitespace::Space),
3617            Token::SingleQuotedString("^a".into()),
3618            Token::Comma,
3619            Token::Whitespace(Whitespace::Space),
3620            Token::make_word("col", None),
3621            Token::Whitespace(Whitespace::Space),
3622            Token::ExclamationMarkTildeAsterisk,
3623            Token::Whitespace(Whitespace::Space),
3624            Token::SingleQuotedString("^a".into()),
3625        ];
3626        compare(expected, tokens);
3627    }
3628
    #[test]
    fn tokenize_pg_like_match() {
        // PostgreSQL operator spellings for pattern matching:
        // ~~ = LIKE, ~~* = ILIKE, !~~ = NOT LIKE, !~~* = NOT ILIKE.
        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
        ];
        compare(expected, tokens);
    }
3666
    #[test]
    fn tokenize_quoted_identifier() {
        // With unescaping enabled (the default), a doubled double-quote
        // inside a delimited identifier collapses to a single `"` character
        // in the resulting word value.
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }
3683
3684    #[test]
3685    fn tokenize_snowflake_div() {
3686        let sql = r#"field/1000"#;
3687        let dialect = SnowflakeDialect {};
3688        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3689        let expected = vec![
3690            Token::make_word(r#"field"#, None),
3691            Token::Div,
3692            Token::Number("1000".to_string(), false),
3693        ];
3694        compare(expected, tokens);
3695    }
3696
    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        // With unescaping disabled, the doubled double-quotes are preserved
        // verbatim in the word value instead of collapsing to one `"`.
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }
3716
    #[test]
    fn tokenize_with_location() {
        // Spans are 1-based (line, column) pairs; the end location is one
        // past the token's last character, and the `\n` token's span ends
        // at the start of line 2.
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (1, 7).into(),
                (1, 8).into(),
            ),
            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Newline),
                (1, 10).into(),
                (2, 1).into(),
            ),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (2, 1).into(),
                (2, 2).into(),
            ),
            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
        ];
        compare(expected, tokens);
    }
3747
3748    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
3749        //println!("------------------------------");
3750        //println!("tokens   = {:?}", actual);
3751        //println!("expected = {:?}", expected);
3752        //println!("------------------------------");
3753        assert_eq!(expected, actual);
3754    }
3755
3756    fn check_unescape(s: &str, expected: Option<&str>) {
3757        let s = format!("'{s}'");
3758        let mut state = State {
3759            peekable: s.chars().peekable(),
3760            line: 0,
3761            col: 0,
3762        };
3763
3764        assert_eq!(
3765            unescape_single_quoted_string(&mut state),
3766            expected.map(|s| s.to_string())
3767        );
3768    }
3769
    #[test]
    fn test_unescape() {
        // Exercises `unescape_single_quoted_string` via `check_unescape`;
        // `None` means the whole literal is rejected as invalid.
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

        // 16 and 32-bit hexadecimal Unicode character value
        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None);
        check_unescape(r"\u0000", None);
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
        check_unescape(r"\U00110000", None);
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

        // hexadecimal byte value
        // NOTE(review): \xA9 and \xCAD are rejected below — hex escapes
        // appear to be limited to values below 0x80; confirm against the
        // implementation of unescape_single_quoted_string.
        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}"));
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x"));
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

        // octal byte value
        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}"));
        check_unescape(r"\1232", Some("\u{0053}2"));
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None);
        check_unescape(r"\080", None);

        // others
        check_unescape(r"\9", Some("9"));
        check_unescape(r"''", Some("'"));
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }
3825
    #[test]
    fn tokenize_numeric_prefix_trait() {
        // A minimal dialect whose identifiers may start with a digit,
        // mirroring the numeric-prefix behavior of Hive and MySQL.
        #[derive(Debug)]
        struct NumericPrefixDialect;

        impl Dialect for NumericPrefixDialect {
            fn is_identifier_start(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '$'
            }

            fn is_identifier_part(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '_'
                    || ch == '$'
                    || ch == '{'
                    || ch == '}'
            }

            fn supports_numeric_prefix(&self) -> bool {
                true
            }
        }

        // The same expectations must hold for all three dialects.
        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
        tokenize_numeric_prefix_inner(&HiveDialect {});
        tokenize_numeric_prefix_inner(&MySqlDialect {});
    }
3858
3859    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
3860        let sql = r#"SELECT * FROM 1"#;
3861        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
3862        let expected = vec![
3863            Token::make_keyword("SELECT"),
3864            Token::Whitespace(Whitespace::Space),
3865            Token::Mul,
3866            Token::Whitespace(Whitespace::Space),
3867            Token::make_keyword("FROM"),
3868            Token::Whitespace(Whitespace::Space),
3869            Token::Number(String::from("1"), false),
3870        ];
3871        compare(expected, tokens);
3872    }
3873
    #[test]
    fn tokenize_quoted_string_escape() {
        // Snowflake supports backslash escapes inside single-quoted strings.
        // Each case lists (input SQL, raw token value, unescaped value).
        let dialect = SnowflakeDialect {};
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
            (r#"'\\'"#, r#"\\"#, r#"\"#),
            (
                r#"'\0\a\b\f\n\r\t\Z'"#,
                r#"\0\a\b\f\n\r\t\Z"#,
                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
            ),
            (r#"'\"'"#, r#"\""#, "\""),
            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
            (r#"'\q'"#, r#"\q"#, r#"q"#),
            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
        ] {
            // with_unescape(false) keeps the escape sequences verbatim.
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected.to_string())];
            compare(expected, tokens);

            // with_unescape(true) resolves them.
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

        // A trailing backslash escapes the closing quote, leaving the
        // literal unterminated.
        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }

        // Non-escape dialect: the same inputs are complete literals whose
        // backslash is an ordinary character.
        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
            let dialect = GenericDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }

        // MySQL special case for LIKE escapes: \% and \_ stay escaped.
        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
            let dialect = MySqlDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }
    }
3937
    #[test]
    fn tokenize_triple_quoted_string() {
        // BigQuery triple-quoted strings ('''…''' and """…""").
        // `check` runs the same cases with `q` as the quote under test and
        // `r` as the other quote character, so both variants share fixtures.
        fn check<F>(
            q: char, // The quote character to test
            r: char, // An alternate quote character.
            quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            for (sql, expected, expected_unescaped) in [
                // Empty string
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                // Should not count escaped quote as end of string.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                // Simple string
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                // Mix single-double quotes unescaped.
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                // Escaped quote.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                // backslash-escaped quote characters.
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                // backslash-escaped characters
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            // Truncated or escaped-away closers must fail with an
            // unterminated-literal error.
            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        // Two adjacent empty strings must not be mistaken for one
        // triple-quoted opener.
        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Non-triple quoted string dialect
        let dialect = SnowflakeDialect {};
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }
4057
4058    #[test]
4059    fn test_mysql_users_grantees() {
4060        let dialect = MySqlDialect {};
4061
4062        let sql = "CREATE USER `root`@`%`";
4063        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
4064        let expected = vec![
4065            Token::make_keyword("CREATE"),
4066            Token::Whitespace(Whitespace::Space),
4067            Token::make_keyword("USER"),
4068            Token::Whitespace(Whitespace::Space),
4069            Token::make_word("root", Some('`')),
4070            Token::AtSign,
4071            Token::make_word("%", Some('`')),
4072        ];
4073        compare(expected, tokens);
4074    }
4075
    #[test]
    fn test_postgres_abs_without_space_and_string_literal() {
        // `@` directly followed by a string literal splits into AtSign +
        // string token.
        // NOTE(review): despite the `postgres` name, this test uses
        // MySqlDialect — confirm whether PostgreSqlDialect was intended.
        let dialect = MySqlDialect {};

        let sql = "SELECT @'1'";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::SingleQuotedString("1".to_string()),
        ];
        compare(expected, tokens);
    }
4090
    #[test]
    fn test_postgres_abs_without_space_and_quoted_column() {
        // `@` directly followed by a double-quoted string splits into
        // AtSign + DoubleQuotedString.
        // NOTE(review): despite the `postgres` name, this test uses
        // MySqlDialect — confirm whether PostgreSqlDialect was intended.
        let dialect = MySqlDialect {};

        let sql = r#"SELECT @"bar" FROM foo"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::DoubleQuotedString("bar".to_string()),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }
4109
    #[test]
    fn test_national_strings_backslash_escape_not_supported() {
        // Without backslash-escape support, `\` is an ordinary character
        // inside N'...' literals: n'''''\' is the string `''\`.
        all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\'",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("''\\".to_string()),
                ],
            );
    }
4122
    #[test]
    fn test_national_strings_backslash_escape_supported() {
        // With backslash-escape support, both '' and \' denote a single
        // quote: n'''''\'' is the string `'''`.
        all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\''",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("'''".to_string()),
                ],
            );
    }
4135
    #[test]
    fn test_string_escape_constant_not_supported() {
        // Without e'...' escape-string support, the `e`/`E` prefix is just
        // an identifier followed by an ordinary string literal.
        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("e", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );

        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("E", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );
    }
4158
    #[test]
    fn test_string_escape_constant_supported() {
        // With e'...' support, the prefix yields an EscapedStringLiteral
        // and \' unescapes to a quote; both `e` and `E` are accepted.
        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );
    }
4179
    #[test]
    fn test_whitespace_required_after_single_line_comment() {
        // When the dialect requires whitespace after `--`, a `--` that is
        // immediately followed by non-whitespace is NOT a comment: it
        // tokenizes as two minus signs.
        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                    Token::SingleQuotedString("abc".to_string()),
                ],
            );

        // A space after `--` makes it a comment as usual.
        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        // `--` at end of input (no whitespace, no content) is two minuses.
        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                ],
            );

        // A newline immediately after `--` still counts as whitespace,
        // so each line here is a comment.
        all_dialects_where(|d| d.requires_single_line_comment_whitespace()).tokenizes_to(
            "--\n-- Table structure for table...\n--\n",
            vec![
                Token::Whitespace(Whitespace::SingleLineComment {
                    prefix: "--".to_string(),
                    comment: "\n".to_string(),
                }),
                Token::Whitespace(Whitespace::SingleLineComment {
                    prefix: "--".to_string(),
                    comment: " Table structure for table...\n".to_string(),
                }),
                Token::Whitespace(Whitespace::SingleLineComment {
                    prefix: "--".to_string(),
                    comment: "\n".to_string(),
                }),
            ],
        );
    }
4236
    #[test]
    fn test_whitespace_not_required_after_single_line_comment() {
        // When whitespace after `--` is NOT required, `--'abc'` is a
        // comment containing 'abc' (contrast with the `required` test).
        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        // `--` at end of input is an empty comment.
        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "".to_string(),
                    }),
                ],
            );
    }
4278
    #[test]
    fn test_tokenize_identifiers_numeric_prefix() {
        // With supports_numeric_prefix(): `123abc` is an identifier, bare
        // `12e34` is still a number (scientific notation), but after a
        // period (`t.12e34`, `t.1two3`) it becomes an identifier.
        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        );
    }
4305
    #[test]
    fn tokenize_period_underscore() {
        // `table._col` splits into word / period / word, while a period
        // followed directly by an underscore-leading token (`._123`,
        // `._abc`) with nothing before it must fail to tokenize.
        let sql = String::from("SELECT table._col");
        // a dialect that supports underscores in numeric literals
        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "table".to_string(),
                quote_style: None,
                keyword: Keyword::TABLE,
            }),
            Token::Period,
            Token::Word(Word {
                value: "_col".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
        ];

        compare(expected, tokens);

        let sql = String::from("SELECT ._123");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }

        let sql = String::from("SELECT ._abc");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }
    }
4341
4342    #[test]
4343    fn tokenize_question_mark() {
4344        let dialect = PostgreSqlDialect {};
4345        let sql = "SELECT x ? y";
4346        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
4347        compare(
4348            tokens,
4349            vec![
4350                Token::make_keyword("SELECT"),
4351                Token::Whitespace(Whitespace::Space),
4352                Token::make_word("x", None),
4353                Token::Whitespace(Whitespace::Space),
4354                Token::Question,
4355                Token::Whitespace(Whitespace::Space),
4356                Token::make_word("y", None),
4357            ],
4358        );
4359    }
4360
    #[test]
    fn tokenize_multiline_comment_with_comment_hint() {
        // MySQL executable comment `/*! ... */`: its contents are tokenized
        // as regular SQL (the markers themselves become whitespace) rather
        // than being skipped as a comment.
        let sql = String::from("0/*! word */1");

        let dialect = MySqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "word".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::Whitespace(Whitespace::Space),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }
4380
    #[test]
    fn tokenize_multiline_comment_with_comment_hint_and_version() {
        // MySQL versioned executable comment `/*!50110 ... */`: the version
        // number after `/*!` is consumed and the remaining contents are
        // tokenized as regular SQL.
        let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1");
        let dialect = MySqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "KEY_BLOCK_SIZE".to_string(),
                quote_style: None,
                keyword: Keyword::KEY_BLOCK_SIZE,
            }),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number("1024".to_string(), false),
            Token::Whitespace(Whitespace::Space),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);

        // An empty versioned hint yields only whitespace tokens.
        let tokens = Tokenizer::new(&dialect, "0 /*!50110 */ 1")
            .tokenize()
            .unwrap();
        compare(
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
            ],
            tokens,
        );

        // A hint with no version and no content collapses entirely.
        let tokens = Tokenizer::new(&dialect, "0 /*!*/ 1").tokenize().unwrap();
        compare(
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
            ],
            tokens,
        );
        // Interior spaces inside the hint are preserved as whitespace.
        let tokens = Tokenizer::new(&dialect, "0 /*!   */ 1").tokenize().unwrap();
        compare(
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
            ],
            tokens,
        );
    }
4442
4443    #[test]
4444    fn tokenize_lt() {
4445        all_dialects().tokenizes_to(
4446            "select a <-50",
4447            vec![
4448                Token::make_keyword("select"),
4449                Token::Whitespace(Whitespace::Space),
4450                Token::make_word("a", None),
4451                Token::Whitespace(Whitespace::Space),
4452                Token::Lt,
4453                Token::Minus,
4454                Token::Number("50".to_string(), false),
4455            ],
4456        );
4457        all_dialects().tokenizes_to(
4458            "select a <+50",
4459            vec![
4460                Token::make_keyword("select"),
4461                Token::Whitespace(Whitespace::Space),
4462                Token::make_word("a", None),
4463                Token::Whitespace(Whitespace::Space),
4464                Token::Lt,
4465                Token::Plus,
4466                Token::Number("50".to_string(), false),
4467            ],
4468        );
4469        all_dialects().tokenizes_to(
4470            "select a <=-50",
4471            vec![
4472                Token::make_keyword("select"),
4473                Token::Whitespace(Whitespace::Space),
4474                Token::make_word("a", None),
4475                Token::Whitespace(Whitespace::Space),
4476                Token::LtEq,
4477                Token::Minus,
4478                Token::Number("50".to_string(), false),
4479            ],
4480        );
4481        all_dialects().tokenizes_to(
4482            "select a <=+50",
4483            vec![
4484                Token::make_keyword("select"),
4485                Token::Whitespace(Whitespace::Space),
4486                Token::make_word("a", None),
4487                Token::Whitespace(Whitespace::Space),
4488                Token::LtEq,
4489                Token::Plus,
4490                Token::Number("50".to_string(), false),
4491            ],
4492        );
4493        all_dialects_where(|d| d.supports_geometric_types()).tokenizes_to(
4494            "select a <->b",
4495            vec![
4496                Token::make_keyword("select"),
4497                Token::Whitespace(Whitespace::Space),
4498                Token::make_word("a", None),
4499                Token::Whitespace(Whitespace::Space),
4500                Token::TwoWayArrow,
4501                Token::make_word("b", None),
4502            ],
4503        );
4504
4505        all_dialects().tokenizes_to(
4506            "select a <-b",
4507            vec![
4508                Token::make_keyword("select"),
4509                Token::Whitespace(Whitespace::Space),
4510                Token::make_word("a", None),
4511                Token::Whitespace(Whitespace::Space),
4512                Token::Lt,
4513                Token::Minus,
4514                Token::make_word("b", None),
4515            ],
4516        );
4517        all_dialects().tokenizes_to(
4518            "select a <+b",
4519            vec![
4520                Token::make_keyword("select"),
4521                Token::Whitespace(Whitespace::Space),
4522                Token::make_word("a", None),
4523                Token::Whitespace(Whitespace::Space),
4524                Token::Lt,
4525                Token::Plus,
4526                Token::make_word("b", None),
4527            ],
4528        );
4529    }
4530}