// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! SQL Tokenizer
//!
//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
//!
//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
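//!
//! A minimal end-to-end sketch, mirroring the doctest on [`Tokenizer::new`]
//! below (the `GenericDialect` is just one convenient choice of dialect):
//!
//! ```
//! # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
//! # use sqlparser::dialect::GenericDialect;
//! let dialect = GenericDialect {};
//! let tokens = Tokenizer::new(&dialect, "SELECT 'foo'").tokenize().unwrap();
//! assert_eq!(tokens, vec![
//!     Token::make_word("SELECT", None),
//!     Token::Whitespace(Whitespace::Space),
//!     Token::SingleQuotedString("foo".to_string()),
//! ]);
//! ```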

#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::num::NonZeroU8;
use core::str::Chars;
use core::{cmp, fmt};
use core::{iter::Peekable, str};

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};

use crate::dialect::Dialect;
use crate::dialect::{
    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
    SnowflakeDialect,
};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use crate::{
    ast::{DollarQuotedString, QuoteDelimitedString},
    dialect::HiveDialect,
};

/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted strings: Example '''abc'''
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedString(String),
    /// Triple double quoted strings: Example """abc"""
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string' (note that some backends, such as
    /// PostgreSQL, may treat this syntax as a bit string literal instead, i.e: b'10010101')
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedByteStringLiteral(String),
    /// Single quoted literal with raw string prefix. Example `R'abc'`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    SingleQuotedRawStringLiteral(String),
    /// Double quoted literal with raw string prefix. Example `R"abc"`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedRawStringLiteral(String),
102    /// "National" string literal: i.e: N'string'
103    NationalStringLiteral(String),
104    /// Quote delimited literal. Examples `Q'{ab'c}'`, `Q'|ab'c|'`, `Q'|ab|c|'`
105    /// [Oracle](https://docs.oracle.com/en/database/oracle/oracle-database/21/sqlrf/Literals.html#GUID-1824CBAA-6E16-4921-B2A6-112FB02248DA)
106    QuoteDelimitedStringLiteral(QuoteDelimitedString),
107    /// "Nationa" quote delimited literal. Examples `NQ'{ab'c}'`, `NQ'|ab'c|'`, `NQ'|ab|c|'`
108    /// [Oracle](https://docs.oracle.com/en/database/oracle/oracle-database/21/sqlrf/Literals.html#GUID-1824CBAA-6E16-4921-B2A6-112FB02248DA)
109    NationalQuoteDelimitedStringLiteral(QuoteDelimitedString),
110    /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
111    EscapedStringLiteral(String),
112    /// Unicode string literal: i.e: U&'first \000A second'
113    UnicodeStringLiteral(String),
114    /// Hexadecimal string literal: i.e.: X'deadbeef'
115    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Spaceship operator `<=>`
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Integer division operator `//` in DuckDB
    DuckIntDiv,
    /// Modulo Operator `%`
    Mod,
    /// String concatenation `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Assignment `:=` (used for keyword arguments in DuckDB macros and some functions, and for variable declarations in DuckDB and Snowflake)
    Assignment,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
    /// Sharp `#` used for PostgreSQL Bitwise XOR operator, also PostgreSQL/Redshift geometrical unary/binary operator (Number of points in path or polygon/Intersection)
    Sharp,
    /// `##` PostgreSQL/Redshift geometrical binary operator (Point of closest proximity)
    DoubleSharp,
    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
    Tilde,
    /// `~*`, a case insensitive match regular expression operator in PostgreSQL
    TildeAsterisk,
    /// `!~`, a case sensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*`, a case insensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `~~`, a case sensitive match pattern operator in PostgreSQL
    DoubleTilde,
    /// `~~*`, a case insensitive match pattern operator in PostgreSQL
    DoubleTildeAsterisk,
    /// `!~~`, a case sensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTilde,
    /// `!~~*`, a case insensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTildeAsterisk,
    /// `<<`, a bitwise shift left operator in PostgreSQL
    ShiftLeft,
    /// `>>`, a bitwise shift right operator in PostgreSQL
    ShiftRight,
    /// `&&`, an overlap operator in PostgreSQL
    Overlap,
    /// Exclamation Mark `!` used for PostgreSQL factorial operator
    ExclamationMark,
    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// AtSign `@` used for PostgreSQL abs operator, also PostgreSQL/Redshift geometrical unary/binary operator (Center, Contained or on)
    AtSign,
    /// `^@`, a "starts with" string operator in PostgreSQL
    CaretAt,
    /// `|/`, a square root math operator in PostgreSQL
    PGSquareRoot,
    /// `||/`, a cube root math operator in PostgreSQL
    PGCubeRoot,
    /// `?` or `$`, a prepared statement arg placeholder
    Placeholder(String),
    /// `->`, used as an operator to extract json field in PostgreSQL
    Arrow,
    /// `->>`, used as an operator to extract json field as text in PostgreSQL
    LongArrow,
    /// `#>`, extracts JSON sub-object at the specified path
    HashArrow,
    /// `@-@` PostgreSQL/Redshift geometrical unary operator (Length or circumference)
    AtDashAt,
    /// `?-` PostgreSQL/Redshift geometrical unary/binary operator (Is horizontal?/Are horizontally aligned?)
    QuestionMarkDash,
    /// `&<` PostgreSQL/Redshift geometrical binary operator (Overlaps to left?)
    AmpersandLeftAngleBracket,
    /// `&>` PostgreSQL/Redshift geometrical binary operator (Overlaps to right?)
    AmpersandRightAngleBracket,
    /// `&<|` PostgreSQL/Redshift geometrical binary operator (Does not extend above?)
    AmpersandLeftAngleBracketVerticalBar,
    /// `|&>` PostgreSQL/Redshift geometrical binary operator (Does not extend below?)
    VerticalBarAmpersandRightAngleBracket,
    /// `<->` PostgreSQL/Redshift geometrical binary operator (Distance between)
    TwoWayArrow,
    /// `<^` PostgreSQL/Redshift geometrical binary operator (Is below?)
    LeftAngleBracketCaret,
    /// `>^` PostgreSQL/Redshift geometrical binary operator (Is above?)
    RightAngleBracketCaret,
    /// `?#` PostgreSQL/Redshift geometrical binary operator (Intersects or overlaps)
    QuestionMarkSharp,
    /// `?-|` PostgreSQL/Redshift geometrical binary operator (Is perpendicular?)
    QuestionMarkDashVerticalBar,
    /// `?||` PostgreSQL/Redshift geometrical binary operator (Are parallel?)
    QuestionMarkDoubleVerticalBar,
    /// `~=` PostgreSQL/Redshift geometrical binary operator (Same as)
    TildeEqual,
    /// `<<|` PostgreSQL/Redshift geometrical binary operator (Is strictly below?)
    ShiftLeftVerticalBar,
    /// `|>>` PostgreSQL/Redshift geometrical binary operator (Is strictly above?)
    VerticalBarShiftRight,
    /// `|>` BigQuery pipe operator
    VerticalBarRightAngleBracket,
    /// `#>>`, extracts JSON sub-object at the specified path as text
    HashLongArrow,
    /// jsonb @> jsonb -> boolean: Test whether left json contains the right json
    AtArrow,
    /// jsonb <@ jsonb -> boolean: Test whether right json contains the left json
    ArrowAt,
    /// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified
    /// path, where path elements can be either field keys or array indexes.
    HashMinus,
    /// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified
    /// JSON value?
    AtQuestion,
    /// jsonb @@ jsonpath → boolean: Returns the result of a JSON path predicate check
    /// for the specified JSON value. Only the first item of the result is taken into
    /// account. If the result is not Boolean, then NULL is returned.
    AtAt,
    /// jsonb ? text -> boolean: Checks whether the string exists as a top-level key within the
    /// jsonb object
    Question,
    /// jsonb ?& text[] -> boolean: Check whether all members of the text array exist as top-level
    /// keys within the jsonb object
    QuestionAnd,
    /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as a
    /// top-level key within the jsonb object
    QuestionPipe,
    /// Custom binary operator
    /// This is used to represent any custom binary operator that is not part of the SQL standard.
    /// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
    CustomBinaryOperator(String),
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::QuoteDelimitedStringLiteral(ref s) => s.fmt(f),
            Token::NationalQuoteDelimitedStringLiteral(ref s) => write!(f, "N{s}"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::Assignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::DoubleSharp => f.write_str("##"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::AtDashAt => f.write_str("@-@"),
            Token::QuestionMarkDash => f.write_str("?-"),
            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
            Token::AmpersandRightAngleBracket => f.write_str("&>"),
            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
            Token::VerticalBarRightAngleBracket => f.write_str("|>"),
            Token::TwoWayArrow => f.write_str("<->"),
            Token::LeftAngleBracketCaret => f.write_str("<^"),
            Token::RightAngleBracketCaret => f.write_str(">^"),
            Token::QuestionMarkSharp => f.write_str("?#"),
            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
            Token::TildeEqual => f.write_str("~="),
            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
            Token::VerticalBarShiftRight => f.write_str("|>>"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
            Token::Question => write!(f, "?"),
            Token::QuestionAnd => write!(f, "?&"),
            Token::QuestionPipe => write!(f, "?|"),
            Token::CustomBinaryOperator(s) => f.write_str(s),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

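    /// Create a word token from `word` and an optional `quote_style`.
    ///
    /// Unquoted words are matched (via their uppercased form) against the
    /// keyword table; quoted words are never treated as keywords. A small
    /// illustrative doctest, assuming `Keyword` is importable from the
    /// crate's public `keywords` module (as the imports above suggest):
    ///
    /// ```
    /// # use sqlparser::tokenizer::Token;
    /// # use sqlparser::keywords::Keyword;
    /// if let Token::Word(w) = Token::make_word("select", None) {
    ///     assert_eq!(w.keyword, Keyword::SELECT); // unquoted -> keyword detected
    /// }
    /// if let Token::Word(w) = Token::make_word("select", Some('"')) {
    ///     assert_eq!(w.keyword, Keyword::NoKeyword); // quoted -> plain identifier
    /// }
    /// ```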
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with the
    /// escape sequences (if any) processed (TODO: escapes are not handled)
    pub value: String,
    /// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this will have one of the values from dialect::keywords, otherwise
    /// `Keyword::NoKeyword`
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"', // ANSI and most dialects
            '[' => ']', // MS SQL
            '`' => '`', // MySQL
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}

/// Location in input string
///
/// # Create an "empty" (unknown) `Location`
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::empty();
/// ```
///
/// # Create a `Location` from a line and column
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::new(1, 1);
/// ```
///
/// # Create a `Location` from a pair
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::from((1, 1));
/// ```
#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1.
    ///
    /// Note: Line 0 is used for empty spans
    pub line: u64,
    /// Line column, starting from 1.
    ///
    /// Note: Column 0 is used for empty spans
    pub column: u64,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        if self.line == 0 {
            return Ok(());
        }
        write!(f, " at Line: {}, Column: {}", self.line, self.column)
    }
}

impl fmt::Debug for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Location({},{})", self.line, self.column)
    }
}

impl Location {
    /// Return an "empty" / unknown location
    pub fn empty() -> Self {
        Self { line: 0, column: 0 }
    }

    /// Create a new `Location` for a given line and column
    pub fn new(line: u64, column: u64) -> Self {
        Self { line, column }
    }

    /// Create a new location for a given line and column
    ///
    /// Alias for [`Self::new`]
    // TODO: remove / deprecate in favor of `new` for consistency?
    pub fn of(line: u64, column: u64) -> Self {
        Self::new(line, column)
    }

    /// Combine self and `end` into a new `Span`
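    ///
    /// A small illustrative doctest:
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Location, Span};
    /// let span = Location::new(1, 1).span_to(Location::new(1, 5));
    /// assert_eq!(span, Span::new(Location::new(1, 1), Location::new(1, 5)));
    /// ```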
    pub fn span_to(self, end: Self) -> Span {
        Span { start: self, end }
    }
}

impl From<(u64, u64)> for Location {
    fn from((line, column): (u64, u64)) -> Self {
        Self { line, column }
    }
}

/// A span represents a linear portion of the input string (start, end)
///
/// See [Spanned](crate::ast::Spanned) for more information.
#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    pub start: Location,
    pub end: Location,
}

impl fmt::Debug for Span {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Span({:?}..{:?})", self.start, self.end)
    }
}

impl Span {
    // An empty span (0, 0) -> (0, 0)
    // We need a const instance for pattern matching
    const EMPTY: Span = Self::empty();

    /// Create a new span from a start and end [`Location`]
    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    /// Returns an empty span `(0, 0) -> (0, 0)`
    ///
    /// Empty spans represent no knowledge of source location
    /// See [Spanned](crate::ast::Spanned) for more information.
    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }

    /// Returns the smallest Span that contains both `self` and `other`
    /// If either span is [Span::empty], the other span is returned
    ///
    /// # Examples
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// // line 1, column 1 -> line 2, column 5
    /// let span1 = Span::new(Location::new(1, 1), Location::new(2, 5));
    /// // line 2, column 3 -> line 3, column 7
    /// let span2 = Span::new(Location::new(2, 3), Location::new(3, 7));
    /// // Union of the two is the min/max of the two spans
    /// // line 1, column 1 -> line 3, column 7
    /// let union = span1.union(&span2);
    /// assert_eq!(union, Span::new(Location::new(1, 1), Location::new(3, 7)));
    /// ```
    pub fn union(&self, other: &Span) -> Span {
        // If either span is empty, return the other
        // this prevents propagating (0, 0) through the tree
        match (self, other) {
            (&Span::EMPTY, _) => *other,
            (_, &Span::EMPTY) => *self,
            _ => Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            },
        }
    }

    /// Same as [Span::union] for `Option<Span>`
    ///
    /// If `other` is `None`, `self` is returned
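    ///
    /// A small illustrative doctest for the `None` case:
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// let span = Span::new(Location::new(1, 1), Location::new(1, 5));
    /// assert_eq!(span.union_opt(&None), span);
    /// ```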
    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        match other {
            Some(other) => self.union(other),
            None => *self,
        }
    }

    /// Return the [Span::union] of all spans in the iterator
    ///
    /// If the iterator is empty, an empty span is returned
    ///
    /// # Example
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// let spans = vec![
    ///     Span::new(Location::new(1, 1), Location::new(2, 5)),
    ///     Span::new(Location::new(2, 3), Location::new(3, 7)),
    ///     Span::new(Location::new(3, 1), Location::new(4, 2)),
    /// ];
    /// // line 1, column 1 -> line 4, column 2
    /// assert_eq!(
    ///   Span::union_iter(spans),
    ///   Span::new(Location::new(1, 1), Location::new(4, 2))
    /// );
    /// ```
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        iter.into_iter()
            .reduce(|acc, item| acc.union(&item))
            .unwrap_or(Span::empty())
    }
}

/// Backwards compatibility struct for [`TokenWithSpan`]
#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;

/// A [Token] with [Span] attached to it
///
/// This is used to track the location of a token in the input string
///
/// # Examples
/// ```
/// # use sqlparser::tokenizer::{Location, Span, Token, TokenWithSpan};
/// // comma @ line 1, column 10
/// let tok1 = TokenWithSpan::new(
///   Token::Comma,
///   Span::new(Location::new(1, 10), Location::new(1, 11)),
/// );
/// assert_eq!(tok1, Token::Comma); // can compare the token
///
/// // comma @ line 2, column 20
/// let tok2 = TokenWithSpan::new(
///   Token::Comma,
///   Span::new(Location::new(2, 20), Location::new(2, 21)),
/// );
/// // same token but different locations are not equal
/// assert_ne!(tok1, tok2);
/// ```
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    pub token: Token,
    pub span: Span,
}

impl TokenWithSpan {
    /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`]
    pub fn new(token: Token, span: Span) -> Self {
        Self { token, span }
    }

    /// Wrap a token with an empty span
    pub fn wrap(token: Token) -> Self {
        Self::new(token, Span::empty())
    }

    /// Wrap a token with a location from `start` to `end`
    pub fn at(token: Token, start: Location, end: Location) -> Self {
        Self::new(token, Span::new(start, end))
    }

    /// Return an EOF token with no location
    pub fn new_eof() -> Self {
        Self::wrap(Token::EOF)
    }
}

impl PartialEq<Token> for TokenWithSpan {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithSpan> for Token {
    fn eq(&self, other: &TokenWithSpan) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithSpan {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

/// Tokenizer error
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}{}", self.message, self.location)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}

impl State<'_> {
    /// return the next character and advance the stream
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    /// return the next character but do not advance the stream
    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}
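
// For example: starting at line 1, column 1, calling `next()` on the input
// "a\nb" yields 'a' and advances the location to (1, 2); consuming the '\n'
// advances it to (2, 1). `peek()` never moves the location.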

/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// e.g. `"abc"`, `'abc'`, `r'abc'`
    One,
    /// e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''`
    Many(NonZeroU8),
}

/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
    /// The character used to quote the string.
    quote_style: char,
    /// Represents how many quote characters enclose the string literal.
    num_quote_chars: NumStringQuoteChars,
    /// The number of opening quotes left to consume, before parsing
    /// the remaining string literal.
    /// For example: given the initial string `"""abc"""`, if the caller has
    /// already consumed the first quote for some reason, then this value
    /// is set to 2, flagging that only 2 leading quotes remain to consume.
    num_opening_quotes_to_consume: u8,
    /// True if the string uses backslash escaping of special characters,
    /// e.g. `'abc\ndef\'ghi'`
    backslash_escape: bool,
}

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    /// If true (the default), the tokenizer will un-escape literal
    /// SQL strings. See [`Tokenizer::with_unescape`] for more details.
    unescape: bool,
}

impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let query = r#"SELECT 'foo'"#;
    ///
    /// // Parsing the query
    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
    ///
    /// assert_eq!(tokens, vec![
    ///   Token::make_word("SELECT", None),
    ///   Token::Whitespace(Whitespace::Space),
    ///   Token::SingleQuotedString("foo".to_string()),
    /// ]);
    /// ```
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }

    /// Set unescape mode
    ///
    /// When true (default) the tokenizer unescapes literal values
    /// (for example, `""` in SQL is unescaped to the literal `"`).
    ///
    /// When false, the tokenizer provides the raw strings as provided
    /// in the query.  This can be helpful for programs that wish to
    /// recover the *exact* original query text without normalizing
    /// the escaping.
    ///
    /// # Example
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let query = r#""Foo "" Bar""#;
    /// let unescaped = Token::make_word(r#"Foo " Bar"#, Some('"'));
    /// let original  = Token::make_word(r#"Foo "" Bar"#, Some('"'));
    ///
    /// // Parsing with unescaping (default)
    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
    /// assert_eq!(tokens, vec![unescaped]);
    ///
    /// // Parsing with unescape = false
    /// let tokens = Tokenizer::new(&dialect, &query)
    ///    .with_unescape(false)
    ///    .tokenize().unwrap();
    /// assert_eq!(tokens, vec![original]);
    /// ```
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }

    /// Tokenize the statement and produce a vector of tokens
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;
        Ok(twl.into_iter().map(|t| t.token).collect())
    }

    /// Tokenize the statement and produce a vector of tokens with location information
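    ///
    /// A small illustrative doctest (assuming the `GenericDialect`; with a
    /// 6-character word starting at line 1, column 1, the span ends at
    /// column 7, one past the last character):
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Location, Span, Token, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let tokens = Tokenizer::new(&dialect, "SELECT").tokenize_with_location().unwrap();
    /// assert_eq!(tokens.len(), 1);
    /// assert_eq!(tokens[0].token, Token::make_word("SELECT", None));
    /// assert_eq!(tokens[0].span, Span::new(Location::new(1, 1), Location::new(1, 7)));
    /// ```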
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
        let mut tokens: Vec<TokenWithSpan> = vec![];
        self.tokenize_with_location_into_buf(&mut tokens)
            .map(|_| tokens)
    }

    /// Tokenize the statement and append tokens with location information into the provided buffer.
    /// If an error is thrown, the buffer will contain all tokens that were successfully parsed before the error.
    pub fn tokenize_with_location_into_buf(
        &mut self,
        buf: &mut Vec<TokenWithSpan>,
    ) -> Result<(), TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let span = location.span_to(state.location());

            buf.push(TokenWithSpan { token, span });

            location = state.location();
        }
        Ok(())
    }

    // Tokenize the identifier or keywords in `ch`
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // TODO: implement parsing of exponent here
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }

    /// Get the next token or return None
    fn next_token(
        &self,
        chars: &mut State,
        prev_token: Option<&Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // BigQuery and MySQL use b or B for byte string literals, Postgres for bit strings
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
                {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '\'',
                                        false,
                                        Token::SingleQuotedByteStringLiteral,
                                        Token::TripleSingleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '"',
                                        false,
                                        Token::DoubleQuotedByteStringLiteral,
                                        Token::TripleDoubleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with a "b" or "B"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // BigQuery uses r or R for raw string literals
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                false,
                                Token::SingleQuotedRawStringLiteral,
                                Token::TripleSingleQuotedRawStringLiteral,
                            ),
                        Some('\"') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                false,
                                Token::DoubleQuotedRawStringLiteral,
                                Token::TripleDoubleQuotedRawStringLiteral,
                            ),
                        _ => {
                            // regular identifier starting with an "r" or "R"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Redshift uses lowercase n for national string literals
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let backslash_escape =
                                self.dialect.supports_string_literal_backslash_escape();
                            let s =
                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        Some(&q @ 'q') | Some(&q @ 'Q')
                            if self.dialect.supports_quote_delimited_string() =>
                        {
                            chars.next(); // consume and check the next char
                            if let Some('\'') = chars.peek() {
                                self.tokenize_quote_delimited_string(chars, &[n, q])
                                    .map(|s| Some(Token::NationalQuoteDelimitedStringLiteral(s)))
                            } else {
                                let s = self.tokenize_word(String::from_iter([n, q]), chars);
                                Ok(Some(Token::make_word(&s, None)))
                            }
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                q @ 'Q' | q @ 'q' if self.dialect.supports_quote_delimited_string() => {
                    chars.next(); // consume and check the next char
                    if let Some('\'') = chars.peek() {
                        self.tokenize_quote_delimited_string(chars, &[q])
                            .map(|s| Some(Token::QuoteDelimitedStringLiteral(s)))
                    } else {
                        let s = self.tokenize_word(q, chars);
                        Ok(Some(Token::make_word(&s, None)))
                    }
                }
                // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
                x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
                    let starting_loc = chars.location();
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "E" or "e"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
                    chars.next(); // consume, to check the next char
                    if chars.peek() == Some(&'&') {
                        // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
                        let mut chars_clone = chars.peekable.clone();
                        chars_clone.next(); // consume the '&' in the clone
                        if chars_clone.peek() == Some(&'\'') {
                            chars.next(); // consume the '&' in the original iterator
                            let s = unescape_unicode_single_quoted_string(chars)?;
                            return Ok(Some(Token::UnicodeStringLiteral(s)));
                        }
                    }
                    // regular identifier starting with a "U" or "u"
                    let s = self.tokenize_word(x, chars);
                    Ok(Some(Token::make_word(&s, None)))
                }
                // The spec only allows an uppercase 'X' to introduce a hex
                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a <binary string literal>
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "X"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // single quoted string
                '\'' => {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::SingleQuotedString,
                                Token::TripleSingleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '\'',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // double quoted string
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::DoubleQuotedString,
                                Token::TripleDoubleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '"',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if self.dialect.is_delimited_identifier_start(ch) => {
                    let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                    Ok(Some(Token::make_word(&word, Some(quote_start))))
                }
                // Potentially nested delimited (quoted) identifier
                quote_start
                    if self
                        .dialect
                        .is_nested_delimited_identifier_start(quote_start)
                        && self
                            .dialect
                            .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                            .is_some() =>
                {
                    let Some((quote_start, nested_quote_start)) = self
                        .dialect
                        .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                    else {
                        return self.tokenizer_error(
                            chars.location(),
                            format!("Expected nested delimiter '{quote_start}' before EOF."),
                        );
                    };

                    let Some(nested_quote_start) = nested_quote_start else {
                        let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                        return Ok(Some(Token::make_word(&word, Some(quote_start))));
                    };

                    let mut word = vec![];
                    let quote_end = Word::matching_end_quote(quote_start);
                    let nested_quote_end = Word::matching_end_quote(nested_quote_start);
                    let error_loc = chars.location();

                    chars.next(); // skip the first delimiter
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&nested_quote_start) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
                        );
                    }
                    word.push(nested_quote_start.into());
                    word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
                    word.push(nested_quote_end.into());
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&quote_end) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        );
                    }
                    chars.next(); // skip close delimiter

                    Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
                }
                // numbers and period
                '0'..='9' | '.' => {
                    // Special case: if `._` is encountered after a word, that word
                    // is a table and the `_` is the start of the column name.
                    // If the previous token is not a word, then this is neither a
                    // valid SQL word nor a number.
                    if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
                        if let Some(Token::Word(_)) = prev_token {
                            chars.next();
                            return Ok(Some(Token::Period));
                        }

                        return self.tokenizer_error(
                            chars.location(),
                            "Unexpected character '_'".to_string(),
                        );
                    }

                    // Some dialects support underscore as number separator
                    // There can only be one at a time and it must be followed by another (hex) digit
                    let is_number_separator = |ch: char, next_char: Option<char>| {
                        self.dialect.supports_numeric_literal_underscores()
                            && ch == '_'
                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
                    };

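                    // For example, with underscore separators enabled, `1_000`
                    // lexes as a single number token, while in `1_` the trailing
                    // `_` ends the number and is left for the next token.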
1251                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
1252                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1253                    });
1254
1255                    // match binary literal that starts with 0x
1256                    if s == "0" && chars.peek() == Some(&'x') {
1257                        chars.next();
1258                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
1259                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
1260                        });
1261                        return Ok(Some(Token::HexStringLiteral(s2)));
1262                    }
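                    // Illustrative example: `0xdeadBEEF` yields
                    // `Token::HexStringLiteral("deadBEEF")` with the `0x` prefix
                    // stripped (see `tokenize_hex_literal_sketch` in the tests below).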
1263
1264                    // match one period
1265                    if let Some('.') = chars.peek() {
1266                        s.push('.');
1267                        chars.next();
1268                    }
1269
1270                    // If the dialect supports identifiers that start with a numeric prefix
1271                    // and we have now consumed a dot, check if the previous token was a Word.
1272                    // If so, what follows is definitely not part of a decimal number and
1273                    // we should yield the dot as a dedicated token so compound identifiers
1274                    // starting with digits can be parsed correctly.
1275                    if s == "." && self.dialect.supports_numeric_prefix() {
1276                        if let Some(Token::Word(_)) = prev_token {
1277                            return Ok(Some(Token::Period));
1278                        }
1279                    }
1280
1281                    // Consume fractional digits.
1282                    s += &peeking_next_take_while(chars, |ch, next_ch| {
1283                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1284                    });
1285
1286                    // No fraction -> Token::Period
1287                    if s == "." {
1288                        return Ok(Some(Token::Period));
1289                    }
1290
1291                    // Parse an optional exponent
1292                    let mut exponent_part = String::new();
1293                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
1294                        let mut char_clone = chars.peekable.clone();
1295                        exponent_part.push(char_clone.next().unwrap());
1296
1297                        // Optional sign
1298                        match char_clone.peek() {
1299                            Some(&c) if matches!(c, '+' | '-') => {
1300                                exponent_part.push(c);
1301                                char_clone.next();
1302                            }
1303                            _ => (),
1304                        }
1305
1306                        match char_clone.peek() {
1307                            // Definitely an exponent, get original iterator up to speed and use it
1308                            Some(&c) if c.is_ascii_digit() => {
1309                                for _ in 0..exponent_part.len() {
1310                                    chars.next();
1311                                }
1312                                exponent_part +=
1313                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
1314                                s += exponent_part.as_str();
1315                            }
1316                            // Not an exponent, discard the work done
1317                            _ => (),
1318                        }
1319                    }
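                    // Illustrative example: `1e-10` is consumed as a single number,
                    // while for `1ea` no digit follows the exponent marker, so the
                    // cloned iterator is discarded and `ea` later lexes as a word.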
1320
1321                    // If the dialect supports identifiers that start with a numeric prefix,
1322                    // we need to check if the value is in fact an identifier and must thus
1323                    // be tokenized as a word.
1324                    if self.dialect.supports_numeric_prefix() {
1325                        if exponent_part.is_empty() {
1326                            // If it is not a number with an exponent, it may be
1327                            // an identifier starting with digits.
1328                            let word =
1329                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
1330
1331                            if !word.is_empty() {
1332                                s += word.as_str();
1333                                return Ok(Some(Token::make_word(s.as_str(), None)));
1334                            }
1335                        } else if prev_token == Some(&Token::Period) {
1336                            // If the previous token was a period, this value cannot be
1337                            // part of a number; it belongs to a compound identifier.
1338                            return Ok(Some(Token::make_word(s.as_str(), None)));
1339                        }
1340                    }
1341
1342                    let long = if chars.peek() == Some(&'L') {
1343                        chars.next();
1344                        true
1345                    } else {
1346                        false
1347                    };
1348                    Ok(Some(Token::Number(s, long)))
1349                }
1350                // punctuation
1351                '(' => self.consume_and_return(chars, Token::LParen),
1352                ')' => self.consume_and_return(chars, Token::RParen),
1353                ',' => self.consume_and_return(chars, Token::Comma),
1354                // operators
1355                '-' => {
1356                    chars.next(); // consume the '-'
1357
1358                    match chars.peek() {
1359                        Some('-') => {
1360                            let mut is_comment = true;
1361                            if self.dialect.requires_single_line_comment_whitespace() {
1362                                is_comment = Some(' ') == chars.peekable.clone().nth(1);
1363                            }
1364
1365                            if is_comment {
1366                                chars.next(); // consume second '-'
1367                                let comment = self.tokenize_single_line_comment(chars);
1368                                return Ok(Some(Token::Whitespace(
1369                                    Whitespace::SingleLineComment {
1370                                        prefix: "--".to_owned(),
1371                                        comment,
1372                                    },
1373                                )));
1374                            }
1375
1376                            self.start_binop(chars, "-", Token::Minus)
1377                        }
1378                        Some('>') => {
1379                            chars.next();
1380                            match chars.peek() {
1381                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
1382                                _ => self.start_binop(chars, "->", Token::Arrow),
1383                            }
1384                        }
1385                        // a regular '-' operator
1386                        _ => self.start_binop(chars, "-", Token::Minus),
1387                    }
1388                }
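                // Illustrative example: `--x` lexes as a single-line comment here,
                // unless the dialect requires whitespace after `--`
                // (`requires_single_line_comment_whitespace`), in which case it
                // lexes as two `-` operators followed by the word `x`.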
1389                '/' => {
1390                    chars.next(); // consume the '/'
1391                    match chars.peek() {
1392                        Some('*') => {
1393                            chars.next(); // consume the '*', starting a multi-line comment
1394                            self.tokenize_multiline_comment(chars)
1395                        }
1396                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
1397                            chars.next(); // consume the second '/', starting a Snowflake single-line comment
1398                            let comment = self.tokenize_single_line_comment(chars);
1399                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1400                                prefix: "//".to_owned(),
1401                                comment,
1402                            })))
1403                        }
1404                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
1405                            self.consume_and_return(chars, Token::DuckIntDiv)
1406                        }
1407                        // a regular '/' operator
1408                        _ => Ok(Some(Token::Div)),
1409                    }
1410                }
1411                '+' => self.consume_and_return(chars, Token::Plus),
1412                '*' => self.consume_and_return(chars, Token::Mul),
1413                '%' => {
1414                    chars.next(); // advance past '%'
1415                    match chars.peek() {
1416                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
1417                        Some(sch) if self.dialect.is_identifier_start('%') => {
1418                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1419                        }
1420                        _ => self.start_binop(chars, "%", Token::Mod),
1421                    }
1422                }
1423                '|' => {
1424                    chars.next(); // consume the '|'
1425                    match chars.peek() {
1426                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
1427                        Some('|') => {
1428                            chars.next(); // consume the second '|'
1429                            match chars.peek() {
1430                                Some('/') => {
1431                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
1432                                }
1433                                _ => self.start_binop(chars, "||", Token::StringConcat),
1434                            }
1435                        }
1436                        Some('&') if self.dialect.supports_geometric_types() => {
1437                            chars.next(); // consume
1438                            match chars.peek() {
1439                                Some('>') => self.consume_for_binop(
1440                                    chars,
1441                                    "|&>",
1442                                    Token::VerticalBarAmpersandRightAngleBracket,
1443                                ),
1444                                _ => self.start_binop_opt(chars, "|&", None),
1445                            }
1446                        }
1447                        Some('>') if self.dialect.supports_geometric_types() => {
1448                            chars.next(); // consume
1449                            match chars.peek() {
1450                                Some('>') => self.consume_for_binop(
1451                                    chars,
1452                                    "|>>",
1453                                    Token::VerticalBarShiftRight,
1454                                ),
1455                                _ => self.start_binop_opt(chars, "|>", None),
1456                            }
1457                        }
1458                        Some('>') if self.dialect.supports_pipe_operator() => {
1459                            self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
1460                        }
1461                        // Bitwise '|' operator
1462                        _ => self.start_binop(chars, "|", Token::Pipe),
1463                    }
1464                }
1465                '=' => {
1466                    chars.next(); // consume
1467                    match chars.peek() {
1468                        Some('>') => self.consume_and_return(chars, Token::RArrow),
1469                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
1470                        _ => Ok(Some(Token::Eq)),
1471                    }
1472                }
1473                '!' => {
1474                    chars.next(); // consume
1475                    match chars.peek() {
1476                        Some('=') => self.consume_and_return(chars, Token::Neq),
1477                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
1478                        Some('~') => {
1479                            chars.next();
1480                            match chars.peek() {
1481                                Some('*') => self
1482                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
1483                                Some('~') => {
1484                                    chars.next();
1485                                    match chars.peek() {
1486                                        Some('*') => self.consume_and_return(
1487                                            chars,
1488                                            Token::ExclamationMarkDoubleTildeAsterisk,
1489                                        ),
1490                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
1491                                    }
1492                                }
1493                                _ => Ok(Some(Token::ExclamationMarkTilde)),
1494                            }
1495                        }
1496                        _ => Ok(Some(Token::ExclamationMark)),
1497                    }
1498                }
1499                '<' => {
1500                    chars.next(); // consume
1501                    match chars.peek() {
1502                        Some('=') => {
1503                            chars.next();
1504                            match chars.peek() {
1505                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
1506                                _ => self.start_binop(chars, "<=", Token::LtEq),
1507                            }
1508                        }
1509                        Some('|') if self.dialect.supports_geometric_types() => {
1510                            self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
1511                        }
1512                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
1513                        Some('<') if self.dialect.supports_geometric_types() => {
1514                            chars.next(); // consume
1515                            match chars.peek() {
1516                                Some('|') => self.consume_for_binop(
1517                                    chars,
1518                                    "<<|",
1519                                    Token::ShiftLeftVerticalBar,
1520                                ),
1521                                _ => self.start_binop(chars, "<<", Token::ShiftLeft),
1522                            }
1523                        }
1524                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
1525                        Some('-') if self.dialect.supports_geometric_types() => {
1526                            chars.next(); // consume
1527                            match chars.peek() {
1528                                Some('>') => {
1529                                    self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
1530                                }
1531                                _ => self.start_binop_opt(chars, "<-", None),
1532                            }
1533                        }
1534                        Some('^') if self.dialect.supports_geometric_types() => {
1535                            self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
1536                        }
1537                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
1538                        _ => self.start_binop(chars, "<", Token::Lt),
1539                    }
1540                }
1541                '>' => {
1542                    chars.next(); // consume
1543                    match chars.peek() {
1544                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
1545                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
1546                        Some('^') if self.dialect.supports_geometric_types() => {
1547                            self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
1548                        }
1549                        _ => self.start_binop(chars, ">", Token::Gt),
1550                    }
1551                }
1552                ':' => {
1553                    chars.next();
1554                    match chars.peek() {
1555                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
1556                        Some('=') => self.consume_and_return(chars, Token::Assignment),
1557                        _ => Ok(Some(Token::Colon)),
1558                    }
1559                }
1560                ';' => self.consume_and_return(chars, Token::SemiColon),
1561                '\\' => self.consume_and_return(chars, Token::Backslash),
1562                '[' => self.consume_and_return(chars, Token::LBracket),
1563                ']' => self.consume_and_return(chars, Token::RBracket),
1564                '&' => {
1565                    chars.next(); // consume the '&'
1566                    match chars.peek() {
1567                        Some('>') if self.dialect.supports_geometric_types() => {
1568                            self.consume_for_binop(chars, "&>", Token::AmpersandRightAngleBracket)
1570                        }
1571                        Some('<') if self.dialect.supports_geometric_types() => {
1572                            chars.next(); // consume
1573                            match chars.peek() {
1574                                Some('|') => self.consume_and_return(
1575                                    chars,
1576                                    Token::AmpersandLeftAngleBracketVerticalBar,
1577                                ),
1578                                _ => {
1579                                    self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
1580                                }
1581                            }
1582                        }
1583                        Some('&') => {
1584                            chars.next(); // consume the second '&'
1585                            self.start_binop(chars, "&&", Token::Overlap)
1586                        }
1587                        // Bitwise '&' operator
1588                        _ => self.start_binop(chars, "&", Token::Ampersand),
1589                    }
1590                }
1591                '^' => {
1592                    chars.next(); // consume the '^'
1593                    match chars.peek() {
1594                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
1595                        _ => Ok(Some(Token::Caret)),
1596                    }
1597                }
1598                '{' => self.consume_and_return(chars, Token::LBrace),
1599                '}' => self.consume_and_return(chars, Token::RBrace),
1600                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
1601                {
1602                    chars.next(); // consume the '#', starting a single-line comment
1603                    let comment = self.tokenize_single_line_comment(chars);
1604                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1605                        prefix: "#".to_owned(),
1606                        comment,
1607                    })))
1608                }
1609                '~' => {
1610                    chars.next(); // consume
1611                    match chars.peek() {
1612                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
1613                        Some('=') if self.dialect.supports_geometric_types() => {
1614                            self.consume_for_binop(chars, "~=", Token::TildeEqual)
1615                        }
1616                        Some('~') => {
1617                            chars.next();
1618                            match chars.peek() {
1619                                Some('*') => {
1620                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
1621                                }
1622                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
1623                            }
1624                        }
1625                        _ => self.start_binop(chars, "~", Token::Tilde),
1626                    }
1627                }
1628                '#' => {
1629                    chars.next();
1630                    match chars.peek() {
1631                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
1632                        Some('>') => {
1633                            chars.next();
1634                            match chars.peek() {
1635                                Some('>') => {
1636                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
1637                                }
1638                                _ => self.start_binop(chars, "#>", Token::HashArrow),
1639                            }
1640                        }
1641                        Some(' ') => Ok(Some(Token::Sharp)),
1642                        Some('#') if self.dialect.supports_geometric_types() => {
1643                            self.consume_for_binop(chars, "##", Token::DoubleSharp)
1644                        }
1645                        Some(sch) if self.dialect.is_identifier_start('#') => {
1646                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1647                        }
1648                        _ => self.start_binop(chars, "#", Token::Sharp),
1649                    }
1650                }
1651                '@' => {
1652                    chars.next();
1653                    match chars.peek() {
1654                        Some('@') if self.dialect.supports_geometric_types() => {
1655                            self.consume_and_return(chars, Token::AtAt)
1656                        }
1657                        Some('-') if self.dialect.supports_geometric_types() => {
1658                            chars.next();
1659                            match chars.peek() {
1660                                Some('@') => self.consume_and_return(chars, Token::AtDashAt),
1661                                _ => self.start_binop_opt(chars, "@-", None),
1662                            }
1663                        }
1664                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
1665                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
1666                        Some('@') => {
1667                            chars.next();
1668                            match chars.peek() {
1669                                Some(' ') => Ok(Some(Token::AtAt)),
1670                                Some(tch) if self.dialect.is_identifier_start('@') => {
1671                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
1672                                }
1673                                _ => Ok(Some(Token::AtAt)),
1674                            }
1675                        }
1676                        Some(' ') => Ok(Some(Token::AtSign)),
1677                        // We break on quotes here, because no dialect allows identifiers starting
1678                        // with @ and containing quotation marks (e.g. `@'foo'`) unless they are
1679                        // quoted, which is tokenized as a quoted string, not here (e.g.
1680                        // `"@'foo'"`). Further, at least two dialects parse `@` followed by a
1681                        // quoted string as two separate tokens, which this allows. For example,
1682                        // Postgres parses `@'1'` as the absolute value of '1' which is implicitly
1683                        // cast to a numeric type. And when parsing MySQL-style grantees (e.g.
1684                        // `GRANT ALL ON *.* to 'root'@'localhost'`), we also want separate tokens
1685                        // for the user, the `@`, and the host.
1686                        Some('\'') => Ok(Some(Token::AtSign)),
1687                        Some('\"') => Ok(Some(Token::AtSign)),
1688                        Some('`') => Ok(Some(Token::AtSign)),
1689                        Some(sch) if self.dialect.is_identifier_start('@') => {
1690                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1691                        }
1692                        _ => Ok(Some(Token::AtSign)),
1693                    }
1694                }
1695                // Postgres uses ? for jsonb operators, not prepared statements
1696                '?' if self.dialect.supports_geometric_types() => {
1697                    chars.next(); // consume
1698                    match chars.peek() {
1699                        Some('|') => {
1700                            chars.next();
1701                            match chars.peek() {
1702                                Some('|') => self.consume_and_return(
1703                                    chars,
1704                                    Token::QuestionMarkDoubleVerticalBar,
1705                                ),
1706                                _ => Ok(Some(Token::QuestionPipe)),
1707                            }
1708                        }
1709
1710                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
1711                        Some('-') => {
1712                            chars.next(); // consume
1713                            match chars.peek() {
1714                                Some('|') => self
1715                                    .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
1716                                _ => Ok(Some(Token::QuestionMarkDash)),
1717                            }
1718                        }
1719                        Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
1720                        _ => Ok(Some(Token::Question)),
1721                    }
1722                }
1723                '?' => {
1724                    chars.next();
1725                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
1726                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
1727                }
1728
1729                // identifier or keyword
1730                ch if self.dialect.is_identifier_start(ch) => {
1731                    self.tokenize_identifier_or_keyword([ch], chars)
1732                }
1733                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
1734
1735                // whitespace check (including unicode chars) should be last as it covers some of the chars above
1736                ch if ch.is_whitespace() => {
1737                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
1738                }
1739                other => self.consume_and_return(chars, Token::Char(other)),
1740            },
1741            None => Ok(None),
1742        }
1743    }
1744
1745    /// Consume the next character, then parse a custom binary operator. The consumed character must already be included in `prefix`.
1746    fn consume_for_binop(
1747        &self,
1748        chars: &mut State,
1749        prefix: &str,
1750        default: Token,
1751    ) -> Result<Option<Token>, TokenizerError> {
1752        chars.next(); // consume the first char
1753        self.start_binop_opt(chars, prefix, Some(default))
1754    }
1755
1756    /// Parse a custom binary operator, using `default` when no custom operator follows.
1757    fn start_binop(
1758        &self,
1759        chars: &mut State,
1760        prefix: &str,
1761        default: Token,
1762    ) -> Result<Option<Token>, TokenizerError> {
1763        self.start_binop_opt(chars, prefix, Some(default))
1764    }
1765
1766    /// Parse a custom binary operator; if none follows, return `default`, or error when `default` is `None`.
1767    fn start_binop_opt(
1768        &self,
1769        chars: &mut State,
1770        prefix: &str,
1771        default: Option<Token>,
1772    ) -> Result<Option<Token>, TokenizerError> {
1773        let mut custom = None;
1774        while let Some(&ch) = chars.peek() {
1775            if !self.dialect.is_custom_operator_part(ch) {
1776                break;
1777            }
1778
1779            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
1780            chars.next();
1781        }
1782        match (custom, default) {
1783            (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
1784            (None, Some(tok)) => Ok(Some(tok)),
1785            (None, None) => self.tokenizer_error(
1786                chars.location(),
1787                format!("Expected a valid binary operator after '{prefix}'"),
1788            ),
1789        }
1790    }
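    // Illustrative example (assuming a dialect whose `is_custom_operator_part`
    // accepts `=`): for the input `>>=`, `consume_for_binop(chars, ">>", ...)`
    // picks up the trailing `=` and yields `Token::CustomBinaryOperator(">>=")`
    // instead of the `Token::ShiftRight` default.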
1791
1792    /// Tokenize a dollar-preceded value (i.e. a dollar-quoted string or a placeholder)
1793    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
1794        let mut s = String::new();
1795        let mut value = String::new();
1796
1797        chars.next();
1798
1799        // If the dialect does not support dollar-quoted strings, `$$` is treated as a placeholder instead.
1800        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
1801            chars.next();
1802
1803            let mut is_terminated = false;
1804            let mut prev: Option<char> = None;
1805
1806            while let Some(&ch) = chars.peek() {
1807                if prev == Some('$') {
1808                    if ch == '$' {
1809                        chars.next();
1810                        is_terminated = true;
1811                        break;
1812                    } else {
1813                        s.push('$');
1814                        s.push(ch);
1815                    }
1816                } else if ch != '$' {
1817                    s.push(ch);
1818                }
1819
1820                prev = Some(ch);
1821                chars.next();
1822            }
1823
1824            return if chars.peek().is_none() && !is_terminated {
1825                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
1826            } else {
1827                Ok(Token::DollarQuotedString(DollarQuotedString {
1828                    value: s,
1829                    tag: None,
1830                }))
1831            };
1832        } else {
1833            value.push_str(&peeking_take_while(chars, |ch| {
1834                ch.is_alphanumeric()
1835                    || ch == '_'
1836                    // Allow $ as a placeholder character if the dialect supports it
1837                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
1838            }));
1839
1840            // If the dialect does not support dollar-quoted strings, don't look for the end delimiter.
1841            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
1842                chars.next();
1843
1844                let mut temp = String::new();
1845                let end_delimiter = format!("${value}$");
1846
1847                loop {
1848                    match chars.next() {
1849                        Some(ch) => {
1850                            temp.push(ch);
1851
1852                            if temp.ends_with(&end_delimiter) {
1853                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
1854                                    s.push_str(temp);
1855                                }
1856                                break;
1857                            }
1858                        }
1859                        None => {
1860                            if temp.ends_with(&end_delimiter) {
1861                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
1862                                    s.push_str(temp);
1863                                }
1864                                break;
1865                            }
1866
1867                            return self.tokenizer_error(
1868                                chars.location(),
1869                                "Unterminated dollar-quoted string, expected $",
1870                            );
1871                        }
1872                    }
1873                }
1874            } else {
1875                return Ok(Token::Placeholder(String::from("$") + &value));
1876            }
1877        }
1878
1879        Ok(Token::DollarQuotedString(DollarQuotedString {
1880            value: s,
1881            tag: if value.is_empty() { None } else { Some(value) },
1882        }))
1883    }
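    // Illustrative examples: `$$abc$$` yields `DollarQuotedString { value: "abc",
    // tag: None }`, `$tag$abc$tag$` yields `DollarQuotedString { value: "abc",
    // tag: Some("tag") }`, and in dialects with `supports_dollar_placeholder()`
    // the input `$1` yields `Token::Placeholder("$1")`.
    // See `tokenize_dollar_quoted_string_sketch` in the tests below.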
1884
1885    fn tokenizer_error<R>(
1886        &self,
1887        loc: Location,
1888        message: impl Into<String>,
1889    ) -> Result<R, TokenizerError> {
1890        Err(TokenizerError {
1891            message: message.into(),
1892            location: loc,
1893        })
1894    }
1895
1896    // Consume characters until newline
1897    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
1898        let mut comment = peeking_take_while(chars, |ch| match ch {
1899            '\n' => false,                                           // Always stop at \n
1900            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres
1901            _ => true, // Keep consuming for other characters
1902        });
1903
1904        if let Some(ch) = chars.next() {
1905            assert!(ch == '\n' || ch == '\r');
1906            comment.push(ch);
1907        }
1908
1909        comment
1910    }
1911
1912    /// Tokenize an identifier or keyword, after the first char is already consumed.
1913    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
1914        let mut s = first_chars.into();
1915        s.push_str(&peeking_take_while(chars, |ch| {
1916            self.dialect.is_identifier_part(ch)
1917        }));
1918        s
1919    }
1920
1921    /// Read a quoted identifier
1922    fn tokenize_quoted_identifier(
1923        &self,
1924        quote_start: char,
1925        chars: &mut State,
1926    ) -> Result<String, TokenizerError> {
1927        let error_loc = chars.location();
1928        chars.next(); // consume the opening quote
1929        let quote_end = Word::matching_end_quote(quote_start);
1930        let (s, last_char) = self.parse_quoted_ident(chars, quote_end);
1931
1932        if last_char == Some(quote_end) {
1933            Ok(s)
1934        } else {
1935            self.tokenizer_error(
1936                error_loc,
1937                format!("Expected close delimiter '{quote_end}' before EOF."),
1938            )
1939        }
1940    }
1941
1942    /// Read a single quoted string, starting with the opening quote.
1943    fn tokenize_escaped_single_quoted_string(
1944        &self,
1945        starting_loc: Location,
1946        chars: &mut State,
1947    ) -> Result<String, TokenizerError> {
1948        if let Some(s) = unescape_single_quoted_string(chars) {
1949            return Ok(s);
1950        }
1951
1952        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
1953    }
1954
1955    /// Reads a string literal quoted by single or triple quote characters.
1956    /// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
1957    fn tokenize_single_or_triple_quoted_string<F>(
1958        &self,
1959        chars: &mut State,
1960        quote_style: char,
1961        backslash_escape: bool,
1962        single_quote_token: F,
1963        triple_quote_token: F,
1964    ) -> Result<Option<Token>, TokenizerError>
1965    where
1966        F: Fn(String) -> Token,
1967    {
1968        let error_loc = chars.location();
1969
1970        let mut num_opening_quotes = 0u8;
1971        for _ in 0..3 {
1972            if Some(&quote_style) == chars.peek() {
1973                chars.next(); // Consume quote.
1974                num_opening_quotes += 1;
1975            } else {
1976                break;
1977            }
1978        }
1979
1980        let (token_fn, num_quote_chars) = match num_opening_quotes {
1981            1 => (single_quote_token, NumStringQuoteChars::One),
1982            2 => {
1983                // If we matched exactly two opening quotes, this is an empty string literal.
1984                return Ok(Some(single_quote_token("".into())));
1985            }
1986            3 => {
1987                let Some(num_quote_chars) = NonZeroU8::new(3) else {
1988                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
1989                };
1990                (
1991                    triple_quote_token,
1992                    NumStringQuoteChars::Many(num_quote_chars),
1993                )
1994            }
1995            _ => {
1996                return self.tokenizer_error(error_loc, "invalid string literal opening");
1997            }
1998        };
1999
2000        let settings = TokenizeQuotedStringSettings {
2001            quote_style,
2002            num_quote_chars,
2003            num_opening_quotes_to_consume: 0,
2004            backslash_escape,
2005        };
2006
2007        self.tokenize_quoted_string(chars, settings)
2008            .map(token_fn)
2009            .map(Some)
2010    }
2011
2012    /// Reads a string literal quoted by a single quote character.
2013    fn tokenize_single_quoted_string(
2014        &self,
2015        chars: &mut State,
2016        quote_style: char,
2017        backslash_escape: bool,
2018    ) -> Result<String, TokenizerError> {
2019        self.tokenize_quoted_string(
2020            chars,
2021            TokenizeQuotedStringSettings {
2022                quote_style,
2023                num_quote_chars: NumStringQuoteChars::One,
2024                num_opening_quotes_to_consume: 1,
2025                backslash_escape,
2026            },
2027        )
2028    }
2029
2030    /// Reads a quote-delimited string, expecting `chars.next()` to deliver the opening quote.
2031    ///
2032    /// See <https://docs.oracle.com/en/database/oracle/oracle-database/21/sqlrf/Literals.html#GUID-1824CBAA-6E16-4921-B2A6-112FB02248DA>
2033    fn tokenize_quote_delimited_string(
2034        &self,
2035        chars: &mut State,
2036        // the prefix that introduced the possible literal or word,
2037        // e.g. "Q" or "nq"
2038        literal_prefix: &[char],
2039    ) -> Result<QuoteDelimitedString, TokenizerError> {
2040        let literal_start_loc = chars.location();
2041        chars.next();
2042
2043        let start_quote_loc = chars.location();
2044        let (start_quote, end_quote) = match chars.next() {
2045            None | Some(' ') | Some('\t') | Some('\r') | Some('\n') => {
2046                return self.tokenizer_error(
2047                    start_quote_loc,
2048                    format!(
2049                        "Unexpected space, tab, newline, or EOF after '{}'",
2050                        String::from_iter(literal_prefix)
2051                    ),
2052                );
2053            }
2054            Some(c) => (
2055                c,
2056                match c {
2057                    '[' => ']',
2058                    '{' => '}',
2059                    '<' => '>',
2060                    '(' => ')',
2061                    c => c,
2062                },
2063            ),
2064        };
2065
2066        // read the string literal until the end quote character immediately followed by a closing single quote
2067        let mut value = String::new();
2068        while let Some(ch) = chars.next() {
2069            if ch == end_quote {
2070                if let Some('\'') = chars.peek() {
2071                    chars.next(); // consume the closing quote
2072                    return Ok(QuoteDelimitedString {
2073                        start_quote,
2074                        value,
2075                        end_quote,
2076                    });
2077                }
2078            }
2079            value.push(ch);
2080        }
2081
2082        self.tokenizer_error(literal_start_loc, "Unterminated string literal")
2083    }
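    // Illustrative example: for an Oracle-style literal `q'[it's]'`, `[` maps to
    // the `]` end quote, so the body may contain unescaped single quotes and the
    // returned value is `it's`.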
2084
2085    /// Read a quoted string.
2086    fn tokenize_quoted_string(
2087        &self,
2088        chars: &mut State,
2089        settings: TokenizeQuotedStringSettings,
2090    ) -> Result<String, TokenizerError> {
2091        let mut s = String::new();
2092        let error_loc = chars.location();
2093
2094        // Consume any opening quotes.
2095        for _ in 0..settings.num_opening_quotes_to_consume {
2096            if Some(settings.quote_style) != chars.next() {
2097                return self.tokenizer_error(error_loc, "invalid string literal opening");
2098            }
2099        }
2100
2101        let mut num_consecutive_quotes = 0;
2102        while let Some(&ch) = chars.peek() {
2103            let pending_final_quote = match settings.num_quote_chars {
2104                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
2105                n @ NumStringQuoteChars::Many(count)
2106                    if num_consecutive_quotes + 1 == count.get() =>
2107                {
2108                    Some(n)
2109                }
2110                NumStringQuoteChars::Many(_) => None,
2111            };
2112
2113            match ch {
2114                char if char == settings.quote_style && pending_final_quote.is_some() => {
2115                    chars.next(); // consume
2116
2117                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
2118                        // For an initial string like `"""abc"""`, at this point we have
2119                        // `abc""` in the buffer and have now matched the final `"`.
2120                        // However, the string to return is simply `abc`, so we strip off
2121                        // the trailing quotes before returning.
2122                        let mut buf = s.chars();
2123                        for _ in 1..count.get() {
2124                            buf.next_back();
2125                        }
2126                        return Ok(buf.as_str().to_string());
2127                    } else if chars
2128                        .peek()
2129                        .map(|c| *c == settings.quote_style)
2130                        .unwrap_or(false)
2131                    {
2132                        s.push(ch);
2133                        if !self.unescape {
2134                            // In no-escape mode, the given query has to be saved completely
2135                            s.push(ch);
2136                        }
2137                        chars.next();
2138                    } else {
2139                        return Ok(s);
2140                    }
2141                }
2142                '\\' if settings.backslash_escape => {
2143                    // consume backslash
2144                    chars.next();
2145
2146                    num_consecutive_quotes = 0;
2147
2148                    if let Some(next) = chars.peek() {
2149                        if !self.unescape
2150                            || (self.dialect.ignores_wildcard_escapes()
2151                                && (*next == '%' || *next == '_'))
2152                        {
2153                            // In no-escape mode, the given query has to be saved completely
2154                            // including backslashes. Similarly, with `ignores_wildcard_escapes`,
2155                            // the backslash before a `%` or `_` wildcard is not stripped.
2156                            s.push(ch);
2157                            s.push(*next);
2158                            chars.next(); // consume next
2159                        } else {
2160                            let n = match next {
2161                                '0' => '\0',
2162                                'a' => '\u{7}',
2163                                'b' => '\u{8}',
2164                                'f' => '\u{c}',
2165                                'n' => '\n',
2166                                'r' => '\r',
2167                                't' => '\t',
2168                                'Z' => '\u{1a}',
2169                                _ => *next,
2170                            };
2171                            s.push(n);
2172                            chars.next(); // consume next
2173                        }
2174                    }
2175                }
2176                ch => {
2177                    chars.next(); // consume ch
2178
2179                    if ch == settings.quote_style {
2180                        num_consecutive_quotes += 1;
2181                    } else {
2182                        num_consecutive_quotes = 0;
2183                    }
2184
2185                    s.push(ch);
2186                }
2187            }
2188        }
2189        self.tokenizer_error(error_loc, "Unterminated string literal")
2190    }
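    // Illustrative example: with `unescape` enabled, `'it''s'` yields `it's`
    // (the doubled quote collapses to one); with `unescape` disabled, the raw
    // `it''s` is kept so the original query text can be reproduced.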
2191
2192    fn tokenize_multiline_comment(
2193        &self,
2194        chars: &mut State,
2195    ) -> Result<Option<Token>, TokenizerError> {
2196        let mut s = String::new();
2197        let mut nested = 1;
2198        let supports_nested_comments = self.dialect.supports_nested_comments();
2199
2200        loop {
2201            match chars.next() {
2202                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
2203                    chars.next(); // consume the '*'
2204                    s.push('/');
2205                    s.push('*');
2206                    nested += 1;
2207                }
2208                Some('*') if matches!(chars.peek(), Some('/')) => {
2209                    chars.next(); // consume the '/'
2210                    nested -= 1;
2211                    if nested == 0 {
2212                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
2213                    }
2214                    s.push('*');
2215                    s.push('/');
2216                }
2217                Some(ch) => {
2218                    s.push(ch);
2219                }
2220                None => {
2221                    break self.tokenizer_error(
2222                        chars.location(),
2223                        "Unexpected EOF while in a multi-line comment",
2224                    );
2225                }
2226            }
2227        }
2228    }
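    // Illustrative example: when `supports_nested_comments()` is true,
    // `/*a/*b*/c*/` is one comment token containing `a/*b*/c`; otherwise the
    // first `*/` would have ended the comment (see
    // `tokenize_nested_multiline_comment_sketch` in the tests below).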
2229
2230    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
2231        let mut last_char = None;
2232        let mut s = String::new();
2233        while let Some(ch) = chars.next() {
2234            if ch == quote_end {
2235                if chars.peek() == Some(&quote_end) {
2236                    chars.next();
2237                    s.push(ch);
2238                    if !self.unescape {
2239                        // In no-escape mode, the given query has to be saved completely
2240                        s.push(ch);
2241                    }
2242                } else {
2243                    last_char = Some(quote_end);
2244                    break;
2245                }
2246            } else {
2247                s.push(ch);
2248            }
2249        }
2250        (s, last_char)
2251    }
2252
2253    #[allow(clippy::unnecessary_wraps)]
2254    fn consume_and_return(
2255        &self,
2256        chars: &mut State,
2257        t: Token,
2258    ) -> Result<Option<Token>, TokenizerError> {
2259        chars.next();
2260        Ok(Some(t))
2261    }
2262}
2263
2264/// Read from `chars` until `predicate` returns `false` or EOF is hit.
2265/// Return the characters read as String, and keep the first non-matching
2266/// char available as `chars.next()`.
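///
/// For example (illustrative): if the remaining input is `abc1` and the
/// predicate is `|ch| ch.is_alphabetic()`, this returns `"abc"` and leaves
/// `'1'` to be consumed by the caller.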
2267fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
2268    let mut s = String::new();
2269    while let Some(&ch) = chars.peek() {
2270        if predicate(ch) {
2271            chars.next(); // consume
2272            s.push(ch);
2273        } else {
2274            break;
2275        }
2276    }
2277    s
2278}
2279
2280/// Same as peeking_take_while, but also passes the next character to the predicate.
2281fn peeking_next_take_while(
2282    chars: &mut State,
2283    mut predicate: impl FnMut(char, Option<char>) -> bool,
2284) -> String {
2285    let mut s = String::new();
2286    while let Some(&ch) = chars.peek() {
2287        let next_char = chars.peekable.clone().nth(1);
2288        if predicate(ch, next_char) {
2289            chars.next(); // consume
2290            s.push(ch);
2291        } else {
2292            break;
2293        }
2294    }
2295    s
2296}
2297
2298fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
2299    Unescape::new(chars).unescape()
2300}
2301
2302struct Unescape<'a: 'b, 'b> {
2303    chars: &'b mut State<'a>,
2304}
2305
2306impl<'a: 'b, 'b> Unescape<'a, 'b> {
2307    fn new(chars: &'b mut State<'a>) -> Self {
2308        Self { chars }
2309    }
2310    fn unescape(mut self) -> Option<String> {
2311        let mut unescaped = String::new();
2312
2313        self.chars.next();
2314
2315        while let Some(c) = self.chars.next() {
2316            if c == '\'' {
2317                // case: ''''
2318                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
2319                    self.chars.next();
2320                    unescaped.push('\'');
2321                    continue;
2322                }
2323                return Some(unescaped);
2324            }
2325
2326            if c != '\\' {
2327                unescaped.push(c);
2328                continue;
2329            }
2330
2331            let c = match self.chars.next()? {
2332                'b' => '\u{0008}',
2333                'f' => '\u{000C}',
2334                'n' => '\n',
2335                'r' => '\r',
2336                't' => '\t',
2337                'u' => self.unescape_unicode_16()?,
2338                'U' => self.unescape_unicode_32()?,
2339                'x' => self.unescape_hex()?,
2340                c if c.is_digit(8) => self.unescape_octal(c)?,
2341                c => c,
2342            };
2343
2344            unescaped.push(Self::check_null(c)?);
2345        }
2346
2347        None
2348    }
2349
2350    #[inline]
2351    fn check_null(c: char) -> Option<char> {
2352        if c == '\0' {
2353            None
2354        } else {
2355            Some(c)
2356        }
2357    }
2358
2359    #[inline]
2360    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
2361        // u32 is used here because Postgres wraps the value on overflow (hence the `n & 0xFF` below) rather than raising an error.
2362        match u32::from_str_radix(s, RADIX) {
2363            Err(_) => None,
2364            Ok(n) => {
2365                let n = n & 0xFF;
2366                if n <= 127 {
2367                    char::from_u32(n)
2368                } else {
2369                    None
2370                }
2371            }
2372        }
2373    }
2374
2375    // Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
2376    fn unescape_hex(&mut self) -> Option<char> {
2377        let mut s = String::new();
2378
2379        for _ in 0..2 {
2380            match self.next_hex_digit() {
2381                Some(c) => s.push(c),
2382                None => break,
2383            }
2384        }
2385
2386        if s.is_empty() {
2387            return Some('x');
2388        }
2389
2390        Self::byte_to_char::<16>(&s)
2391    }
2392
2393    #[inline]
2394    fn next_hex_digit(&mut self) -> Option<char> {
2395        match self.chars.peek() {
2396            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
2397            _ => None,
2398        }
2399    }
2400
2401    // Octal byte value. \o, \oo, \ooo (o = 0–7)
2402    fn unescape_octal(&mut self, c: char) -> Option<char> {
2403        let mut s = String::new();
2404
2405        s.push(c);
2406        for _ in 0..2 {
2407                match self.next_octal_digit() {
2408                Some(c) => s.push(c),
2409                None => break,
2410            }
2411        }
2412
2413        Self::byte_to_char::<8>(&s)
2414    }
2415
2416    #[inline]
2417    fn next_octal_digit(&mut self) -> Option<char> {
2418        match self.chars.peek() {
2419            Some(c) if c.is_digit(8) => self.chars.next(),
2420            _ => None,
2421        }
2422    }
2423
2424    // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
2425    fn unescape_unicode_16(&mut self) -> Option<char> {
2426        self.unescape_unicode::<4>()
2427    }
2428
2429    // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
2430    fn unescape_unicode_32(&mut self) -> Option<char> {
2431        self.unescape_unicode::<8>()
2432    }
2433
2434    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
2435        let mut s = String::new();
2436        for _ in 0..NUM {
2437            s.push(self.chars.next()?);
2438        }
2439        match u32::from_str_radix(&s, 16) {
2440            Err(_) => None,
2441            Ok(n) => char::from_u32(n),
2442        }
2443    }
2444}
2445
2446fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
2447    let mut unescaped = String::new();
2448    chars.next(); // consume the opening quote
2449    while let Some(c) = chars.next() {
2450        match c {
2451            '\'' => {
2452                if chars.peek() == Some(&'\'') {
2453                    chars.next();
2454                    unescaped.push('\'');
2455                } else {
2456                    return Ok(unescaped);
2457                }
2458            }
2459            '\\' => match chars.peek() {
2460                Some('\\') => {
2461                    chars.next();
2462                    unescaped.push('\\');
2463                }
2464                Some('+') => {
2465                    chars.next();
2466                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
2467                }
2468                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
2469            },
2470            _ => {
2471                unescaped.push(c);
2472            }
2473        }
2474    }
2475    Err(TokenizerError {
2476        message: "Unterminated unicode encoded string literal".to_string(),
2477        location: chars.location(),
2478    })
2479}
2480
2481fn take_char_from_hex_digits(
2482    chars: &mut State<'_>,
2483    max_digits: usize,
2484) -> Result<char, TokenizerError> {
2485    let mut result = 0u32;
2486    for _ in 0..max_digits {
2487        let next_char = chars.next().ok_or_else(|| TokenizerError {
2488            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
2489                .to_string(),
2490            location: chars.location(),
2491        })?;
2492        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
2493            message: format!("Invalid hex digit in escaped unicode string: {next_char}"),
2494            location: chars.location(),
2495        })?;
2496        result = result * 16 + digit;
2497    }
2498    char::from_u32(result).ok_or_else(|| TokenizerError {
2499        message: format!("Invalid unicode character: {result:x}"),
2500        location: chars.location(),
2501    })
2502}
2503
2504#[cfg(test)]
2505mod tests {
2506    use super::*;
2507    use crate::dialect::{
2508        BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
2509    };
2510    use crate::test_utils::{all_dialects_except, all_dialects_where};
2511    use core::fmt::Debug;
2512
2513    #[test]
2514    fn tokenizer_error_impl() {
2515        let err = TokenizerError {
2516            message: "test".into(),
2517            location: Location { line: 1, column: 1 },
2518        };
2519        #[cfg(feature = "std")]
2520        {
2521            use std::error::Error;
2522            assert!(err.source().is_none());
2523        }
2524        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
2525    }
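
    // Illustrative sketch (not part of the upstream suite): in dialects without
    // geometric-type operators, `?` followed by digits lexes as a placeholder.
    #[test]
    fn tokenize_question_mark_placeholder_sketch() {
        let sql = String::from("SELECT ?3");
        let dialect = MySqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Placeholder(String::from("?3")),
        ];

        compare(expected, tokens);
    }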
2526
2527    #[test]
2528    fn tokenize_select_1() {
2529        let sql = String::from("SELECT 1");
2530        let dialect = GenericDialect {};
2531        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2532
2533        let expected = vec![
2534            Token::make_keyword("SELECT"),
2535            Token::Whitespace(Whitespace::Space),
2536            Token::Number(String::from("1"), false),
2537        ];
2538
2539        compare(expected, tokens);
2540    }
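
    // Illustrative sketch: `0x...` lexes as a hex literal with the `0x` prefix
    // stripped from the token value.
    #[test]
    fn tokenize_hex_literal_sketch() {
        let sql = String::from("SELECT 0xdeadBEEF");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::HexStringLiteral(String::from("deadBEEF")),
        ];

        compare(expected, tokens);
    }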
2541
2542    #[test]
2543    fn tokenize_select_float() {
2544        let sql = String::from("SELECT .1");
2545        let dialect = GenericDialect {};
2546        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2547
2548        let expected = vec![
2549            Token::make_keyword("SELECT"),
2550            Token::Whitespace(Whitespace::Space),
2551            Token::Number(String::from(".1"), false),
2552        ];
2553
2554        compare(expected, tokens);
2555    }
2556
2557    #[test]
2558    fn tokenize_clickhouse_double_equal() {
2559        let sql = String::from("SELECT foo=='1'");
2560        let dialect = ClickHouseDialect {};
2561        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2562        let tokens = tokenizer.tokenize().unwrap();
2563
2564        let expected = vec![
2565            Token::make_keyword("SELECT"),
2566            Token::Whitespace(Whitespace::Space),
2567            Token::Word(Word {
2568                value: "foo".to_string(),
2569                quote_style: None,
2570                keyword: Keyword::NoKeyword,
2571            }),
2572            Token::DoubleEq,
2573            Token::SingleQuotedString("1".to_string()),
2574        ];
2575
2576        compare(expected, tokens);
2577    }
2578
    #[test]
    fn tokenize_numeric_literal_underscore() {
        let dialect = GenericDialect {};
        let sql = String::from("SELECT 10_000");
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("10".to_string(), false),
            Token::make_word("_000", None),
        ];
        compare(expected, tokens);

        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
            "SELECT 10_000, _10_000, 10_00_, 10___0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_000".to_string(), false),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::make_word("_10_000", None), // a leading underscore tokenizes as a word (parsed as a column identifier)
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_00".to_string(), false),
                Token::make_word("_", None), // a trailing underscore tokenizes as a word (a syntax error in some dialects)
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10".to_string(), false),
                Token::make_word("___0", None), // consecutive underscores tokenize as a word (a syntax error in some dialects)
            ],
        );
    }

    #[test]
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        // println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }

    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        // println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged() {
        let test_cases = vec![
            (
                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
                        tag: Some("tag".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$x$ab$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "x$ab".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("0$abc$$abc$1"),
                vec![
                    Token::Number("0".into(), false),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    }),
                    Token::Number("1".into(), false),
                ]
            ),
            (
                String::from("$function$abc$q$data$q$$function$"),
                vec![
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "abc$q$data$q$".into(),
                        tag: Some("function".into()),
                    }),
                ]
            ),
        ];

        let dialect = GenericDialect {};
        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated() {
        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 91
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
        let sql = String::from("SELECT $abc$abc$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 17
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_placeholder() {
        let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
        let dialect = SQLiteDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$ABC$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC".into()),
            ]
        );
    }

    #[test]
    fn tokenize_nested_dollar_quoted_strings() {
        let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "dollar $nested$ string".into(),
                tag: Some("tag".into()),
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_empty() {
        let sql = String::from("SELECT $$$$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged() {
        let sql =
            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_unterminated() {
        let sql = String::from(
            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
        );
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted string".into(),
                location: Location {
                    line: 1,
                    column: 86
                }
            })
        );
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let test_cases = vec![
            (
                String::from("0--this is a comment\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                String::from("0--this is a comment\r1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r1".to_string(),
                    }),
                ],
            ),
            (
                String::from("0--this is a comment\r\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
        ];

        let dialect = GenericDialect {};

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_comment_postgres() {
        let sql = String::from("1--\r0");

        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "\r".to_string(),
            }),
            Token::Number("0".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_nested_multiline_comment() {
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
                )),
                Token::Whitespace(Whitespace::Space),
                Token::Div,
                Token::Word(Word {
                    value: "comment".to_string(),
                    quote_style: None,
                    keyword: Keyword::COMMENT,
                }),
                Token::Mul,
                Token::Div,
                Token::Number("1".to_string(), false),
            ],
        );

        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
                )),
                Token::Number("1".to_string(), false),
            ],
        );

        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "SELECT 1/* a /* b */ c */0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
                Token::Number("0".to_string(), false),
            ],
        );
    }

    #[test]
    fn tokenize_nested_multiline_comment_empty() {
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "select 1/*/**/*/0",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
                Token::Number("0".to_string(), false),
            ],
        );
    }

    #[test]
    fn tokenize_nested_comments_if_not_supported() {
        all_dialects_except(|d| d.supports_nested_comments()).tokenizes_to(
            "SELECT 1/*/* nested comment */*/0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "/* nested comment ".to_string(),
                )),
                Token::Mul,
                Token::Div,
                Token::Number("0".to_string(), false),
            ],
        );
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                location: Location { line: 1, column: 1 },
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_like_match() {
        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_snowflake_div() {
        let sql = r#"field/1000"#;
        let dialect = SnowflakeDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word(r#"field"#, None),
            Token::Div,
            Token::Number("1000".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (1, 7).into(),
                (1, 8).into(),
            ),
            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Newline),
                (1, 10).into(),
                (2, 1).into(),
            ),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (2, 1).into(),
                (2, 2).into(),
            ),
            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
        ];
        compare(expected, tokens);
    }

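    /// Asserts that the actual token stream matches the expected one; kept as
    /// a helper so the (commented-out) debug output below can be re-enabled in
    /// one place when a test fails.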
    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        //println!("------------------------------");
        //println!("tokens   = {:?}", actual);
        //println!("expected = {:?}", expected);
        //println!("------------------------------");
        assert_eq!(expected, actual);
    }

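    /// Wraps `s` in single quotes and runs it through
    /// `unescape_single_quoted_string`; `expected` is `None` when the escape
    /// sequence should be rejected.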
    fn check_unescape(s: &str, expected: Option<&str>) {
        let s = format!("'{s}'");
        let mut state = State {
            peekable: s.chars().peekable(),
            line: 0,
            col: 0,
        };

        assert_eq!(
            unescape_single_quoted_string(&mut state),
            expected.map(|s| s.to_string())
        );
    }

    #[test]
    fn test_unescape() {
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

        // 16 and 32-bit hexadecimal Unicode character value
        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None);
        check_unescape(r"\u0000", None);
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
        check_unescape(r"\U00110000", None);
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

        // hexadecimal byte value
        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}"));
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x"));
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

        // octal byte value
        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}"));
        check_unescape(r"\1232", Some("\u{0053}2"));
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None);
        check_unescape(r"\080", None);

        // others
        check_unescape(r"\9", Some("9"));
        check_unescape(r"''", Some("'"));
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }

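    // A minimal sketch, not part of the original suite: it drives
    // `unescape_unicode_single_quoted_string` directly through a hand-built
    // `State`, mirroring how `check_unescape` above drives
    // `unescape_single_quoted_string`. The input is assumed to be a complete
    // `U&'...'` literal body with the default `\` escape character.
    #[test]
    fn test_unescape_unicode_sketch() {
        // `\0061` is a 4-digit escape and `\+000061` a 6-digit escape, both
        // of which decode to 'a'.
        let s = r"'d\0061t\+000061'";
        let mut state = State {
            peekable: s.chars().peekable(),
            line: 0,
            col: 0,
        };
        assert_eq!(
            unescape_unicode_single_quoted_string(&mut state),
            Ok("data".to_string())
        );
    }
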
3653    #[test]
3654    fn tokenize_numeric_prefix_trait() {
3655        #[derive(Debug)]
3656        struct NumericPrefixDialect;
3657
3658        impl Dialect for NumericPrefixDialect {
3659            fn is_identifier_start(&self, ch: char) -> bool {
3660                ch.is_ascii_lowercase()
3661                    || ch.is_ascii_uppercase()
3662                    || ch.is_ascii_digit()
3663                    || ch == '$'
3664            }
3665
3666            fn is_identifier_part(&self, ch: char) -> bool {
3667                ch.is_ascii_lowercase()
3668                    || ch.is_ascii_uppercase()
3669                    || ch.is_ascii_digit()
3670                    || ch == '_'
3671                    || ch == '$'
3672                    || ch == '{'
3673                    || ch == '}'
3674            }
3675
3676            fn supports_numeric_prefix(&self) -> bool {
3677                true
3678            }
3679        }
3680
3681        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
3682        tokenize_numeric_prefix_inner(&HiveDialect {});
3683        tokenize_numeric_prefix_inner(&MySqlDialect {});
3684    }
3685
3686    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
3687        let sql = r#"SELECT * FROM 1"#;
3688        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
3689        let expected = vec![
3690            Token::make_keyword("SELECT"),
3691            Token::Whitespace(Whitespace::Space),
3692            Token::Mul,
3693            Token::Whitespace(Whitespace::Space),
3694            Token::make_keyword("FROM"),
3695            Token::Whitespace(Whitespace::Space),
3696            Token::Number(String::from("1"), false),
3697        ];
3698        compare(expected, tokens);
3699    }
3700
3701    #[test]
3702    fn tokenize_quoted_string_escape() {
3703        let dialect = SnowflakeDialect {};
3704        for (sql, expected, expected_unescaped) in [
3705            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
3706            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
3707            (r#"'\\'"#, r#"\\"#, r#"\"#),
3708            (
3709                r#"'\0\a\b\f\n\r\t\Z'"#,
3710                r#"\0\a\b\f\n\r\t\Z"#,
3711                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
3712            ),
3713            (r#"'\"'"#, r#"\""#, "\""),
3714            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
3715            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
3716            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
3717            (r#"'\q'"#, r#"\q"#, r#"q"#),
3718            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
3719            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
3720        ] {
3721            let tokens = Tokenizer::new(&dialect, sql)
3722                .with_unescape(false)
3723                .tokenize()
3724                .unwrap();
3725            let expected = vec![Token::SingleQuotedString(expected.to_string())];
3726            compare(expected, tokens);
3727
3728            let tokens = Tokenizer::new(&dialect, sql)
3729                .with_unescape(true)
3730                .tokenize()
3731                .unwrap();
3732            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
3733            compare(expected, tokens);
3734        }
3735
3736        for sql in [r#"'\'"#, r#"'ab\'"#] {
3737            let mut tokenizer = Tokenizer::new(&dialect, sql);
3738            assert_eq!(
3739                "Unterminated string literal",
3740                tokenizer.tokenize().unwrap_err().message.as_str(),
3741            );
3742        }
3743
3744        // Non-escape dialect
3745        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
3746            let dialect = GenericDialect {};
3747            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3748
3749            let expected = vec![Token::SingleQuotedString(expected.to_string())];
3750
3751            compare(expected, tokens);
3752        }
3753
3754        // MySQL special case for LIKE escapes
3755        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
3756            let dialect = MySqlDialect {};
3757            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3758
3759            let expected = vec![Token::SingleQuotedString(expected.to_string())];
3760
3761            compare(expected, tokens);
3762        }
3763    }
3764
3765    #[test]
3766    fn tokenize_triple_quoted_string() {
3767        fn check<F>(
3768            q: char, // The quote character to test
3769            r: char, // An alternate quote character.
3770            quote_token: F,
3771        ) where
3772            F: Fn(String) -> Token,
3773        {
3774            let dialect = BigQueryDialect {};
3775
3776            for (sql, expected, expected_unescaped) in [
3777                // Empty string
3778                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
3779                // Should not count escaped quote as end of string.
3780                (
3781                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
3782                    format!(r#"ab{q}{q}\{q}{q}cd"#),
3783                    format!(r#"ab{q}{q}{q}{q}cd"#),
3784                ),
3785                // Simple string
3786                (
3787                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
3788                    "abc".into(),
3789                    "abc".into(),
3790                ),
3791                // Mix single-double quotes unescaped.
3792                (
3793                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
3794                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
3795                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
3796                ),
3797                // Escaped quote.
3798                (
3799                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
3800                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
3801                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
3802                ),
3803                // backslash-escaped quote characters.
3804                (
3805                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
3806                    r#"a\'\'b\'c\'d"#.into(),
3807                    r#"a''b'c'd"#.into(),
3808                ),
3809                // backslash-escaped characters
3810                (
3811                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
3812                    r#"abc\0\n\rdef"#.into(),
3813                    "abc\0\n\rdef".into(),
3814                ),
3815            ] {
3816                let tokens = Tokenizer::new(&dialect, sql.as_str())
3817                    .with_unescape(false)
3818                    .tokenize()
3819                    .unwrap();
3820                let expected = vec![quote_token(expected.to_string())];
3821                compare(expected, tokens);
3822
3823                let tokens = Tokenizer::new(&dialect, sql.as_str())
3824                    .with_unescape(true)
3825                    .tokenize()
3826                    .unwrap();
3827                let expected = vec![quote_token(expected_unescaped.to_string())];
3828                compare(expected, tokens);
3829            }
3830
3831            for sql in [
3832                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
3833                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
3834                format!(r#"{q}{q}{q}{q}"#),
3835                format!(r#"{q}{q}{q}{r}{r}"#),
3836                format!(r#"{q}{q}{q}abc{q}"#),
3837                format!(r#"{q}{q}{q}abc{q}{q}"#),
3838                format!(r#"{q}{q}{q}abc"#),
3839            ] {
3840                let dialect = BigQueryDialect {};
3841                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
3842                assert_eq!(
3843                    "Unterminated string literal",
3844                    tokenizer.tokenize().unwrap_err().message.as_str(),
3845                );
3846            }
3847        }
3848
3849        check('"', '\'', Token::TripleDoubleQuotedString);
3850
3851        check('\'', '"', Token::TripleSingleQuotedString);
3852
3853        let dialect = BigQueryDialect {};
3854
3855        let sql = r#"""''"#;
3856        let tokens = Tokenizer::new(&dialect, sql)
3857            .with_unescape(true)
3858            .tokenize()
3859            .unwrap();
3860        let expected = vec![
3861            Token::DoubleQuotedString("".to_string()),
3862            Token::SingleQuotedString("".to_string()),
3863        ];
3864        compare(expected, tokens);
3865
3866        let sql = r#"''"""#;
3867        let tokens = Tokenizer::new(&dialect, sql)
3868            .with_unescape(true)
3869            .tokenize()
3870            .unwrap();
3871        let expected = vec![
3872            Token::SingleQuotedString("".to_string()),
3873            Token::DoubleQuotedString("".to_string()),
3874        ];
3875        compare(expected, tokens);
3876
3877        // Non-triple quoted string dialect
3878        let dialect = SnowflakeDialect {};
3879        let sql = r#"''''''"#;
3880        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3881        let expected = vec![Token::SingleQuotedString("''".to_string())];
3882        compare(expected, tokens);
3883    }
3884
3885    #[test]
3886    fn test_mysql_users_grantees() {
3887        let dialect = MySqlDialect {};
3888
3889        let sql = "CREATE USER `root`@`%`";
3890        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3891        let expected = vec![
3892            Token::make_keyword("CREATE"),
3893            Token::Whitespace(Whitespace::Space),
3894            Token::make_keyword("USER"),
3895            Token::Whitespace(Whitespace::Space),
3896            Token::make_word("root", Some('`')),
3897            Token::AtSign,
3898            Token::make_word("%", Some('`')),
3899        ];
3900        compare(expected, tokens);
3901    }
3902
3903    #[test]
3904    fn test_postgres_abs_without_space_and_string_literal() {
3905        let dialect = MySqlDialect {};
3906
3907        let sql = "SELECT @'1'";
3908        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3909        let expected = vec![
3910            Token::make_keyword("SELECT"),
3911            Token::Whitespace(Whitespace::Space),
3912            Token::AtSign,
3913            Token::SingleQuotedString("1".to_string()),
3914        ];
3915        compare(expected, tokens);
3916    }
3917
3918    #[test]
3919    fn test_postgres_abs_without_space_and_quoted_column() {
3920        let dialect = MySqlDialect {};
3921
3922        let sql = r#"SELECT @"bar" FROM foo"#;
3923        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3924        let expected = vec![
3925            Token::make_keyword("SELECT"),
3926            Token::Whitespace(Whitespace::Space),
3927            Token::AtSign,
3928            Token::DoubleQuotedString("bar".to_string()),
3929            Token::Whitespace(Whitespace::Space),
3930            Token::make_keyword("FROM"),
3931            Token::Whitespace(Whitespace::Space),
3932            Token::make_word("foo", None),
3933        ];
3934        compare(expected, tokens);
3935    }
3936
    #[test]
    fn test_national_strings_backslash_escape_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\'",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("''\\".to_string()),
                ],
            );
    }

    #[test]
    fn test_national_strings_backslash_escape_supported() {
        all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\''",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("'''".to_string()),
                ],
            );
    }

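    // Without e'...' escape-string support, the `e` prefix tokenizes as an
    // ordinary word followed by a regular single-quoted string.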
    #[test]
    fn test_string_escape_constant_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("e", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );

        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("E", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );
    }

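    // With escape-string constants, the `e` prefix and body collapse into one
    // EscapedStringLiteral token, with `\'` unescaped to a quote.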
    #[test]
    fn test_string_escape_constant_supported() {
        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );
    }

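    // Dialects that require whitespace after `--` only start a single-line
    // comment when the dashes are followed by whitespace; otherwise (including
    // a bare `--` at end of input) they tokenize as two minus signs.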
    #[test]
    fn test_whitespace_required_after_single_line_comment() {
        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                    Token::SingleQuotedString("abc".to_string()),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                ],
            );
    }

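    // Where no whitespace is required, `--` always begins a single-line
    // comment, even when immediately followed by content or end of input.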
    #[test]
    fn test_whitespace_not_required_after_single_line_comment() {
        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "".to_string(),
                    }),
                ],
            );
    }

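    // With numeric-prefix support, an identifier may begin with digits: a
    // standalone `12e34` still lexes as a number in scientific notation, but
    // after a period it becomes a plain word.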
    #[test]
    fn test_tokenize_identifiers_numeric_prefix() {
        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        );
    }

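    // `_col` after a period is an identifier, and `.` followed by `_` must not
    // start a numeric literal: tokenizing `._123` or `._abc` should fail.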
    #[test]
    fn tokenize_period_underscore() {
        let sql = String::from("SELECT table._col");
        // a dialect that supports underscores in numeric literals
        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "table".to_string(),
                quote_style: None,
                keyword: Keyword::TABLE,
            }),
            Token::Period,
            Token::Word(Word {
                value: "_col".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
        ];

        compare(expected, tokens);

        let sql = String::from("SELECT ._123");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }

        let sql = String::from("SELECT ._abc");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }
    }

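    // PostgreSQL uses `?` in JSONB operators, so a bare `?` between
    // identifiers should survive as its own Question token.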
    #[test]
    fn tokenize_question_mark() {
        let dialect = PostgreSqlDialect {};
        let sql = "SELECT x ? y";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        compare(
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("x", None),
                Token::Whitespace(Whitespace::Space),
                Token::Question,
                Token::Whitespace(Whitespace::Space),
                Token::make_word("y", None),
            ],
            tokens,
        );
    }
}