sqlparser/tokenizer.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! SQL Tokenizer
19//!
20//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
21//!
22//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
23
24#[cfg(not(feature = "std"))]
25use alloc::{
26    borrow::ToOwned,
27    format,
28    string::{String, ToString},
29    vec,
30    vec::Vec,
31};
32use core::iter::Peekable;
33use core::num::NonZeroU8;
34use core::str::Chars;
35use core::{cmp, fmt};
36
37#[cfg(feature = "serde")]
38use serde::{Deserialize, Serialize};
39
40#[cfg(feature = "visitor")]
41use sqlparser_derive::{Visit, VisitMut};
42
43use crate::dialect::Dialect;
44use crate::dialect::{
45    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
46    SnowflakeDialect,
47};
48use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
49use crate::{ast::DollarQuotedString, dialect::HiveDialect};
50
51/// SQL Token enumeration
52#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
53#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
54#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
55pub enum Token {
56    /// An end-of-file marker, not a real token
57    EOF,
58    /// A keyword (like SELECT) or an optionally quoted SQL identifier
59    Word(Word),
60    /// An unsigned numeric literal
61    Number(String, bool),
62    /// A character that could not be tokenized
63    Char(char),
64    /// Single quoted string: i.e: 'string'
65    SingleQuotedString(String),
66    /// Double quoted string: i.e: "string"
67    DoubleQuotedString(String),
68    /// Triple single quoted strings: Example '''abc'''
69    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
70    TripleSingleQuotedString(String),
71    /// Triple double quoted strings: Example """abc"""
72    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
73    TripleDoubleQuotedString(String),
74    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
75    DollarQuotedString(DollarQuotedString),
76    /// Byte string literal: i.e: b'string' or B'string' (note that some backends, such as
77    /// PostgreSQL, may treat this syntax as a bit string literal instead, i.e: b'10010101')
78    SingleQuotedByteStringLiteral(String),
79    /// Byte string literal: i.e: b"string" or B"string"
80    DoubleQuotedByteStringLiteral(String),
81    /// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
82    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
83    TripleSingleQuotedByteStringLiteral(String),
84    /// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
85    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
86    TripleDoubleQuotedByteStringLiteral(String),
87    /// Single quoted literal with raw string prefix. Example `R'abc'`
88    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
89    SingleQuotedRawStringLiteral(String),
90    /// Double quoted literal with raw string prefix. Example `R"abc"`
91    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
92    DoubleQuotedRawStringLiteral(String),
93    /// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
94    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
95    TripleSingleQuotedRawStringLiteral(String),
96    /// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
97    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
98    TripleDoubleQuotedRawStringLiteral(String),
99    /// "National" string literal: i.e: N'string'
100    NationalStringLiteral(String),
101    /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
102    EscapedStringLiteral(String),
103    /// Unicode string literal: i.e: U&'first \000A second'
104    UnicodeStringLiteral(String),
105    /// Hexadecimal string literal: i.e.: X'deadbeef'
106    HexStringLiteral(String),
107    /// Comma
108    Comma,
109    /// Whitespace (space, tab, etc)
110    Whitespace(Whitespace),
111    /// Double equals sign `==`
112    DoubleEq,
113    /// Equality operator `=`
114    Eq,
115    /// Not Equals operator `<>` (or `!=` in some dialects)
116    Neq,
117    /// Less Than operator `<`
118    Lt,
119    /// Greater Than operator `>`
120    Gt,
121    /// Less Than Or Equals operator `<=`
122    LtEq,
123    /// Greater Than Or Equals operator `>=`
124    GtEq,
125    /// Spaceship operator <=>
126    Spaceship,
127    /// Plus operator `+`
128    Plus,
129    /// Minus operator `-`
130    Minus,
131    /// Multiplication operator `*`
132    Mul,
133    /// Division operator `/`
134    Div,
135    /// Integer division operator `//` in DuckDB
136    DuckIntDiv,
137    /// Modulo Operator `%`
138    Mod,
139    /// String concatenation `||`
140    StringConcat,
141    /// Left parenthesis `(`
142    LParen,
143    /// Right parenthesis `)`
144    RParen,
145    /// Period (used for compound identifiers or projections into nested types)
146    Period,
147    /// Colon `:`
148    Colon,
149    /// DoubleColon `::` (used for casting in PostgreSQL)
150    DoubleColon,
151    /// Assignment `:=` (used for keyword arguments in DuckDB macros and some functions, and for variable declarations in DuckDB and Snowflake)
152    Assignment,
153    /// SemiColon `;` used as separator for COPY and payload
154    SemiColon,
155    /// Backslash `\` used in terminating the COPY payload with `\.`
156    Backslash,
157    /// Left bracket `[`
158    LBracket,
159    /// Right bracket `]`
160    RBracket,
161    /// Ampersand `&`
162    Ampersand,
163    /// Pipe `|`
164    Pipe,
165    /// Caret `^`
166    Caret,
167    /// Left brace `{`
168    LBrace,
169    /// Right brace `}`
170    RBrace,
171    /// Right Arrow `=>`
172    RArrow,
173    /// Sharp `#` used for PostgreSQL Bitwise XOR operator, also PostgreSQL/Redshift geometrical unary/binary operator (Number of points in path or polygon/Intersection)
174    Sharp,
175    /// `##` PostgreSQL/Redshift geometrical binary operator (Point of closest proximity)
176    DoubleSharp,
177    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
178    Tilde,
179    /// `~*` , a case insensitive match regular expression operator in PostgreSQL
180    TildeAsterisk,
181    /// `!~` , a case sensitive not match regular expression operator in PostgreSQL
182    ExclamationMarkTilde,
183    /// `!~*` , a case insensitive not match regular expression operator in PostgreSQL
184    ExclamationMarkTildeAsterisk,
185    /// `~~`, a case sensitive match pattern operator in PostgreSQL
186    DoubleTilde,
187    /// `~~*`, a case insensitive match pattern operator in PostgreSQL
188    DoubleTildeAsterisk,
189    /// `!~~`, a case sensitive not match pattern operator in PostgreSQL
190    ExclamationMarkDoubleTilde,
191    /// `!~~*`, a case insensitive not match pattern operator in PostgreSQL
192    ExclamationMarkDoubleTildeAsterisk,
193    /// `<<`, a bitwise shift left operator in PostgreSQL
194    ShiftLeft,
195    /// `>>`, a bitwise shift right operator in PostgreSQL
196    ShiftRight,
197    /// `&&`, an overlap operator in PostgreSQL
198    Overlap,
199    /// Exclamation Mark `!` used for PostgreSQL factorial operator
200    ExclamationMark,
201    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
202    DoubleExclamationMark,
203    /// AtSign `@` used for PostgreSQL abs operator, also PostgreSQL/Redshift geometrical unary/binary operator (Center, Contained or on)
204    AtSign,
205    /// `^@`, a "starts with" string operator in PostgreSQL
206    CaretAt,
207    /// `|/`, a square root math operator in PostgreSQL
208    PGSquareRoot,
209    /// `||/`, a cube root math operator in PostgreSQL
210    PGCubeRoot,
211    /// `?` or `$` , a prepared statement arg placeholder
212    Placeholder(String),
213    /// `->`, used as an operator to extract json field in PostgreSQL
214    Arrow,
215    /// `->>`, used as an operator to extract json field as text in PostgreSQL
216    LongArrow,
217    /// `#>`, extracts JSON sub-object at the specified path
218    HashArrow,
219    /// `@-@` PostgreSQL/Redshift geometrical unary operator (Length or circumference)
220    AtDashAt,
221    /// `?-` PostgreSQL/Redshift geometrical unary/binary operator (Is horizontal?/Are horizontally aligned?)
222    QuestionMarkDash,
223    /// `&<` PostgreSQL/Redshift geometrical binary operator (Overlaps to left?)
224    AmpersandLeftAngleBracket,
225    /// `&>` PostgreSQL/Redshift geometrical binary operator (Overlaps to right?)
226    AmpersandRightAngleBracket,
227    /// `&<|` PostgreSQL/Redshift geometrical binary operator (Does not extend above?)
228    AmpersandLeftAngleBracketVerticalBar,
229    /// `|&>` PostgreSQL/Redshift geometrical binary operator (Does not extend below?)
230    VerticalBarAmpersandRightAngleBracket,
231    /// `<->` PostgreSQL/Redshift geometrical binary operator (Distance between)
232    TwoWayArrow,
233    /// `<^` PostgreSQL/Redshift geometrical binary operator (Is below?)
234    LeftAngleBracketCaret,
235    /// `>^` PostgreSQL/Redshift geometrical binary operator (Is above?)
236    RightAngleBracketCaret,
237    /// `?#` PostgreSQL/Redshift geometrical binary operator (Intersects or overlaps)
238    QuestionMarkSharp,
239    /// `?-|` PostgreSQL/Redshift geometrical binary operator (Is perpendicular?)
240    QuestionMarkDashVerticalBar,
241    /// `?||` PostgreSQL/Redshift geometrical binary operator (Are parallel?)
242    QuestionMarkDoubleVerticalBar,
243    /// `~=` PostgreSQL/Redshift geometrical binary operator (Same as)
244    TildeEqual,
245    /// `<<|` PostgreSQL/Redshift geometrical binary operator (Is strictly below?)
246    ShiftLeftVerticalBar,
247    /// `|>>` PostgreSQL/Redshift geometrical binary operator (Is strictly above?)
248    VerticalBarShiftRight,
249    /// `|>` BigQuery pipe operator
250    VerticalBarRightAngleBracket,
251    /// `#>>`, extracts JSON sub-object at the specified path as text
252    HashLongArrow,
253    /// jsonb @> jsonb -> boolean: Test whether left json contains the right json
254    AtArrow,
255    /// jsonb <@ jsonb -> boolean: Test whether right json contains the left json
256    ArrowAt,
257    /// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified
258    /// path, where path elements can be either field keys or array indexes.
259    HashMinus,
260    /// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified
261    /// JSON value?
262    AtQuestion,
263    /// jsonb @@ jsonpath → boolean: Returns the result of a JSON path predicate check
264    /// for the specified JSON value. Only the first item of the result is taken into
265    /// account. If the result is not Boolean, then NULL is returned.
266    AtAt,
267    /// jsonb ? text -> boolean: Checks whether the string exists as a top-level key within the
268    /// jsonb object
269    Question,
270    /// jsonb ?& text[] -> boolean: Check whether all members of the text array exist as top-level
271    /// keys within the jsonb object
272    QuestionAnd,
273    /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level
274    /// keys within the jsonb object
275    QuestionPipe,
276    /// Custom binary operator
277    /// This is used to represent any custom binary operator that is not part of the SQL standard.
278    /// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
279    CustomBinaryOperator(String),
280}
281
282impl fmt::Display for Token {
283    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
284        match self {
285            Token::EOF => f.write_str("EOF"),
286            Token::Word(ref w) => write!(f, "{w}"),
287            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
288            Token::Char(ref c) => write!(f, "{c}"),
289            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
290            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
291            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
292            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
293            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
294            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
295            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
296            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
297            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
298            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
299            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
300            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
301            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
302            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
303            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
304            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
305            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
306            Token::Comma => f.write_str(","),
307            Token::Whitespace(ws) => write!(f, "{ws}"),
308            Token::DoubleEq => f.write_str("=="),
309            Token::Spaceship => f.write_str("<=>"),
310            Token::Eq => f.write_str("="),
311            Token::Neq => f.write_str("<>"),
312            Token::Lt => f.write_str("<"),
313            Token::Gt => f.write_str(">"),
314            Token::LtEq => f.write_str("<="),
315            Token::GtEq => f.write_str(">="),
316            Token::Plus => f.write_str("+"),
317            Token::Minus => f.write_str("-"),
318            Token::Mul => f.write_str("*"),
319            Token::Div => f.write_str("/"),
320            Token::DuckIntDiv => f.write_str("//"),
321            Token::StringConcat => f.write_str("||"),
322            Token::Mod => f.write_str("%"),
323            Token::LParen => f.write_str("("),
324            Token::RParen => f.write_str(")"),
325            Token::Period => f.write_str("."),
326            Token::Colon => f.write_str(":"),
327            Token::DoubleColon => f.write_str("::"),
328            Token::Assignment => f.write_str(":="),
329            Token::SemiColon => f.write_str(";"),
330            Token::Backslash => f.write_str("\\"),
331            Token::LBracket => f.write_str("["),
332            Token::RBracket => f.write_str("]"),
333            Token::Ampersand => f.write_str("&"),
334            Token::Caret => f.write_str("^"),
335            Token::Pipe => f.write_str("|"),
336            Token::LBrace => f.write_str("{"),
337            Token::RBrace => f.write_str("}"),
338            Token::RArrow => f.write_str("=>"),
339            Token::Sharp => f.write_str("#"),
340            Token::DoubleSharp => f.write_str("##"),
341            Token::ExclamationMark => f.write_str("!"),
342            Token::DoubleExclamationMark => f.write_str("!!"),
343            Token::Tilde => f.write_str("~"),
344            Token::TildeAsterisk => f.write_str("~*"),
345            Token::ExclamationMarkTilde => f.write_str("!~"),
346            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
347            Token::DoubleTilde => f.write_str("~~"),
348            Token::DoubleTildeAsterisk => f.write_str("~~*"),
349            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
350            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
351            Token::AtSign => f.write_str("@"),
352            Token::CaretAt => f.write_str("^@"),
353            Token::ShiftLeft => f.write_str("<<"),
354            Token::ShiftRight => f.write_str(">>"),
355            Token::Overlap => f.write_str("&&"),
356            Token::PGSquareRoot => f.write_str("|/"),
357            Token::PGCubeRoot => f.write_str("||/"),
358            Token::AtDashAt => f.write_str("@-@"),
359            Token::QuestionMarkDash => f.write_str("?-"),
360            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
361            Token::AmpersandRightAngleBracket => f.write_str("&>"),
362            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
363            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
364            Token::VerticalBarRightAngleBracket => f.write_str("|>"),
365            Token::TwoWayArrow => f.write_str("<->"),
366            Token::LeftAngleBracketCaret => f.write_str("<^"),
367            Token::RightAngleBracketCaret => f.write_str(">^"),
368            Token::QuestionMarkSharp => f.write_str("?#"),
369            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
370            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
371            Token::TildeEqual => f.write_str("~="),
372            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
373            Token::VerticalBarShiftRight => f.write_str("|>>"),
374            Token::Placeholder(ref s) => write!(f, "{s}"),
375            Token::Arrow => write!(f, "->"),
376            Token::LongArrow => write!(f, "->>"),
377            Token::HashArrow => write!(f, "#>"),
378            Token::HashLongArrow => write!(f, "#>>"),
379            Token::AtArrow => write!(f, "@>"),
380            Token::ArrowAt => write!(f, "<@"),
381            Token::HashMinus => write!(f, "#-"),
382            Token::AtQuestion => write!(f, "@?"),
383            Token::AtAt => write!(f, "@@"),
384            Token::Question => write!(f, "?"),
385            Token::QuestionAnd => write!(f, "?&"),
386            Token::QuestionPipe => write!(f, "?|"),
387            Token::CustomBinaryOperator(s) => f.write_str(s),
388        }
389    }
390}
391
392impl Token {
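    /// Create a [`Token::Word`] for the given keyword string.
    ///
    /// # Example
    ///
    /// A minimal illustrative example (assuming the public `sqlparser::keywords`
    /// module path): unquoted words are matched against the known keyword list,
    /// while quoted words (see [`Token::make_word`]) never are.
    /// ```
    /// # use sqlparser::tokenizer::Token;
    /// # use sqlparser::keywords::Keyword;
    /// if let Token::Word(w) = Token::make_keyword("SELECT") {
    ///     assert_eq!(w.keyword, Keyword::SELECT);
    /// }
    /// if let Token::Word(w) = Token::make_word("select", Some('"')) {
    ///     assert_eq!(w.keyword, Keyword::NoKeyword);
    /// }
    /// ```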
393    pub fn make_keyword(keyword: &str) -> Self {
394        Token::make_word(keyword, None)
395    }
396
397    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
398        let word_uppercase = word.to_uppercase();
399        Token::Word(Word {
400            value: word.to_string(),
401            quote_style,
402            keyword: if quote_style.is_none() {
403                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
404                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
405            } else {
406                Keyword::NoKeyword
407            },
408        })
409    }
410}
411
412/// A keyword (like SELECT) or an optionally quoted SQL identifier
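///
/// # Example
///
/// A short illustrative example: `Display` re-adds the original quote style.
/// ```
/// # use sqlparser::tokenizer::Word;
/// # use sqlparser::keywords::Keyword;
/// let word = Word {
///     value: "foo".to_string(),
///     quote_style: Some('"'),
///     keyword: Keyword::NoKeyword,
/// };
/// assert_eq!(word.to_string(), "\"foo\"");
/// ```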
413#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
414#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
415#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
416pub struct Word {
417    /// The value of the token, without the enclosing quotes, and with the
418    /// escape sequences (if any) processed (TODO: escapes are not handled)
419    pub value: String,
420    /// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
421    /// The standard and most implementations allow using double quotes for this,
422    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
423    pub quote_style: Option<char>,
424    /// If the word was not quoted and it matched one of the known keywords,
425    /// this will have the corresponding [`Keyword`] value, otherwise [`Keyword::NoKeyword`]
426    pub keyword: Keyword,
427}
428
429impl fmt::Display for Word {
430    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
431        match self.quote_style {
432            Some(s) if s == '"' || s == '[' || s == '`' => {
433                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
434            }
435            None => f.write_str(&self.value),
436            _ => panic!("Unexpected quote_style!"),
437        }
438    }
439}
440
441impl Word {
442    fn matching_end_quote(ch: char) -> char {
443        match ch {
444            '"' => '"', // ANSI and most dialects
445            '[' => ']', // MS SQL
446            '`' => '`', // MySQL
447            _ => panic!("unexpected quoting style!"),
448        }
449    }
450}
451
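/// A whitespace or comment token.
///
/// # Example
///
/// An illustrative example: a single-line comment keeps its original prefix.
/// ```
/// # use sqlparser::tokenizer::Whitespace;
/// let ws = Whitespace::SingleLineComment {
///     prefix: "--".to_string(),
///     comment: " a comment\n".to_string(),
/// };
/// assert_eq!(ws.to_string(), "-- a comment\n");
/// ```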
452#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
453#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
454#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
455pub enum Whitespace {
456    Space,
457    Newline,
458    Tab,
459    SingleLineComment { comment: String, prefix: String },
460    MultiLineComment(String),
461}
462
463impl fmt::Display for Whitespace {
464    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
465        match self {
466            Whitespace::Space => f.write_str(" "),
467            Whitespace::Newline => f.write_str("\n"),
468            Whitespace::Tab => f.write_str("\t"),
469            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
470            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
471        }
472    }
473}
474
475/// Location in input string
476///
477/// # Create an "empty" (unknown) `Location`
478/// ```
479/// # use sqlparser::tokenizer::Location;
480/// let location = Location::empty();
481/// ```
482///
483/// # Create a `Location` from a line and column
484/// ```
485/// # use sqlparser::tokenizer::Location;
486/// let location = Location::new(1, 1);
487/// ```
488///
489/// # Create a `Location` from a pair
490/// ```
491/// # use sqlparser::tokenizer::Location;
492/// let location = Location::from((1, 1));
493/// ```
494#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
495#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
496#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
497pub struct Location {
498    /// Line number, starting from 1.
499    ///
500    /// Note: Line 0 is used for empty spans
501    pub line: u64,
502    /// Line column, starting from 1.
503    ///
504    /// Note: Column 0 is used for empty spans
505    pub column: u64,
506}
507
508impl fmt::Display for Location {
509    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
510        if self.line == 0 {
511            return Ok(());
512        }
513        write!(f, " at Line: {}, Column: {}", self.line, self.column)
514    }
515}
516
517impl fmt::Debug for Location {
518    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
519        write!(f, "Location({},{})", self.line, self.column)
520    }
521}
522
523impl Location {
524    /// Return an "empty" / unknown location
525    pub fn empty() -> Self {
526        Self { line: 0, column: 0 }
527    }
528
529    /// Create a new `Location` for a given line and column
530    pub fn new(line: u64, column: u64) -> Self {
531        Self { line, column }
532    }
533
534    /// Create a new location for a given line and column
535    ///
536    /// Alias for [`Self::new`]
537    // TODO: remove / deprecate in favor of `new` for consistency?
538    pub fn of(line: u64, column: u64) -> Self {
539        Self::new(line, column)
540    }
541
542    /// Combine self and `end` into a new `Span`
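    ///
    /// # Example
    ///
    /// A minimal illustrative example:
    /// ```
    /// # use sqlparser::tokenizer::{Location, Span};
    /// let span = Location::new(1, 1).span_to(Location::new(1, 5));
    /// assert_eq!(span, Span::new(Location::new(1, 1), Location::new(1, 5)));
    /// ```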
543    pub fn span_to(self, end: Self) -> Span {
544        Span { start: self, end }
545    }
546}
547
548impl From<(u64, u64)> for Location {
549    fn from((line, column): (u64, u64)) -> Self {
550        Self { line, column }
551    }
552}
553
554/// A span represents a linear portion of the input string (start, end)
555///
556/// See [Spanned](crate::ast::Spanned) for more information.
557#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
558#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
559#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
560pub struct Span {
561    pub start: Location,
562    pub end: Location,
563}
564
565impl fmt::Debug for Span {
566    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
567        write!(f, "Span({:?}..{:?})", self.start, self.end)
568    }
569}
570
571impl Span {
572    // An empty span (0, 0) -> (0, 0)
573    // We need a const instance for pattern matching
574    const EMPTY: Span = Self::empty();
575
576    /// Create a new span from a start and end [`Location`]
577    pub fn new(start: Location, end: Location) -> Span {
578        Span { start, end }
579    }
580
581    /// Returns an empty span `(0, 0) -> (0, 0)`
582    ///
583    /// Empty spans represent no knowledge of source location
584    /// See [Spanned](crate::ast::Spanned) for more information.
585    pub const fn empty() -> Span {
586        Span {
587            start: Location { line: 0, column: 0 },
588            end: Location { line: 0, column: 0 },
589        }
590    }
591
592    /// Returns the smallest Span that contains both `self` and `other`
593    /// If either span is [Span::empty], the other span is returned
594    ///
595    /// # Examples
596    /// ```
597    /// # use sqlparser::tokenizer::{Span, Location};
598    /// // line 1, column1 -> line 2, column 5
599    /// let span1 = Span::new(Location::new(1, 1), Location::new(2, 5));
600    /// // line 2, column 3 -> line 3, column 7
601    /// let span2 = Span::new(Location::new(2, 3), Location::new(3, 7));
602    /// // Union of the two is the min/max of the two spans
603    /// // line 1, column 1 -> line 3, column 7
604    /// let union = span1.union(&span2);
605    /// assert_eq!(union, Span::new(Location::new(1, 1), Location::new(3, 7)));
606    /// ```
607    pub fn union(&self, other: &Span) -> Span {
608        // If either span is empty, return the other
609        // this prevents propagating (0, 0) through the tree
610        match (self, other) {
611            (&Span::EMPTY, _) => *other,
612            (_, &Span::EMPTY) => *self,
613            _ => Span {
614                start: cmp::min(self.start, other.start),
615                end: cmp::max(self.end, other.end),
616            },
617        }
618    }
619
620    /// Same as [Span::union] for `Option<Span>`
621    ///
622    /// If `other` is `None`, `self` is returned
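    ///
    /// # Example
    ///
    /// A minimal illustrative example:
    /// ```
    /// # use sqlparser::tokenizer::{Location, Span};
    /// let span = Span::new(Location::new(1, 1), Location::new(1, 5));
    /// assert_eq!(span.union_opt(&None), span);
    /// assert_eq!(span.union_opt(&Some(Span::empty())), span);
    /// ```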
623    pub fn union_opt(&self, other: &Option<Span>) -> Span {
624        match other {
625            Some(other) => self.union(other),
626            None => *self,
627        }
628    }
629
630    /// Return the [Span::union] of all spans in the iterator
631    ///
632    /// If the iterator is empty, an empty span is returned
633    ///
634    /// # Example
635    /// ```
636    /// # use sqlparser::tokenizer::{Span, Location};
637    /// let spans = vec![
638    ///     Span::new(Location::new(1, 1), Location::new(2, 5)),
639    ///     Span::new(Location::new(2, 3), Location::new(3, 7)),
640    ///     Span::new(Location::new(3, 1), Location::new(4, 2)),
641    /// ];
642    /// // line 1, column 1 -> line 4, column 2
643    /// assert_eq!(
644    ///   Span::union_iter(spans),
645    ///   Span::new(Location::new(1, 1), Location::new(4, 2))
646    /// );
    /// ```
647    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
648        iter.into_iter()
649            .reduce(|acc, item| acc.union(&item))
650            .unwrap_or(Span::empty())
651    }
652}
653
654/// Backwards compatibility alias for [`TokenWithSpan`]
655#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
656pub type TokenWithLocation = TokenWithSpan;
657
658/// A [Token] with [Span] attached to it
659///
660/// This is used to track the location of a token in the input string
661///
662/// # Examples
663/// ```
664/// # use sqlparser::tokenizer::{Location, Span, Token, TokenWithSpan};
665/// // comma @ line 1, column 10
666/// let tok1 = TokenWithSpan::new(
667///   Token::Comma,
668///   Span::new(Location::new(1, 10), Location::new(1, 11)),
669/// );
670/// assert_eq!(tok1, Token::Comma); // can compare the token
671///
672/// // comma @ line 2, column 20
673/// let tok2 = TokenWithSpan::new(
674///   Token::Comma,
675///   Span::new(Location::new(2, 20), Location::new(2, 21)),
676/// );
677/// // same token but different locations are not equal
678/// assert_ne!(tok1, tok2);
679/// ```
680#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
681#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
682#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
683pub struct TokenWithSpan {
684    pub token: Token,
685    pub span: Span,
686}
687
688impl TokenWithSpan {
689    /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`]
690    pub fn new(token: Token, span: Span) -> Self {
691        Self { token, span }
692    }
693
694    /// Wrap a token with an empty span
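    ///
    /// # Example
    ///
    /// A minimal illustrative example:
    /// ```
    /// # use sqlparser::tokenizer::{Span, Token, TokenWithSpan};
    /// let tok = TokenWithSpan::wrap(Token::Comma);
    /// assert_eq!(tok.span, Span::empty());
    /// assert_eq!(tok, Token::Comma);
    /// ```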
695    pub fn wrap(token: Token) -> Self {
696        Self::new(token, Span::empty())
697    }
698
699    /// Wrap a token with a location from `start` to `end`
700    pub fn at(token: Token, start: Location, end: Location) -> Self {
701        Self::new(token, Span::new(start, end))
702    }
703
704    /// Return an EOF token with no location
705    pub fn new_eof() -> Self {
706        Self::wrap(Token::EOF)
707    }
708}
709
710impl PartialEq<Token> for TokenWithSpan {
711    fn eq(&self, other: &Token) -> bool {
712        &self.token == other
713    }
714}
715
716impl PartialEq<TokenWithSpan> for Token {
717    fn eq(&self, other: &TokenWithSpan) -> bool {
718        self == &other.token
719    }
720}
721
722impl fmt::Display for TokenWithSpan {
723    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
724        self.token.fmt(f)
725    }
726}
727
728/// Tokenizer error
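///
/// # Example
///
/// An illustrative example: the error renders its message followed by the
/// source location.
/// ```
/// # use sqlparser::tokenizer::{Location, TokenizerError};
/// let err = TokenizerError {
///     message: "Unterminated string literal".to_string(),
///     location: Location::new(1, 8),
/// };
/// assert_eq!(err.to_string(), "Unterminated string literal at Line: 1, Column: 8");
/// ```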
729#[derive(Debug, PartialEq, Eq)]
730pub struct TokenizerError {
731    pub message: String,
732    pub location: Location,
733}
734
735impl fmt::Display for TokenizerError {
736    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
737        write!(f, "{}{}", self.message, self.location,)
738    }
739}
740
741#[cfg(feature = "std")]
742impl std::error::Error for TokenizerError {}
743
744struct State<'a> {
745    peekable: Peekable<Chars<'a>>,
746    pub line: u64,
747    pub col: u64,
748}
749
750impl State<'_> {
751    /// return the next character and advance the stream
752    pub fn next(&mut self) -> Option<char> {
753        match self.peekable.next() {
754            None => None,
755            Some(s) => {
756                if s == '\n' {
757                    self.line += 1;
758                    self.col = 1;
759                } else {
760                    self.col += 1;
761                }
762                Some(s)
763            }
764        }
765    }
766
767    /// return the next character but do not advance the stream
768    pub fn peek(&mut self) -> Option<&char> {
769        self.peekable.peek()
770    }
771
772    pub fn location(&self) -> Location {
773        Location {
774            line: self.line,
775            column: self.col,
776        }
777    }
778}
779
780/// Represents how many quote characters enclose a string literal.
781#[derive(Copy, Clone)]
782enum NumStringQuoteChars {
783    /// e.g. `"abc"`, `'abc'`, `r'abc'`
784    One,
785    /// e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''`
786    Many(NonZeroU8),
787}
788
789/// Settings for tokenizing a quoted string literal.
790struct TokenizeQuotedStringSettings {
791    /// The character used to quote the string.
792    quote_style: char,
793    /// Represents how many quote characters enclose the string literal.
794    num_quote_chars: NumStringQuoteChars,
795    /// The number of opening quotes left to consume before parsing
796    /// the remaining string literal.
797    /// For example, given the initial string `"""abc"""`: if the caller has
798    /// already parsed the first quote for some reason, this is set so that
799    /// only the two remaining leading quotes are consumed.
800    num_opening_quotes_to_consume: u8,
801    /// True if the string uses backslash escaping of special characters
802    /// e.g. `'abc\ndef\'ghi'`
803    backslash_escape: bool,
804}
805
806/// SQL Tokenizer
807pub struct Tokenizer<'a> {
808    dialect: &'a dyn Dialect,
809    query: &'a str,
810    /// If true (the default), the tokenizer will un-escape literal
811    /// SQL strings. See [`Tokenizer::with_unescape`] for more details.
812    unescape: bool,
813}
814
815impl<'a> Tokenizer<'a> {
816    /// Create a new SQL tokenizer for the specified SQL statement
817    ///
818    /// ```
819    /// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
820    /// # use sqlparser::dialect::GenericDialect;
821    /// # let dialect = GenericDialect{};
822    /// let query = r#"SELECT 'foo'"#;
823    ///
824    /// // Parsing the query
825    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
826    ///
827    /// assert_eq!(tokens, vec![
828    ///   Token::make_word("SELECT", None),
829    ///   Token::Whitespace(Whitespace::Space),
830    ///   Token::SingleQuotedString("foo".to_string()),
831    /// ]);
    /// ```
832    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
833        Self {
834            dialect,
835            query,
836            unescape: true,
837        }
838    }
839
840    /// Set unescape mode
841    ///
842    /// When true (default) the tokenizer unescapes literal values
843    /// (for example, `""` in SQL is unescaped to the literal `"`).
844    ///
845    /// When false, the tokenizer returns the strings exactly as written
846    /// in the query. This can be helpful for programs that wish to
847    /// recover the *exact* original query text without normalizing
848    /// the escaping.
849    ///
850    /// # Example
851    ///
852    /// ```
853    /// # use sqlparser::tokenizer::{Token, Tokenizer};
854    /// # use sqlparser::dialect::GenericDialect;
855    /// # let dialect = GenericDialect{};
856    /// let query = r#""Foo "" Bar""#;
857    /// let unescaped = Token::make_word(r#"Foo " Bar"#, Some('"'));
858    /// let original  = Token::make_word(r#"Foo "" Bar"#, Some('"'));
859    ///
860    /// // Parsing with unescaping (default)
861    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
862    /// assert_eq!(tokens, vec![unescaped]);
863    ///
864    /// // Parsing with unescape = false
865    /// let tokens = Tokenizer::new(&dialect, &query)
866    ///    .with_unescape(false)
867    ///    .tokenize().unwrap();
868    /// assert_eq!(tokens, vec![original]);
869    /// ```
870    pub fn with_unescape(mut self, unescape: bool) -> Self {
871        self.unescape = unescape;
872        self
873    }
874
875    /// Tokenize the statement and produce a vector of tokens
876    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
877        let twl = self.tokenize_with_location()?;
878        Ok(twl.into_iter().map(|t| t.token).collect())
879    }
880
881    /// Tokenize the statement and produce a vector of tokens with location information
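    ///
    /// # Example
    ///
    /// An illustrative example: each returned token carries the [`Span`] it
    /// was read from.
    /// ```
    /// # use sqlparser::tokenizer::{Location, Token, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 1")
    ///     .tokenize_with_location()
    ///     .unwrap();
    /// assert_eq!(tokens[0].token, Token::make_word("SELECT", None));
    /// assert_eq!(tokens[0].span.start, Location::new(1, 1));
    /// ```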
882    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
883        let mut tokens: Vec<TokenWithSpan> = vec![];
884        self.tokenize_with_location_into_buf(&mut tokens)
885            .map(|_| tokens)
886    }
887
888    /// Tokenize the statement and append tokens with location information into the provided buffer.
889    /// If an error is thrown, the buffer will contain all tokens that were successfully parsed before the error.
890    pub fn tokenize_with_location_into_buf(
891        &mut self,
892        buf: &mut Vec<TokenWithSpan>,
893    ) -> Result<(), TokenizerError> {
894        let mut state = State {
895            peekable: self.query.chars().peekable(),
896            line: 1,
897            col: 1,
898        };
899
900        let mut location = state.location();
901        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
902            let span = location.span_to(state.location());
903
904            buf.push(TokenWithSpan { token, span });
905
906            location = state.location();
907        }
908        Ok(())
909    }
910
911    // Tokenize the identifier or keyword that starts with the chars in `ch`
912    fn tokenize_identifier_or_keyword(
913        &self,
914        ch: impl IntoIterator<Item = char>,
915        chars: &mut State,
916    ) -> Result<Option<Token>, TokenizerError> {
917        chars.next(); // consume the first char
918        let ch: String = ch.into_iter().collect();
919        let word = self.tokenize_word(ch, chars);
920
921        // TODO: implement parsing of exponent here
922        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
923            let mut inner_state = State {
924                peekable: word.chars().peekable(),
925                line: 0,
926                col: 0,
927            };
928            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
929            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
930            s += s2.as_str();
931            return Ok(Some(Token::Number(s, false)));
932        }
933
934        Ok(Some(Token::make_word(&word, None)))
935    }
936
937    /// Get the next token or return None
938    fn next_token(
939        &self,
940        chars: &mut State,
941        prev_token: Option<&Token>,
942    ) -> Result<Option<Token>, TokenizerError> {
943        match chars.peek() {
944            Some(&ch) => match ch {
945                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
946                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
947                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
948                '\r' => {
949                    // Emit a single Whitespace::Newline token for \r and \r\n
950                    chars.next();
951                    if let Some('\n') = chars.peek() {
952                        chars.next();
953                    }
954                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
955                }
956                // BigQuery and MySQL use b or B for byte string literal, Postgres for bit strings
957                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
958                {
959                    chars.next(); // consume
960                    match chars.peek() {
961                        Some('\'') => {
962                            if self.dialect.supports_triple_quoted_string() {
963                                return self
964                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
965                                        chars,
966                                        '\'',
967                                        false,
968                                        Token::SingleQuotedByteStringLiteral,
969                                        Token::TripleSingleQuotedByteStringLiteral,
970                                    );
971                            }
972                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
973                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
974                        }
975                        Some('\"') => {
976                            if self.dialect.supports_triple_quoted_string() {
977                                return self
978                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
979                                        chars,
980                                        '"',
981                                        false,
982                                        Token::DoubleQuotedByteStringLiteral,
983                                        Token::TripleDoubleQuotedByteStringLiteral,
984                                    );
985                            }
986                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
987                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
988                        }
989                        _ => {
990                            // regular identifier starting with a "b" or "B"
991                            let s = self.tokenize_word(b, chars);
992                            Ok(Some(Token::make_word(&s, None)))
993                        }
994                    }
995                }
996                // BigQuery uses r or R for raw string literal
997                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
998                    chars.next(); // consume
999                    match chars.peek() {
1000                        Some('\'') => self
1001                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1002                                chars,
1003                                '\'',
1004                                false,
1005                                Token::SingleQuotedRawStringLiteral,
1006                                Token::TripleSingleQuotedRawStringLiteral,
1007                            ),
1008                        Some('\"') => self
1009                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1010                                chars,
1011                                '"',
1012                                false,
1013                                Token::DoubleQuotedRawStringLiteral,
1014                                Token::TripleDoubleQuotedRawStringLiteral,
1015                            ),
1016                        _ => {
1017                            // regular identifier starting with an "r" or "R"
1018                            let s = self.tokenize_word(b, chars);
1019                            Ok(Some(Token::make_word(&s, None)))
1020                        }
1021                    }
1022                }
1023                // Redshift uses lower case n for national string literal
1024                n @ 'N' | n @ 'n' => {
1025                    chars.next(); // consume, to check the next char
1026                    match chars.peek() {
1027                        Some('\'') => {
1028                            // N'...' - a <national character string literal>
1029                            let backslash_escape =
1030                                self.dialect.supports_string_literal_backslash_escape();
1031                            let s =
1032                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
1033                            Ok(Some(Token::NationalStringLiteral(s)))
1034                        }
1035                        _ => {
1036                            // regular identifier starting with an "N"
1037                            let s = self.tokenize_word(n, chars);
1038                            Ok(Some(Token::make_word(&s, None)))
1039                        }
1040                    }
1041                }
1042                // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
1043                x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
1044                    let starting_loc = chars.location();
1045                    chars.next(); // consume, to check the next char
1046                    match chars.peek() {
1047                        Some('\'') => {
1048                            let s =
1049                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
1050                            Ok(Some(Token::EscapedStringLiteral(s)))
1051                        }
1052                        _ => {
1053                            // regular identifier starting with an "E" or "e"
1054                            let s = self.tokenize_word(x, chars);
1055                            Ok(Some(Token::make_word(&s, None)))
1056                        }
1057                    }
1058                }
1059                // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
1060                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
1061                    chars.next(); // consume, to check the next char
1062                    if chars.peek() == Some(&'&') {
1063                        // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
1064                        let mut chars_clone = chars.peekable.clone();
1065                        chars_clone.next(); // consume the '&' in the clone
1066                        if chars_clone.peek() == Some(&'\'') {
1067                            chars.next(); // consume the '&' in the original iterator
1068                            let s = unescape_unicode_single_quoted_string(chars)?;
1069                            return Ok(Some(Token::UnicodeStringLiteral(s)));
1070                        }
1071                    }
1072                    // regular identifier starting with a "U" or "u"
1073                    let s = self.tokenize_word(x, chars);
1074                    Ok(Some(Token::make_word(&s, None)))
1075                }
1076                // The spec only allows an uppercase 'X' to introduce a hex
1077                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
1078                x @ 'x' | x @ 'X' => {
1079                    chars.next(); // consume, to check the next char
1080                    match chars.peek() {
1081                        Some('\'') => {
1082                            // X'...' - a <binary string literal>
1083                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
1084                            Ok(Some(Token::HexStringLiteral(s)))
1085                        }
1086                        _ => {
1087                            // regular identifier starting with an "X"
1088                            let s = self.tokenize_word(x, chars);
1089                            Ok(Some(Token::make_word(&s, None)))
1090                        }
1091                    }
1092                }
1093                // single quoted string
1094                '\'' => {
1095                    if self.dialect.supports_triple_quoted_string() {
1096                        return self
1097                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1098                                chars,
1099                                '\'',
1100                                self.dialect.supports_string_literal_backslash_escape(),
1101                                Token::SingleQuotedString,
1102                                Token::TripleSingleQuotedString,
1103                            );
1104                    }
1105                    let s = self.tokenize_single_quoted_string(
1106                        chars,
1107                        '\'',
1108                        self.dialect.supports_string_literal_backslash_escape(),
1109                    )?;
1110
1111                    Ok(Some(Token::SingleQuotedString(s)))
1112                }
1113                // double quoted string
1114                '\"' if !self.dialect.is_delimited_identifier_start(ch)
1115                    && !self.dialect.is_identifier_start(ch) =>
1116                {
1117                    if self.dialect.supports_triple_quoted_string() {
1118                        return self
1119                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1120                                chars,
1121                                '"',
1122                                self.dialect.supports_string_literal_backslash_escape(),
1123                                Token::DoubleQuotedString,
1124                                Token::TripleDoubleQuotedString,
1125                            );
1126                    }
1127                    let s = self.tokenize_single_quoted_string(
1128                        chars,
1129                        '"',
1130                        self.dialect.supports_string_literal_backslash_escape(),
1131                    )?;
1132
1133                    Ok(Some(Token::DoubleQuotedString(s)))
1134                }
1135                // delimited (quoted) identifier
1136                quote_start if self.dialect.is_delimited_identifier_start(ch) => {
1137                    let word = self.tokenize_quoted_identifier(quote_start, chars)?;
1138                    Ok(Some(Token::make_word(&word, Some(quote_start))))
1139                }
1140                // Potentially nested delimited (quoted) identifier
1141                quote_start
1142                    if self
1143                        .dialect
1144                        .is_nested_delimited_identifier_start(quote_start)
1145                        && self
1146                            .dialect
1147                            .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
1148                            .is_some() =>
1149                {
1150                    let Some((quote_start, nested_quote_start)) = self
1151                        .dialect
1152                        .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
1153                    else {
1154                        return self.tokenizer_error(
1155                            chars.location(),
1156                            format!("Expected nested delimiter '{quote_start}' before EOF."),
1157                        );
1158                    };
1159
1160                    let Some(nested_quote_start) = nested_quote_start else {
1161                        let word = self.tokenize_quoted_identifier(quote_start, chars)?;
1162                        return Ok(Some(Token::make_word(&word, Some(quote_start))));
1163                    };
1164
1165                    let mut word = vec![];
1166                    let quote_end = Word::matching_end_quote(quote_start);
1167                    let nested_quote_end = Word::matching_end_quote(nested_quote_start);
1168                    let error_loc = chars.location();
1169
1170                    chars.next(); // skip the first delimiter
1171                    peeking_take_while(chars, |ch| ch.is_whitespace());
1172                    if chars.peek() != Some(&nested_quote_start) {
1173                        return self.tokenizer_error(
1174                            error_loc,
1175                            format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
1176                        );
1177                    }
1178                    word.push(nested_quote_start.into());
1179                    word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
1180                    word.push(nested_quote_end.into());
1181                    peeking_take_while(chars, |ch| ch.is_whitespace());
1182                    if chars.peek() != Some(&quote_end) {
1183                        return self.tokenizer_error(
1184                            error_loc,
1185                            format!("Expected close delimiter '{quote_end}' before EOF."),
1186                        );
1187                    }
1188                    chars.next(); // skip close delimiter
1189
1190                    Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
1191                }
1192                // numbers and period
1193                '0'..='9' | '.' => {
1194                    // Special case: if `._` is encountered after a word, that word
1195                    // is a table name and the `_` starts the column name.
1196                    // If the previous token is not a word, this is not a valid SQL
1197                    // word or number.
1198                    if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
1199                        if let Some(Token::Word(_)) = prev_token {
1200                            chars.next();
1201                            return Ok(Some(Token::Period));
1202                        }
1203
1204                        return self.tokenizer_error(
1205                            chars.location(),
1206                            "Unexpected character '_'".to_string(),
1207                        );
1208                    }
1209
1210                    // Some dialects support underscores as numeric separators.
1211                    // There can only be one at a time, and it must be followed by another digit.
1212                    let is_number_separator = |ch: char, next_char: Option<char>| {
1213                        self.dialect.supports_numeric_literal_underscores()
1214                            && ch == '_'
1215                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
1216                    };
1217
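                    // e.g. with such a dialect, `1_000` is read as a single number
                    // token and the underscore is kept in the token's text.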
1218                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
1219                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1220                    });
1221
1222                    // match binary literal that starts with 0x
1223                    if s == "0" && chars.peek() == Some(&'x') {
1224                        chars.next();
1225                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
1226                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
1227                        });
1228                        return Ok(Some(Token::HexStringLiteral(s2)));
1229                    }
1230
1231                    // match one period
1232                    if let Some('.') = chars.peek() {
1233                        s.push('.');
1234                        chars.next();
1235                    }
1236
1237                    // If the dialect supports identifiers that start with a numeric prefix
1238                    // and we have now consumed a dot, check if the previous token was a Word.
1239                    // If so, what follows is definitely not part of a decimal number and
1240                    // we should yield the dot as a dedicated token so compound identifiers
1241                    // starting with digits can be parsed correctly.
1242                    if s == "." && self.dialect.supports_numeric_prefix() {
1243                        if let Some(Token::Word(_)) = prev_token {
1244                            return Ok(Some(Token::Period));
1245                        }
1246                    }
1247
1248                    // Consume fractional digits.
1249                    s += &peeking_next_take_while(chars, |ch, next_ch| {
1250                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1251                    });
1252
1253                    // No fraction -> Token::Period
1254                    if s == "." {
1255                        return Ok(Some(Token::Period));
1256                    }
1257
1258                    // Parse exponent as number
1259                    let mut exponent_part = String::new();
1260                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
1261                        let mut char_clone = chars.peekable.clone();
1262                        exponent_part.push(char_clone.next().unwrap());
1263
1264                        // Optional sign
1265                        match char_clone.peek() {
1266                            Some(&c) if matches!(c, '+' | '-') => {
1267                                exponent_part.push(c);
1268                                char_clone.next();
1269                            }
1270                            _ => (),
1271                        }
1272
1273                        match char_clone.peek() {
1274                            // Definitely an exponent, get original iterator up to speed and use it
1275                            Some(&c) if c.is_ascii_digit() => {
1276                                for _ in 0..exponent_part.len() {
1277                                    chars.next();
1278                                }
1279                                exponent_part +=
1280                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
1281                                s += exponent_part.as_str();
1282                            }
1283                            // Not an exponent, discard the work done
1284                            _ => (),
1285                        }
1286                    }
1287
1288                    // If the dialect supports identifiers that start with a numeric prefix,
1289                    // we need to check if the value is in fact an identifier and must thus
1290                    // be tokenized as a word.
1291                    if self.dialect.supports_numeric_prefix() {
1292                        if exponent_part.is_empty() {
1293                            // If it is not a number with an exponent, it may be
1294                            // an identifier starting with digits.
1295                            let word =
1296                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
1297
1298                            if !word.is_empty() {
1299                                s += word.as_str();
1300                                return Ok(Some(Token::make_word(s.as_str(), None)));
1301                            }
1302                        } else if prev_token == Some(&Token::Period) {
1303                            // If the previous token was a period, this value cannot belong to a
1304                            // number; it is the next segment of a compound identifier.
1305                            return Ok(Some(Token::make_word(s.as_str(), None)));
1306                        }
1307                    }
1308
1309                    let long = if chars.peek() == Some(&'L') {
1310                        chars.next();
1311                        true
1312                    } else {
1313                        false
1314                    };
1315                    Ok(Some(Token::Number(s, long)))
1316                }
1317                // punctuation
1318                '(' => self.consume_and_return(chars, Token::LParen),
1319                ')' => self.consume_and_return(chars, Token::RParen),
1320                ',' => self.consume_and_return(chars, Token::Comma),
1321                // operators
1322                '-' => {
1323                    chars.next(); // consume the '-'
1324
1325                    match chars.peek() {
1326                        Some('-') => {
1327                            let mut is_comment = true;
1328                            if self.dialect.requires_single_line_comment_whitespace() {
1329                                is_comment = Some(' ') == chars.peekable.clone().nth(1);
1330                            }
1331
1332                            if is_comment {
1333                                chars.next(); // consume second '-'
1334                                let comment = self.tokenize_single_line_comment(chars);
1335                                return Ok(Some(Token::Whitespace(
1336                                    Whitespace::SingleLineComment {
1337                                        prefix: "--".to_owned(),
1338                                        comment,
1339                                    },
1340                                )));
1341                            }
1342
1343                            self.start_binop(chars, "-", Token::Minus)
1344                        }
1345                        Some('>') => {
1346                            chars.next();
1347                            match chars.peek() {
1348                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
1349                                _ => self.start_binop(chars, "->", Token::Arrow),
1350                            }
1351                        }
1352                        // a regular '-' operator
1353                        _ => self.start_binop(chars, "-", Token::Minus),
1354                    }
1355                }
1356                '/' => {
1357                    chars.next(); // consume the '/'
1358                    match chars.peek() {
1359                        Some('*') => {
1360                            chars.next(); // consume the '*', starting a multi-line comment
1361                            self.tokenize_multiline_comment(chars)
1362                        }
1363                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
1364                            chars.next(); // consume the second '/', starting a snowflake single-line comment
1365                            let comment = self.tokenize_single_line_comment(chars);
1366                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1367                                prefix: "//".to_owned(),
1368                                comment,
1369                            })))
1370                        }
1371                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
1372                            self.consume_and_return(chars, Token::DuckIntDiv)
1373                        }
1374                        // a regular '/' operator
1375                        _ => Ok(Some(Token::Div)),
1376                    }
1377                }
1378                '+' => self.consume_and_return(chars, Token::Plus),
1379                '*' => self.consume_and_return(chars, Token::Mul),
1380                '%' => {
1381                    chars.next(); // advance past '%'
1382                    match chars.peek() {
1383                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
1384                        Some(sch) if self.dialect.is_identifier_start('%') => {
1385                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1386                        }
1387                        _ => self.start_binop(chars, "%", Token::Mod),
1388                    }
1389                }
1390                '|' => {
1391                    chars.next(); // consume the '|'
1392                    match chars.peek() {
1393                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
1394                        Some('|') => {
1395                            chars.next(); // consume the second '|'
1396                            match chars.peek() {
1397                                Some('/') => {
1398                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
1399                                }
1400                                _ => self.start_binop(chars, "||", Token::StringConcat),
1401                            }
1402                        }
1403                        Some('&') if self.dialect.supports_geometric_types() => {
1404                            chars.next(); // consume
1405                            match chars.peek() {
1406                                Some('>') => self.consume_for_binop(
1407                                    chars,
1408                                    "|&>",
1409                                    Token::VerticalBarAmpersandRightAngleBracket,
1410                                ),
1411                                _ => self.start_binop_opt(chars, "|&", None),
1412                            }
1413                        }
1414                        Some('>') if self.dialect.supports_geometric_types() => {
1415                            chars.next(); // consume
1416                            match chars.peek() {
1417                                Some('>') => self.consume_for_binop(
1418                                    chars,
1419                                    "|>>",
1420                                    Token::VerticalBarShiftRight,
1421                                ),
1422                                _ => self.start_binop_opt(chars, "|>", None),
1423                            }
1424                        }
1425                        Some('>') if self.dialect.supports_pipe_operator() => {
1426                            self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
1427                        }
1428                        // Bitwise OR '|' operator
1429                        _ => self.start_binop(chars, "|", Token::Pipe),
1430                    }
1431                }
1432                '=' => {
1433                    chars.next(); // consume
1434                    match chars.peek() {
1435                        Some('>') => self.consume_and_return(chars, Token::RArrow),
1436                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
1437                        _ => Ok(Some(Token::Eq)),
1438                    }
1439                }
1440                '!' => {
1441                    chars.next(); // consume
1442                    match chars.peek() {
1443                        Some('=') => self.consume_and_return(chars, Token::Neq),
1444                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
1445                        Some('~') => {
1446                            chars.next();
1447                            match chars.peek() {
1448                                Some('*') => self
1449                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
1450                                Some('~') => {
1451                                    chars.next();
1452                                    match chars.peek() {
1453                                        Some('*') => self.consume_and_return(
1454                                            chars,
1455                                            Token::ExclamationMarkDoubleTildeAsterisk,
1456                                        ),
1457                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
1458                                    }
1459                                }
1460                                _ => Ok(Some(Token::ExclamationMarkTilde)),
1461                            }
1462                        }
1463                        _ => Ok(Some(Token::ExclamationMark)),
1464                    }
1465                }
1466                '<' => {
1467                    chars.next(); // consume
1468                    match chars.peek() {
1469                        Some('=') => {
1470                            chars.next();
1471                            match chars.peek() {
1472                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
1473                                _ => self.start_binop(chars, "<=", Token::LtEq),
1474                            }
1475                        }
1476                        Some('|') if self.dialect.supports_geometric_types() => {
1477                            self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
1478                        }
1479                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
1480                        Some('<') if self.dialect.supports_geometric_types() => {
1481                            chars.next(); // consume
1482                            match chars.peek() {
1483                                Some('|') => self.consume_for_binop(
1484                                    chars,
1485                                    "<<|",
1486                                    Token::ShiftLeftVerticalBar,
1487                                ),
1488                                _ => self.start_binop(chars, "<<", Token::ShiftLeft),
1489                            }
1490                        }
1491                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
1492                        Some('-') if self.dialect.supports_geometric_types() => {
1493                            chars.next(); // consume
1494                            match chars.peek() {
1495                                Some('>') => {
1496                                    self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
1497                                }
1498                                _ => self.start_binop_opt(chars, "<-", None),
1499                            }
1500                        }
1501                        Some('^') if self.dialect.supports_geometric_types() => {
1502                            self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
1503                        }
1504                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
1505                        _ => self.start_binop(chars, "<", Token::Lt),
1506                    }
1507                }
1508                '>' => {
1509                    chars.next(); // consume
1510                    match chars.peek() {
1511                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
1512                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
1513                        Some('^') if self.dialect.supports_geometric_types() => {
1514                            self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
1515                        }
1516                        _ => self.start_binop(chars, ">", Token::Gt),
1517                    }
1518                }
1519                ':' => {
1520                    chars.next();
1521                    match chars.peek() {
1522                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
1523                        Some('=') => self.consume_and_return(chars, Token::Assignment),
1524                        _ => Ok(Some(Token::Colon)),
1525                    }
1526                }
1527                ';' => self.consume_and_return(chars, Token::SemiColon),
1528                '\\' => self.consume_and_return(chars, Token::Backslash),
1529                '[' => self.consume_and_return(chars, Token::LBracket),
1530                ']' => self.consume_and_return(chars, Token::RBracket),
1531                '&' => {
1532                    chars.next(); // consume the '&'
1533                    match chars.peek() {
1534                        Some('>') if self.dialect.supports_geometric_types() => {
1535                            chars.next();
1536                            self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
1537                        }
1538                        Some('<') if self.dialect.supports_geometric_types() => {
1539                            chars.next(); // consume
1540                            match chars.peek() {
1541                                Some('|') => self.consume_and_return(
1542                                    chars,
1543                                    Token::AmpersandLeftAngleBracketVerticalBar,
1544                                ),
1545                                _ => {
1546                                    self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
1547                                }
1548                            }
1549                        }
1550                        Some('&') => {
1551                            chars.next(); // consume the second '&'
1552                            self.start_binop(chars, "&&", Token::Overlap)
1553                        }
1554                        // Bitwise AND '&' operator
1555                        _ => self.start_binop(chars, "&", Token::Ampersand),
1556                    }
1557                }
1558                '^' => {
1559                    chars.next(); // consume the '^'
1560                    match chars.peek() {
1561                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
1562                        _ => Ok(Some(Token::Caret)),
1563                    }
1564                }
1565                '{' => self.consume_and_return(chars, Token::LBrace),
1566                '}' => self.consume_and_return(chars, Token::RBrace),
1567                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
1568                {
1569                    chars.next(); // consume the '#', starting a '#'-prefixed single-line comment
1570                    let comment = self.tokenize_single_line_comment(chars);
1571                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1572                        prefix: "#".to_owned(),
1573                        comment,
1574                    })))
1575                }
1576                '~' => {
1577                    chars.next(); // consume
1578                    match chars.peek() {
1579                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
1580                        Some('=') if self.dialect.supports_geometric_types() => {
1581                            self.consume_for_binop(chars, "~=", Token::TildeEqual)
1582                        }
1583                        Some('~') => {
1584                            chars.next();
1585                            match chars.peek() {
1586                                Some('*') => {
1587                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
1588                                }
1589                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
1590                            }
1591                        }
1592                        _ => self.start_binop(chars, "~", Token::Tilde),
1593                    }
1594                }
1595                '#' => {
1596                    chars.next();
1597                    match chars.peek() {
1598                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
1599                        Some('>') => {
1600                            chars.next();
1601                            match chars.peek() {
1602                                Some('>') => {
1603                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
1604                                }
1605                                _ => self.start_binop(chars, "#>", Token::HashArrow),
1606                            }
1607                        }
1608                        Some(' ') => Ok(Some(Token::Sharp)),
1609                        Some('#') if self.dialect.supports_geometric_types() => {
1610                            self.consume_for_binop(chars, "##", Token::DoubleSharp)
1611                        }
1612                        Some(sch) if self.dialect.is_identifier_start('#') => {
1613                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1614                        }
1615                        _ => self.start_binop(chars, "#", Token::Sharp),
1616                    }
1617                }
1618                '@' => {
1619                    chars.next();
1620                    match chars.peek() {
1621                        Some('@') if self.dialect.supports_geometric_types() => {
1622                            self.consume_and_return(chars, Token::AtAt)
1623                        }
1624                        Some('-') if self.dialect.supports_geometric_types() => {
1625                            chars.next();
1626                            match chars.peek() {
1627                                Some('@') => self.consume_and_return(chars, Token::AtDashAt),
1628                                _ => self.start_binop_opt(chars, "@-", None),
1629                            }
1630                        }
1631                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
1632                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
1633                        Some('@') => {
1634                            chars.next();
1635                            match chars.peek() {
1636                                Some(' ') => Ok(Some(Token::AtAt)),
1637                                Some(tch) if self.dialect.is_identifier_start('@') => {
1638                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
1639                                }
1640                                _ => Ok(Some(Token::AtAt)),
1641                            }
1642                        }
1643                        Some(' ') => Ok(Some(Token::AtSign)),
1644                        // We break on quotes here, because no dialect allows identifiers starting
1645                        // with @ and containing quotation marks (e.g. `@'foo'`) unless they are
1646                        // quoted, which is tokenized as a quoted string, not here (e.g.
1647                        // `"@'foo'"`). Further, at least two dialects parse `@` followed by a
1648                        // quoted string as two separate tokens, which this allows. For example,
1649                        // Postgres parses `@'1'` as the absolute value of '1' which is implicitly
1650                        // cast to a numeric type. And when parsing MySQL-style grantees (e.g.
1651                        // `GRANT ALL ON *.* to 'root'@'localhost'`), we also want separate tokens
1652                        // for the user, the `@`, and the host.
1653                        Some('\'') => Ok(Some(Token::AtSign)),
1654                        Some('\"') => Ok(Some(Token::AtSign)),
1655                        Some('`') => Ok(Some(Token::AtSign)),
1656                        Some(sch) if self.dialect.is_identifier_start('@') => {
1657                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1658                        }
1659                        _ => Ok(Some(Token::AtSign)),
1660                    }
1661                }
1662                // Postgres uses ? for jsonb operators, not prepared statements
1663                '?' if self.dialect.supports_geometric_types() => {
1664                    chars.next(); // consume
1665                    match chars.peek() {
1666                        Some('|') => {
1667                            chars.next();
1668                            match chars.peek() {
1669                                Some('|') => self.consume_and_return(
1670                                    chars,
1671                                    Token::QuestionMarkDoubleVerticalBar,
1672                                ),
1673                                _ => Ok(Some(Token::QuestionPipe)),
1674                            }
1675                        }
1676
1677                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
1678                        Some('-') => {
1679                            chars.next(); // consume
1680                            match chars.peek() {
1681                                Some('|') => self
1682                                    .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
1683                                _ => Ok(Some(Token::QuestionMarkDash)),
1684                            }
1685                        }
1686                        Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
1687                        _ => self.consume_and_return(chars, Token::Question),
1688                    }
1689                }
1690                '?' => {
1691                    chars.next();
1692                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
1693                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
1694                }
1695
1696                // identifier or keyword
1697                ch if self.dialect.is_identifier_start(ch) => {
1698                    self.tokenize_identifier_or_keyword([ch], chars)
1699                }
1700                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
1701
1702                // whitespace check (including unicode chars) should be last as it covers some of the chars above
1703                ch if ch.is_whitespace() => {
1704                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
1705                }
1706                other => self.consume_and_return(chars, Token::Char(other)),
1707            },
1708            None => Ok(None),
1709        }
1710    }
1711
1712    /// Consume the next character, then parse a custom binary operator. The consumed character must already be included in `prefix`.
1713    fn consume_for_binop(
1714        &self,
1715        chars: &mut State,
1716        prefix: &str,
1717        default: Token,
1718    ) -> Result<Option<Token>, TokenizerError> {
1719        chars.next(); // consume the first char
1720        self.start_binop_opt(chars, prefix, Some(default))
1721    }
1722
1723    /// Parse a custom binary operator, falling back to `default` when no dialect-specific operator characters follow.
1724    fn start_binop(
1725        &self,
1726        chars: &mut State,
1727        prefix: &str,
1728        default: Token,
1729    ) -> Result<Option<Token>, TokenizerError> {
1730        self.start_binop_opt(chars, prefix, Some(default))
1731    }
1732
1733    /// Parse a custom binary operator. If no custom operator characters follow and `default` is `None`, report a tokenizer error.
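    ///
    /// For example, with a dialect whose `is_custom_operator_part` accepts `?`, the input
    /// `~?` yields `Token::CustomBinaryOperator("~?".to_string())` instead of the default
    /// `Token::Tilde` (illustrative; the accepted characters are dialect-specific).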
1734    fn start_binop_opt(
1735        &self,
1736        chars: &mut State,
1737        prefix: &str,
1738        default: Option<Token>,
1739    ) -> Result<Option<Token>, TokenizerError> {
1740        let mut custom = None;
1741        while let Some(&ch) = chars.peek() {
1742            if !self.dialect.is_custom_operator_part(ch) {
1743                break;
1744            }
1745
1746            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
1747            chars.next();
1748        }
1749        match (custom, default) {
1750            (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
1751            (None, Some(tok)) => Ok(Some(tok)),
1752            (None, None) => self.tokenizer_error(
1753                chars.location(),
1754                format!("Expected a valid binary operator after '{prefix}'"),
1755            ),
1756        }
1757    }
1758
1759    /// Tokenize a dollar-prefixed value (i.e. a dollar-quoted string or a placeholder).
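    ///
    /// For example, `$$abc$$` and `$tag$abc$tag$` become `Token::DollarQuotedString` (unless
    /// the dialect treats `$$` as a placeholder), while `$1` becomes `Token::Placeholder`.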
1760    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
1761        let mut s = String::new();
1762        let mut value = String::new();
1763
1764        chars.next();
1765
1766        // A second `$` starts a dollar-quoted string, unless the dialect treats `$$` as a placeholder.
1767        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
1768            chars.next();
1769
1770            let mut is_terminated = false;
1771            let mut prev: Option<char> = None;
1772
1773            while let Some(&ch) = chars.peek() {
1774                if prev == Some('$') {
1775                    if ch == '$' {
1776                        chars.next();
1777                        is_terminated = true;
1778                        break;
1779                    } else {
1780                        s.push('$');
1781                        s.push(ch);
1782                    }
1783                } else if ch != '$' {
1784                    s.push(ch);
1785                }
1786
1787                prev = Some(ch);
1788                chars.next();
1789            }
1790
1791            return if chars.peek().is_none() && !is_terminated {
1792                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
1793            } else {
1794                Ok(Token::DollarQuotedString(DollarQuotedString {
1795                    value: s,
1796                    tag: None,
1797                }))
1798            };
1799        } else {
1800            value.push_str(&peeking_take_while(chars, |ch| {
1801                ch.is_alphanumeric()
1802                    || ch == '_'
1803                    // Allow $ as a placeholder character if the dialect supports it
1804                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
1805            }));
1806
1807            // A following `$` closes the tag of a dollar-quoted string, so scan for its `$tag$` end delimiter; dialects with dollar placeholders treat the value as a placeholder instead.
1808            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
1809                chars.next();
1810
1811                let mut temp = String::new();
1812                let end_delimiter = format!("${value}$");
1813
1814                loop {
1815                    match chars.next() {
1816                        Some(ch) => {
1817                            temp.push(ch);
1818
1819                            if temp.ends_with(&end_delimiter) {
1820                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
1821                                    s.push_str(temp);
1822                                }
1823                                break;
1824                            }
1825                        }
1826                        None => {
1827                            if temp.ends_with(&end_delimiter) {
1828                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
1829                                    s.push_str(temp);
1830                                }
1831                                break;
1832                            }
1833
1834                            return self.tokenizer_error(
1835                                chars.location(),
1836                                "Unterminated dollar-quoted string, expected $",
1837                            );
1838                        }
1839                    }
1840                }
1841            } else {
1842                return Ok(Token::Placeholder(String::from("$") + &value));
1843            }
1844        }
1845
1846        Ok(Token::DollarQuotedString(DollarQuotedString {
1847            value: s,
1848            tag: if value.is_empty() { None } else { Some(value) },
1849        }))
1850    }
1851
1852    fn tokenizer_error<R>(
1853        &self,
1854        loc: Location,
1855        message: impl Into<String>,
1856    ) -> Result<R, TokenizerError> {
1857        Err(TokenizerError {
1858            message: message.into(),
1859            location: loc,
1860        })
1861    }
1862
1863    // Consume characters until, and including, the newline
1864    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
1865        let mut comment = peeking_take_while(chars, |ch| match ch {
1866            '\n' => false,                                           // Always stop at \n
1867            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres
1868            _ => true, // Keep consuming for other characters
1869        });
1870
1871        if let Some(ch) = chars.next() {
1872            assert!(ch == '\n' || ch == '\r');
1873            comment.push(ch);
1874        }
1875
1876        comment
1877    }
1878
1879    /// Read the rest of an identifier or keyword, given its already-consumed leading character(s).
1880    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
1881        let mut s = first_chars.into();
1882        s.push_str(&peeking_take_while(chars, |ch| {
1883            self.dialect.is_identifier_part(ch)
1884        }));
1885        s
1886    }
1887
1888    /// Read a quoted identifier
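    ///
    /// For example, with `"` as the opening quote, `"foo""bar"` yields `foo"bar`
    /// (or `foo""bar` when unescaping is disabled).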
1889    fn tokenize_quoted_identifier(
1890        &self,
1891        quote_start: char,
1892        chars: &mut State,
1893    ) -> Result<String, TokenizerError> {
1894        let error_loc = chars.location();
1895        chars.next(); // consume the opening quote
1896        let quote_end = Word::matching_end_quote(quote_start);
1897        let (s, last_char) = self.parse_quoted_ident(chars, quote_end);
1898
1899        if last_char == Some(quote_end) {
1900            Ok(s)
1901        } else {
1902            self.tokenizer_error(
1903                error_loc,
1904                format!("Expected close delimiter '{quote_end}' before EOF."),
1905            )
1906        }
1907    }
1908
1909    /// Read a single quoted string, starting with the opening quote.
1910    fn tokenize_escaped_single_quoted_string(
1911        &self,
1912        starting_loc: Location,
1913        chars: &mut State,
1914    ) -> Result<String, TokenizerError> {
1915        if let Some(s) = unescape_single_quoted_string(chars) {
1916            return Ok(s);
1917        }
1918
1919        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
1920    }
1921
1922    /// Reads a string literal quoted by single or triple quote characters.
1923    /// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
1924    fn tokenize_single_or_triple_quoted_string<F>(
1925        &self,
1926        chars: &mut State,
1927        quote_style: char,
1928        backslash_escape: bool,
1929        single_quote_token: F,
1930        triple_quote_token: F,
1931    ) -> Result<Option<Token>, TokenizerError>
1932    where
1933        F: Fn(String) -> Token,
1934    {
1935        let error_loc = chars.location();
1936
1937        let mut num_opening_quotes = 0u8;
1938        for _ in 0..3 {
1939            if Some(&quote_style) == chars.peek() {
1940                chars.next(); // Consume quote.
1941                num_opening_quotes += 1;
1942            } else {
1943                break;
1944            }
1945        }
1946
1947        let (token_fn, num_quote_chars) = match num_opening_quotes {
1948            1 => (single_quote_token, NumStringQuoteChars::One),
1949            2 => {
1950                // If we matched exactly two opening quotes, then this is an empty string.
1951                return Ok(Some(single_quote_token("".into())));
1952            }
1953            3 => {
1954                let Some(num_quote_chars) = NonZeroU8::new(3) else {
1955                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
1956                };
1957                (
1958                    triple_quote_token,
1959                    NumStringQuoteChars::Many(num_quote_chars),
1960                )
1961            }
1962            _ => {
1963                return self.tokenizer_error(error_loc, "invalid string literal opening");
1964            }
1965        };
1966
1967        let settings = TokenizeQuotedStringSettings {
1968            quote_style,
1969            num_quote_chars,
1970            num_opening_quotes_to_consume: 0,
1971            backslash_escape,
1972        };
1973
1974        self.tokenize_quoted_string(chars, settings)
1975            .map(token_fn)
1976            .map(Some)
1977    }
1978
1979    /// Reads a string literal quoted by a single quote character.
1980    fn tokenize_single_quoted_string(
1981        &self,
1982        chars: &mut State,
1983        quote_style: char,
1984        backslash_escape: bool,
1985    ) -> Result<String, TokenizerError> {
1986        self.tokenize_quoted_string(
1987            chars,
1988            TokenizeQuotedStringSettings {
1989                quote_style,
1990                num_quote_chars: NumStringQuoteChars::One,
1991                num_opening_quotes_to_consume: 1,
1992                backslash_escape,
1993            },
1994        )
1995    }
1996
1997    /// Read a quoted string.
1998    fn tokenize_quoted_string(
1999        &self,
2000        chars: &mut State,
2001        settings: TokenizeQuotedStringSettings,
2002    ) -> Result<String, TokenizerError> {
2003        let mut s = String::new();
2004        let error_loc = chars.location();
2005
2006        // Consume any opening quotes.
2007        for _ in 0..settings.num_opening_quotes_to_consume {
2008            if Some(settings.quote_style) != chars.next() {
2009                return self.tokenizer_error(error_loc, "invalid string literal opening");
2010            }
2011        }
2012
2013        let mut num_consecutive_quotes = 0;
2014        while let Some(&ch) = chars.peek() {
2015            let pending_final_quote = match settings.num_quote_chars {
2016                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
2017                n @ NumStringQuoteChars::Many(count)
2018                    if num_consecutive_quotes + 1 == count.get() =>
2019                {
2020                    Some(n)
2021                }
2022                NumStringQuoteChars::Many(_) => None,
2023            };
2024
2025            match ch {
2026                char if char == settings.quote_style && pending_final_quote.is_some() => {
2027                    chars.next(); // consume
2028
2029                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
2030                        // For an initial string like `"""abc"""`, at this point we have
2031                        // `abc""` in the buffer and have now matched the final `"`.
2032                        // However, the string to return is simply `abc`, so we strip off
2033                        // the trailing quotes before returning.
2034                        let mut buf = s.chars();
2035                        for _ in 1..count.get() {
2036                            buf.next_back();
2037                        }
2038                        return Ok(buf.as_str().to_string());
2039                    } else if chars
2040                        .peek()
2041                        .map(|c| *c == settings.quote_style)
2042                        .unwrap_or(false)
2043                    {
2044                        s.push(ch);
2045                        if !self.unescape {
2046                            // In no-escape mode, the given query has to be saved completely
2047                            s.push(ch);
2048                        }
2049                        chars.next();
2050                    } else {
2051                        return Ok(s);
2052                    }
2053                }
2054                '\\' if settings.backslash_escape => {
2055                    // consume backslash
2056                    chars.next();
2057
2058                    num_consecutive_quotes = 0;
2059
2060                    if let Some(next) = chars.peek() {
2061                        if !self.unescape
2062                            || (self.dialect.ignores_wildcard_escapes()
2063                                && (*next == '%' || *next == '_'))
2064                        {
2065                            // In no-escape mode, the given query has to be saved completely
2066                            // including backslashes. Similarly, with `ignores_wildcard_escapes`,
2067                            // the backslash is not stripped.
2068                            s.push(ch);
2069                            s.push(*next);
2070                            chars.next(); // consume next
2071                        } else {
2072                            let n = match next {
2073                                '0' => '\0',
2074                                'a' => '\u{7}',
2075                                'b' => '\u{8}',
2076                                'f' => '\u{c}',
2077                                'n' => '\n',
2078                                'r' => '\r',
2079                                't' => '\t',
2080                                'Z' => '\u{1a}',
2081                                _ => *next,
2082                            };
2083                            s.push(n);
2084                            chars.next(); // consume next
2085                        }
2086                    }
2087                }
2088                ch => {
2089                    chars.next(); // consume ch
2090
2091                    if ch == settings.quote_style {
2092                        num_consecutive_quotes += 1;
2093                    } else {
2094                        num_consecutive_quotes = 0;
2095                    }
2096
2097                    s.push(ch);
2098                }
2099            }
2100        }
2101        self.tokenizer_error(error_loc, "Unterminated string literal")
2102    }
2103
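    /// Read a `/* ... */` comment after the opening `/*` has been consumed, honoring
    /// nested `/* */` pairs when the dialect supports nested comments.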
2104    fn tokenize_multiline_comment(
2105        &self,
2106        chars: &mut State,
2107    ) -> Result<Option<Token>, TokenizerError> {
2108        let mut s = String::new();
2109        let mut nested = 1;
2110        let supports_nested_comments = self.dialect.supports_nested_comments();
2111
2112        loop {
2113            match chars.next() {
2114                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
2115                    chars.next(); // consume the '*'
2116                    s.push('/');
2117                    s.push('*');
2118                    nested += 1;
2119                }
2120                Some('*') if matches!(chars.peek(), Some('/')) => {
2121                    chars.next(); // consume the '/'
2122                    nested -= 1;
2123                    if nested == 0 {
2124                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
2125                    }
2126                    s.push('*');
2127                    s.push('/');
2128                }
2129                Some(ch) => {
2130                    s.push(ch);
2131                }
2132                None => {
2133                    break self.tokenizer_error(
2134                        chars.location(),
2135                        "Unexpected EOF while in a multi-line comment",
2136                    );
2137                }
2138            }
2139        }
2140    }
2141
2142    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
2143        let mut last_char = None;
2144        let mut s = String::new();
2145        while let Some(ch) = chars.next() {
2146            if ch == quote_end {
2147                if chars.peek() == Some(&quote_end) {
2148                    chars.next();
2149                    s.push(ch);
2150                    if !self.unescape {
2151                        // In no-escape mode, the given query has to be saved completely
2152                        s.push(ch);
2153                    }
2154                } else {
2155                    last_char = Some(quote_end);
2156                    break;
2157                }
2158            } else {
2159                s.push(ch);
2160            }
2161        }
2162        (s, last_char)
2163    }
2164
2165    #[allow(clippy::unnecessary_wraps)]
2166    fn consume_and_return(
2167        &self,
2168        chars: &mut State,
2169        t: Token,
2170    ) -> Result<Option<Token>, TokenizerError> {
2171        chars.next();
2172        Ok(Some(t))
2173    }
2174}
2175
2176/// Read from `chars` until `predicate` returns `false` or EOF is hit.
2177/// Return the characters read as String, and keep the first non-matching
2178/// char available as `chars.next()`.
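///
/// For example, `peeking_take_while(chars, |ch| ch.is_ascii_digit())` applied to `123abc`
/// returns `"123"` and leaves `a` as the next character.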
2179fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
2180    let mut s = String::new();
2181    while let Some(&ch) = chars.peek() {
2182        if predicate(ch) {
2183            chars.next(); // consume
2184            s.push(ch);
2185        } else {
2186            break;
2187        }
2188    }
2189    s
2190}
2191
2192/// Same as peeking_take_while, but also passes the next character to the predicate.
2193fn peeking_next_take_while(
2194    chars: &mut State,
2195    mut predicate: impl FnMut(char, Option<char>) -> bool,
2196) -> String {
2197    let mut s = String::new();
2198    while let Some(&ch) = chars.peek() {
2199        let next_char = chars.peekable.clone().nth(1);
2200        if predicate(ch, next_char) {
2201            chars.next(); // consume
2202            s.push(ch);
2203        } else {
2204            break;
2205        }
2206    }
2207    s
2208}
2209
2210fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
2211    Unescape::new(chars).unescape()
2212}
2213
2214struct Unescape<'a: 'b, 'b> {
2215    chars: &'b mut State<'a>,
2216}
2217
2218impl<'a: 'b, 'b> Unescape<'a, 'b> {
2219    fn new(chars: &'b mut State<'a>) -> Self {
2220        Self { chars }
2221    }
2222    fn unescape(mut self) -> Option<String> {
2223        let mut unescaped = String::new();
2224
2225        self.chars.next();
2226
2227        while let Some(c) = self.chars.next() {
2228            if c == '\'' {
2229                // case: ''''
2230                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
2231                    self.chars.next();
2232                    unescaped.push('\'');
2233                    continue;
2234                }
2235                return Some(unescaped);
2236            }
2237
2238            if c != '\\' {
2239                unescaped.push(c);
2240                continue;
2241            }
2242
2243            let c = match self.chars.next()? {
2244                'b' => '\u{0008}',
2245                'f' => '\u{000C}',
2246                'n' => '\n',
2247                'r' => '\r',
2248                't' => '\t',
2249                'u' => self.unescape_unicode_16()?,
2250                'U' => self.unescape_unicode_32()?,
2251                'x' => self.unescape_hex()?,
2252                c if c.is_digit(8) => self.unescape_octal(c)?,
2253                c => c,
2254            };
2255
2256            unescaped.push(Self::check_null(c)?);
2257        }
2258
2259        None
2260    }
2261
2262    #[inline]
2263    fn check_null(c: char) -> Option<char> {
2264        if c == '\0' {
2265            None
2266        } else {
2267            Some(c)
2268        }
2269    }
2270
2271    #[inline]
2272    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
2273        // u32 is used here because Postgres masks the value on overflow (the `& 0xFF` below) rather than raising an error.
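        // For example, with RADIX = 16 the input "41" decodes to 'A'; values above 0x7F are
        // rejected because only ASCII results are accepted here.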
2274        match u32::from_str_radix(s, RADIX) {
2275            Err(_) => None,
2276            Ok(n) => {
2277                let n = n & 0xFF;
2278                if n <= 127 {
2279                    char::from_u32(n)
2280                } else {
2281                    None
2282                }
2283            }
2284        }
2285    }
2286
2287    // Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
2288    fn unescape_hex(&mut self) -> Option<char> {
2289        let mut s = String::new();
2290
2291        for _ in 0..2 {
2292            match self.next_hex_digit() {
2293                Some(c) => s.push(c),
2294                None => break,
2295            }
2296        }
2297
2298        if s.is_empty() {
2299            return Some('x');
2300        }
2301
2302        Self::byte_to_char::<16>(&s)
2303    }
2304
2305    #[inline]
2306    fn next_hex_digit(&mut self) -> Option<char> {
2307        match self.chars.peek() {
2308            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
2309            _ => None,
2310        }
2311    }
2312
2313    // Octal byte value. \o, \oo, \ooo (o = 0–7)
2314    fn unescape_octal(&mut self, c: char) -> Option<char> {
2315        let mut s = String::new();
2316
2317        s.push(c);
2318        for _ in 0..2 {
2319            match self.next_octal_digit() {
2320                Some(c) => s.push(c),
2321                None => break,
2322            }
2323        }
2324
2325        Self::byte_to_char::<8>(&s)
2326    }
2327
2328    #[inline]
2329    fn next_octal_digit(&mut self) -> Option<char> {
2330        match self.chars.peek() {
2331            Some(c) if c.is_digit(8) => self.chars.next(),
2332            _ => None,
2333        }
2334    }
2335
2336    // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
2337    fn unescape_unicode_16(&mut self) -> Option<char> {
2338        self.unescape_unicode::<4>()
2339    }
2340
2341    // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
2342    fn unescape_unicode_32(&mut self) -> Option<char> {
2343        self.unescape_unicode::<8>()
2344    }
2345
2346    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
2347        let mut s = String::new();
2348        for _ in 0..NUM {
2349            s.push(self.chars.next()?);
2350        }
2351        match u32::from_str_radix(&s, 16) {
2352            Err(_) => None,
2353            Ok(n) => char::from_u32(n),
2354        }
2355    }
2356}
2357
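/// Decode the body of a Unicode-escaped single-quoted string (e.g. the quoted part of
/// `U&'d\0061t\+000061'`), handling `''`, `\\`, 4-digit `\XXXX`, and 6-digit `\+XXXXXX`
/// escapes, with `\` assumed as the escape character.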
2358fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
2359    let mut unescaped = String::new();
2360    chars.next(); // consume the opening quote
2361    while let Some(c) = chars.next() {
2362        match c {
2363            '\'' => {
2364                if chars.peek() == Some(&'\'') {
2365                    chars.next();
2366                    unescaped.push('\'');
2367                } else {
2368                    return Ok(unescaped);
2369                }
2370            }
2371            '\\' => match chars.peek() {
2372                Some('\\') => {
2373                    chars.next();
2374                    unescaped.push('\\');
2375                }
2376                Some('+') => {
2377                    chars.next();
2378                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
2379                }
2380                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
2381            },
2382            _ => {
2383                unescaped.push(c);
2384            }
2385        }
2386    }
2387    Err(TokenizerError {
2388        message: "Unterminated unicode encoded string literal".to_string(),
2389        location: chars.location(),
2390    })
2391}
2392
2393fn take_char_from_hex_digits(
2394    chars: &mut State<'_>,
2395    max_digits: usize,
2396) -> Result<char, TokenizerError> {
2397    let mut result = 0u32;
2398    for _ in 0..max_digits {
2399        let next_char = chars.next().ok_or_else(|| TokenizerError {
2400            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
2401                .to_string(),
2402            location: chars.location(),
2403        })?;
2404        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
2405            message: format!("Invalid hex digit in escaped unicode string: {next_char}"),
2406            location: chars.location(),
2407        })?;
2408        result = result * 16 + digit;
2409    }
2410    char::from_u32(result).ok_or_else(|| TokenizerError {
2411        message: format!("Invalid unicode character: {result:x}"),
2412        location: chars.location(),
2413    })
2414}
2415
2416#[cfg(test)]
2417mod tests {
2418    use super::*;
2419    use crate::dialect::{
2420        BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
2421    };
2422    use crate::test_utils::all_dialects_where;
2423    use core::fmt::Debug;
2424
2425    #[test]
2426    fn tokenizer_error_impl() {
2427        let err = TokenizerError {
2428            message: "test".into(),
2429            location: Location { line: 1, column: 1 },
2430        };
2431        #[cfg(feature = "std")]
2432        {
2433            use std::error::Error;
2434            assert!(err.source().is_none());
2435        }
2436        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
2437    }
2438
2439    #[test]
2440    fn tokenize_select_1() {
2441        let sql = String::from("SELECT 1");
2442        let dialect = GenericDialect {};
2443        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2444
2445        let expected = vec![
2446            Token::make_keyword("SELECT"),
2447            Token::Whitespace(Whitespace::Space),
2448            Token::Number(String::from("1"), false),
2449        ];
2450
2451        compare(expected, tokens);
2452    }
2453
2454    #[test]
2455    fn tokenize_select_float() {
2456        let sql = String::from("SELECT .1");
2457        let dialect = GenericDialect {};
2458        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2459
2460        let expected = vec![
2461            Token::make_keyword("SELECT"),
2462            Token::Whitespace(Whitespace::Space),
2463            Token::Number(String::from(".1"), false),
2464        ];
2465
2466        compare(expected, tokens);
2467    }
2468
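    // A minimal example showing how a `0x`-prefixed literal is tokenized as
    // `Token::HexStringLiteral` by the numeric-literal handling above.
    #[test]
    fn tokenize_hex_literal_example() {
        let sql = String::from("SELECT 0x1F2A");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::HexStringLiteral(String::from("1F2A")),
        ];

        compare(expected, tokens);
    }
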
2469    #[test]
2470    fn tokenize_clickhouse_double_equal() {
2471        let sql = String::from("SELECT foo=='1'");
2472        let dialect = ClickHouseDialect {};
2473        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2474        let tokens = tokenizer.tokenize().unwrap();
2475
2476        let expected = vec![
2477            Token::make_keyword("SELECT"),
2478            Token::Whitespace(Whitespace::Space),
2479            Token::Word(Word {
2480                value: "foo".to_string(),
2481                quote_style: None,
2482                keyword: Keyword::NoKeyword,
2483            }),
2484            Token::DoubleEq,
2485            Token::SingleQuotedString("1".to_string()),
2486        ];
2487
2488        compare(expected, tokens);
2489    }
2490
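    // A minimal sketch of dollar-quoted string tokenization, assuming `PostgreSqlDialect`
    // does not treat `$$`/`$tag$` as placeholders.
    #[test]
    fn tokenize_dollar_quoted_string_example() {
        let sql = String::from("SELECT $$hello$$, $tag$world$tag$");
        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "hello".to_string(),
                tag: None,
            }),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "world".to_string(),
                tag: Some("tag".to_string()),
            }),
        ];

        compare(expected, tokens);
    }
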
2491    #[test]
2492    fn tokenize_numeric_literal_underscore() {
2493        let dialect = GenericDialect {};
2494        let sql = String::from("SELECT 10_000");
2495        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2496        let tokens = tokenizer.tokenize().unwrap();
2497        let expected = vec![
2498            Token::make_keyword("SELECT"),
2499            Token::Whitespace(Whitespace::Space),
2500            Token::Number("10".to_string(), false),
2501            Token::make_word("_000", None),
2502        ];
2503        compare(expected, tokens);
2504
2505        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
2506            "SELECT 10_000, _10_000, 10_00_, 10___0",
2507            vec![
2508                Token::make_keyword("SELECT"),
2509                Token::Whitespace(Whitespace::Space),
2510                Token::Number("10_000".to_string(), false),
2511                Token::Comma,
2512                Token::Whitespace(Whitespace::Space),
2513                Token::make_word("_10_000", None), // leading underscore tokenizes as a word (parsed as column identifier)
2514                Token::Comma,
2515                Token::Whitespace(Whitespace::Space),
2516                Token::Number("10_00".to_string(), false),
2517                Token::make_word("_", None), // the trailing underscore tokenizes as a word (syntax error in some dialects)
2518                Token::Comma,
2519                Token::Whitespace(Whitespace::Space),
2520                Token::Number("10".to_string(), false),
2521                Token::make_word("___0", None), // consecutive underscores tokenize as a word (syntax error in some dialects)
2522            ],
2523        );
2524    }
2525
2526    #[test]
2527    fn tokenize_select_exponent() {
2528        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
2529        let dialect = GenericDialect {};
2530        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2531
2532        let expected = vec![
2533            Token::make_keyword("SELECT"),
2534            Token::Whitespace(Whitespace::Space),
2535            Token::Number(String::from("1e10"), false),
2536            Token::Comma,
2537            Token::Whitespace(Whitespace::Space),
2538            Token::Number(String::from("1e-10"), false),
2539            Token::Comma,
2540            Token::Whitespace(Whitespace::Space),
2541            Token::Number(String::from("1e+10"), false),
2542            Token::Comma,
2543            Token::Whitespace(Whitespace::Space),
2544            Token::Number(String::from("1"), false),
2545            Token::make_word("ea", None),
2546            Token::Comma,
2547            Token::Whitespace(Whitespace::Space),
2548            Token::Number(String::from("1e-10"), false),
2549            Token::make_word("a", None),
2550            Token::Comma,
2551            Token::Whitespace(Whitespace::Space),
2552            Token::Number(String::from("1e-10"), false),
2553            Token::Minus,
2554            Token::Number(String::from("10"), false),
2555        ];
2556
2557        compare(expected, tokens);
2558    }
2559
2560    #[test]
2561    fn tokenize_scalar_function() {
2562        let sql = String::from("SELECT sqrt(1)");
2563        let dialect = GenericDialect {};
2564        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2565
2566        let expected = vec![
2567            Token::make_keyword("SELECT"),
2568            Token::Whitespace(Whitespace::Space),
2569            Token::make_word("sqrt", None),
2570            Token::LParen,
2571            Token::Number(String::from("1"), false),
2572            Token::RParen,
2573        ];
2574
2575        compare(expected, tokens);
2576    }
2577
2578    #[test]
2579    fn tokenize_string_string_concat() {
2580        let sql = String::from("SELECT 'a' || 'b'");
2581        let dialect = GenericDialect {};
2582        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2583
2584        let expected = vec![
2585            Token::make_keyword("SELECT"),
2586            Token::Whitespace(Whitespace::Space),
2587            Token::SingleQuotedString(String::from("a")),
2588            Token::Whitespace(Whitespace::Space),
2589            Token::StringConcat,
2590            Token::Whitespace(Whitespace::Space),
2591            Token::SingleQuotedString(String::from("b")),
2592        ];
2593
2594        compare(expected, tokens);
2595    }
2596    #[test]
2597    fn tokenize_bitwise_op() {
2598        let sql = String::from("SELECT one | two ^ three");
2599        let dialect = GenericDialect {};
2600        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2601
2602        let expected = vec![
2603            Token::make_keyword("SELECT"),
2604            Token::Whitespace(Whitespace::Space),
2605            Token::make_word("one", None),
2606            Token::Whitespace(Whitespace::Space),
2607            Token::Pipe,
2608            Token::Whitespace(Whitespace::Space),
2609            Token::make_word("two", None),
2610            Token::Whitespace(Whitespace::Space),
2611            Token::Caret,
2612            Token::Whitespace(Whitespace::Space),
2613            Token::make_word("three", None),
2614        ];
2615        compare(expected, tokens);
2616    }
2617
2618    #[test]
2619    fn tokenize_logical_xor() {
2620        let sql =
2621            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
2622        let dialect = GenericDialect {};
2623        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2624
2625        let expected = vec![
2626            Token::make_keyword("SELECT"),
2627            Token::Whitespace(Whitespace::Space),
2628            Token::make_keyword("true"),
2629            Token::Whitespace(Whitespace::Space),
2630            Token::make_keyword("XOR"),
2631            Token::Whitespace(Whitespace::Space),
2632            Token::make_keyword("true"),
2633            Token::Comma,
2634            Token::Whitespace(Whitespace::Space),
2635            Token::make_keyword("false"),
2636            Token::Whitespace(Whitespace::Space),
2637            Token::make_keyword("XOR"),
2638            Token::Whitespace(Whitespace::Space),
2639            Token::make_keyword("false"),
2640            Token::Comma,
2641            Token::Whitespace(Whitespace::Space),
2642            Token::make_keyword("true"),
2643            Token::Whitespace(Whitespace::Space),
2644            Token::make_keyword("XOR"),
2645            Token::Whitespace(Whitespace::Space),
2646            Token::make_keyword("false"),
2647            Token::Comma,
2648            Token::Whitespace(Whitespace::Space),
2649            Token::make_keyword("false"),
2650            Token::Whitespace(Whitespace::Space),
2651            Token::make_keyword("XOR"),
2652            Token::Whitespace(Whitespace::Space),
2653            Token::make_keyword("true"),
2654        ];
2655        compare(expected, tokens);
2656    }
2657
2658    #[test]
2659    fn tokenize_simple_select() {
2660        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
2661        let dialect = GenericDialect {};
2662        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2663
2664        let expected = vec![
2665            Token::make_keyword("SELECT"),
2666            Token::Whitespace(Whitespace::Space),
2667            Token::Mul,
2668            Token::Whitespace(Whitespace::Space),
2669            Token::make_keyword("FROM"),
2670            Token::Whitespace(Whitespace::Space),
2671            Token::make_word("customer", None),
2672            Token::Whitespace(Whitespace::Space),
2673            Token::make_keyword("WHERE"),
2674            Token::Whitespace(Whitespace::Space),
2675            Token::make_word("id", None),
2676            Token::Whitespace(Whitespace::Space),
2677            Token::Eq,
2678            Token::Whitespace(Whitespace::Space),
2679            Token::Number(String::from("1"), false),
2680            Token::Whitespace(Whitespace::Space),
2681            Token::make_keyword("LIMIT"),
2682            Token::Whitespace(Whitespace::Space),
2683            Token::Number(String::from("5"), false),
2684        ];
2685
2686        compare(expected, tokens);
2687    }
2688
2689    #[test]
2690    fn tokenize_explain_select() {
2691        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
2692        let dialect = GenericDialect {};
2693        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2694
2695        let expected = vec![
2696            Token::make_keyword("EXPLAIN"),
2697            Token::Whitespace(Whitespace::Space),
2698            Token::make_keyword("SELECT"),
2699            Token::Whitespace(Whitespace::Space),
2700            Token::Mul,
2701            Token::Whitespace(Whitespace::Space),
2702            Token::make_keyword("FROM"),
2703            Token::Whitespace(Whitespace::Space),
2704            Token::make_word("customer", None),
2705            Token::Whitespace(Whitespace::Space),
2706            Token::make_keyword("WHERE"),
2707            Token::Whitespace(Whitespace::Space),
2708            Token::make_word("id", None),
2709            Token::Whitespace(Whitespace::Space),
2710            Token::Eq,
2711            Token::Whitespace(Whitespace::Space),
2712            Token::Number(String::from("1"), false),
2713        ];
2714
2715        compare(expected, tokens);
2716    }
2717
2718    #[test]
2719    fn tokenize_explain_analyze_select() {
2720        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
2721        let dialect = GenericDialect {};
2722        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2723
2724        let expected = vec![
2725            Token::make_keyword("EXPLAIN"),
2726            Token::Whitespace(Whitespace::Space),
2727            Token::make_keyword("ANALYZE"),
2728            Token::Whitespace(Whitespace::Space),
2729            Token::make_keyword("SELECT"),
2730            Token::Whitespace(Whitespace::Space),
2731            Token::Mul,
2732            Token::Whitespace(Whitespace::Space),
2733            Token::make_keyword("FROM"),
2734            Token::Whitespace(Whitespace::Space),
2735            Token::make_word("customer", None),
2736            Token::Whitespace(Whitespace::Space),
2737            Token::make_keyword("WHERE"),
2738            Token::Whitespace(Whitespace::Space),
2739            Token::make_word("id", None),
2740            Token::Whitespace(Whitespace::Space),
2741            Token::Eq,
2742            Token::Whitespace(Whitespace::Space),
2743            Token::Number(String::from("1"), false),
2744        ];
2745
2746        compare(expected, tokens);
2747    }
2748
2749    #[test]
2750    fn tokenize_string_predicate() {
2751        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
2752        let dialect = GenericDialect {};
2753        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2754
2755        let expected = vec![
2756            Token::make_keyword("SELECT"),
2757            Token::Whitespace(Whitespace::Space),
2758            Token::Mul,
2759            Token::Whitespace(Whitespace::Space),
2760            Token::make_keyword("FROM"),
2761            Token::Whitespace(Whitespace::Space),
2762            Token::make_word("customer", None),
2763            Token::Whitespace(Whitespace::Space),
2764            Token::make_keyword("WHERE"),
2765            Token::Whitespace(Whitespace::Space),
2766            Token::make_word("salary", None),
2767            Token::Whitespace(Whitespace::Space),
2768            Token::Neq,
2769            Token::Whitespace(Whitespace::Space),
2770            Token::SingleQuotedString(String::from("Not Provided")),
2771        ];
2772
2773        compare(expected, tokens);
2774    }
2775
2776    #[test]
2777    fn tokenize_invalid_string() {
2778        let sql = String::from("\n💝مصطفىh");
2779
2780        let dialect = GenericDialect {};
2781        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2782        // println!("tokens: {:#?}", tokens);
2783        let expected = vec![
2784            Token::Whitespace(Whitespace::Newline),
2785            Token::Char('💝'),
2786            Token::make_word("مصطفىh", None),
2787        ];
2788        compare(expected, tokens);
2789    }
2790
2791    #[test]
2792    fn tokenize_newline_in_string_literal() {
2793        let sql = String::from("'foo\r\nbar\nbaz'");
2794
2795        let dialect = GenericDialect {};
2796        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2797        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
2798        compare(expected, tokens);
2799    }
2800
2801    #[test]
2802    fn tokenize_unterminated_string_literal() {
2803        let sql = String::from("select 'foo");
2804
2805        let dialect = GenericDialect {};
2806        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2807        assert_eq!(
2808            tokenizer.tokenize(),
2809            Err(TokenizerError {
2810                message: "Unterminated string literal".to_string(),
2811                location: Location { line: 1, column: 8 },
2812            })
2813        );
2814    }
2815
2816    #[test]
2817    fn tokenize_unterminated_string_literal_utf8() {
2818        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");
2819
2820        let dialect = GenericDialect {};
2821        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2822        assert_eq!(
2823            tokenizer.tokenize(),
2824            Err(TokenizerError {
2825                message: "Unterminated string literal".to_string(),
2826                location: Location {
2827                    line: 1,
2828                    column: 35
2829                }
2830            })
2831        );
2832    }
2833
2834    #[test]
2835    fn tokenize_invalid_string_cols() {
2836        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");
2837
2838        let dialect = GenericDialect {};
2839        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2840        // println!("tokens: {:#?}", tokens);
2841        let expected = vec![
2842            Token::Whitespace(Whitespace::Newline),
2843            Token::Whitespace(Whitespace::Newline),
2844            Token::make_keyword("SELECT"),
2845            Token::Whitespace(Whitespace::Space),
2846            Token::Mul,
2847            Token::Whitespace(Whitespace::Space),
2848            Token::make_keyword("FROM"),
2849            Token::Whitespace(Whitespace::Space),
2850            Token::make_keyword("table"),
2851            Token::Whitespace(Whitespace::Tab),
2852            Token::Char('💝'),
2853            Token::make_word("مصطفىh", None),
2854        ];
2855        compare(expected, tokens);
2856    }
2857
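    // Note: for tagged dollar quoting, everything between `$tag$` and the matching `$tag$` is
    // kept verbatim, including single `$` characters, `$$`, and differently named inner tags,
    // as the cases below demonstrate.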
2858    #[test]
2859    fn tokenize_dollar_quoted_string_tagged() {
2860        let test_cases = vec![
2861            (
2862                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
2863                vec![
2864                    Token::make_keyword("SELECT"),
2865                    Token::Whitespace(Whitespace::Space),
2866                    Token::DollarQuotedString(DollarQuotedString {
2867                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
2868                        tag: Some("tag".into()),
2869                    })
2870                ]
2871            ),
2872            (
2873                String::from("SELECT $abc$x$ab$abc$"),
2874                vec![
2875                    Token::make_keyword("SELECT"),
2876                    Token::Whitespace(Whitespace::Space),
2877                    Token::DollarQuotedString(DollarQuotedString {
2878                        value: "x$ab".into(),
2879                        tag: Some("abc".into()),
2880                    })
2881                ]
2882            ),
2883            (
2884                String::from("SELECT $abc$$abc$"),
2885                vec![
2886                    Token::make_keyword("SELECT"),
2887                    Token::Whitespace(Whitespace::Space),
2888                    Token::DollarQuotedString(DollarQuotedString {
2889                        value: "".into(),
2890                        tag: Some("abc".into()),
2891                    })
2892                ]
2893            ),
2894            (
2895                String::from("0$abc$$abc$1"),
2896                vec![
2897                    Token::Number("0".into(), false),
2898                    Token::DollarQuotedString(DollarQuotedString {
2899                        value: "".into(),
2900                        tag: Some("abc".into()),
2901                    }),
2902                    Token::Number("1".into(), false),
2903                ]
2904            ),
2905            (
2906                String::from("$function$abc$q$data$q$$function$"),
2907                vec![
2908                    Token::DollarQuotedString(DollarQuotedString {
2909                        value: "abc$q$data$q$".into(),
2910                        tag: Some("function".into()),
2911                    }),
2912                ]
2913            ),
2914        ];
2915
2916        let dialect = GenericDialect {};
2917        for (sql, expected) in test_cases {
2918            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2919            compare(expected, tokens);
2920        }
2921    }
2922
2923    #[test]
2924    fn tokenize_dollar_quoted_string_tagged_unterminated() {
2925        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
2926        let dialect = GenericDialect {};
2927        assert_eq!(
2928            Tokenizer::new(&dialect, &sql).tokenize(),
2929            Err(TokenizerError {
2930                message: "Unterminated dollar-quoted, expected $".into(),
2931                location: Location {
2932                    line: 1,
2933                    column: 91
2934                }
2935            })
2936        );
2937    }
2938
2939    #[test]
2940    fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
2941        let sql = String::from("SELECT $abc$abc$");
2942        let dialect = GenericDialect {};
2943        assert_eq!(
2944            Tokenizer::new(&dialect, &sql).tokenize(),
2945            Err(TokenizerError {
2946                message: "Unterminated dollar-quoted, expected $".into(),
2947                location: Location {
2948                    line: 1,
2949                    column: 17
2950                }
2951            })
2952        );
2953    }
2954
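    // Note: the SQLite dialect used below does not treat `$...$` as dollar quoting; each `$`
    // sequence instead tokenizes as a Placeholder, as the expected tokens show.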
2955    #[test]
2956    fn tokenize_dollar_placeholder() {
2957        let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
2958        let dialect = SQLiteDialect {};
2959        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2960        assert_eq!(
2961            tokens,
2962            vec![
2963                Token::make_keyword("SELECT"),
2964                Token::Whitespace(Whitespace::Space),
2965                Token::Placeholder("$$".into()),
2966                Token::Comma,
2967                Token::Whitespace(Whitespace::Space),
2968                Token::Placeholder("$$ABC$$".into()),
2969                Token::Comma,
2970                Token::Whitespace(Whitespace::Space),
2971                Token::Placeholder("$ABC$".into()),
2972                Token::Comma,
2973                Token::Whitespace(Whitespace::Space),
2974                Token::Placeholder("$ABC".into()),
2975            ]
2976        );
2977    }
2978
2979    #[test]
2980    fn tokenize_nested_dollar_quoted_strings() {
2981        let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
2982        let dialect = GenericDialect {};
2983        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2984        let expected = vec![
2985            Token::make_keyword("SELECT"),
2986            Token::Whitespace(Whitespace::Space),
2987            Token::DollarQuotedString(DollarQuotedString {
2988                value: "dollar $nested$ string".into(),
2989                tag: Some("tag".into()),
2990            }),
2991        ];
2992        compare(expected, tokens);
2993    }
2994
2995    #[test]
2996    fn tokenize_dollar_quoted_string_untagged_empty() {
2997        let sql = String::from("SELECT $$$$");
2998        let dialect = GenericDialect {};
2999        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3000        let expected = vec![
3001            Token::make_keyword("SELECT"),
3002            Token::Whitespace(Whitespace::Space),
3003            Token::DollarQuotedString(DollarQuotedString {
3004                value: "".into(),
3005                tag: None,
3006            }),
3007        ];
3008        compare(expected, tokens);
3009    }
3010
3011    #[test]
3012    fn tokenize_dollar_quoted_string_untagged() {
3013        let sql =
3014            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
3015        let dialect = GenericDialect {};
3016        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3017        let expected = vec![
3018            Token::make_keyword("SELECT"),
3019            Token::Whitespace(Whitespace::Space),
3020            Token::DollarQuotedString(DollarQuotedString {
3021                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
3022                tag: None,
3023            }),
3024        ];
3025        compare(expected, tokens);
3026    }
3027
3028    #[test]
3029    fn tokenize_dollar_quoted_string_untagged_unterminated() {
3030        let sql = String::from(
3031            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
3032        );
3033        let dialect = GenericDialect {};
3034        assert_eq!(
3035            Tokenizer::new(&dialect, &sql).tokenize(),
3036            Err(TokenizerError {
3037                message: "Unterminated dollar-quoted string".into(),
3038                location: Location {
3039                    line: 1,
3040                    column: 86
3041                }
3042            })
3043        );
3044    }
3045
3046    #[test]
3047    fn tokenize_right_arrow() {
3048        let sql = String::from("FUNCTION(key=>value)");
3049        let dialect = GenericDialect {};
3050        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3051        let expected = vec![
3052            Token::make_word("FUNCTION", None),
3053            Token::LParen,
3054            Token::make_word("key", None),
3055            Token::RArrow,
3056            Token::make_word("value", None),
3057            Token::RParen,
3058        ];
3059        compare(expected, tokens);
3060    }
3061
3062    #[test]
3063    fn tokenize_is_null() {
3064        let sql = String::from("a IS NULL");
3065        let dialect = GenericDialect {};
3066        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3067
3068        let expected = vec![
3069            Token::make_word("a", None),
3070            Token::Whitespace(Whitespace::Space),
3071            Token::make_keyword("IS"),
3072            Token::Whitespace(Whitespace::Space),
3073            Token::make_keyword("NULL"),
3074        ];
3075
3076        compare(expected, tokens);
3077    }
3078
3079    #[test]
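    // Note: in the cases below, `\n` and `\r\n` terminate a `--` comment (and are included in
    // its text), while a lone `\r` does not terminate it in the generic dialect, so in the
    // second case the trailing "1" is swallowed into the comment text.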
3080    fn tokenize_comment() {
3081        let test_cases = vec![
3082            (
3083                String::from("0--this is a comment\n1"),
3084                vec![
3085                    Token::Number("0".to_string(), false),
3086                    Token::Whitespace(Whitespace::SingleLineComment {
3087                        prefix: "--".to_string(),
3088                        comment: "this is a comment\n".to_string(),
3089                    }),
3090                    Token::Number("1".to_string(), false),
3091                ],
3092            ),
3093            (
3094                String::from("0--this is a comment\r1"),
3095                vec![
3096                    Token::Number("0".to_string(), false),
3097                    Token::Whitespace(Whitespace::SingleLineComment {
3098                        prefix: "--".to_string(),
3099                        comment: "this is a comment\r1".to_string(),
3100                    }),
3101                ],
3102            ),
3103            (
3104                String::from("0--this is a comment\r\n1"),
3105                vec![
3106                    Token::Number("0".to_string(), false),
3107                    Token::Whitespace(Whitespace::SingleLineComment {
3108                        prefix: "--".to_string(),
3109                        comment: "this is a comment\r\n".to_string(),
3110                    }),
3111                    Token::Number("1".to_string(), false),
3112                ],
3113            ),
3114        ];
3115
3116        let dialect = GenericDialect {};
3117
3118        for (sql, expected) in test_cases {
3119            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3120            compare(expected, tokens);
3121        }
3122    }
3123
3124    #[test]
3125    fn tokenize_comment_postgres() {
3126        let sql = String::from("1--\r0");
3127
3128        let dialect = PostgreSqlDialect {};
3129        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3130        let expected = vec![
3131            Token::Number("1".to_string(), false),
3132            Token::Whitespace(Whitespace::SingleLineComment {
3133                prefix: "--".to_string(),
3134                comment: "\r".to_string(),
3135            }),
3136            Token::Number("0".to_string(), false),
3137        ];
3138        compare(expected, tokens);
3139    }
3140
3141    #[test]
3142    fn tokenize_comment_at_eof() {
3143        let sql = String::from("--this is a comment");
3144
3145        let dialect = GenericDialect {};
3146        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3147        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
3148            prefix: "--".to_string(),
3149            comment: "this is a comment".to_string(),
3150        })];
3151        compare(expected, tokens);
3152    }
3153
3154    #[test]
3155    fn tokenize_multiline_comment() {
3156        let sql = String::from("0/*multi-line\n* /comment*/1");
3157
3158        let dialect = GenericDialect {};
3159        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3160        let expected = vec![
3161            Token::Number("0".to_string(), false),
3162            Token::Whitespace(Whitespace::MultiLineComment(
3163                "multi-line\n* /comment".to_string(),
3164            )),
3165            Token::Number("1".to_string(), false),
3166        ];
3167        compare(expected, tokens);
3168    }
3169
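    // Note: the generic dialect tracks `/* ... */` nesting depth, so inner `/*` openers must be
    // balanced before the outer comment ends; any unbalanced `*/` left over after the comment
    // closes tokenizes as separate Mul and Div tokens, as the first case below shows.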
3170    #[test]
3171    fn tokenize_nested_multiline_comment() {
3172        let dialect = GenericDialect {};
3173        let test_cases = vec![
3174            (
3175                "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
3176                vec![
3177                    Token::Number("0".to_string(), false),
3178                    Token::Whitespace(Whitespace::MultiLineComment(
3179                        "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
3180                    )),
3181                    Token::Whitespace(Whitespace::Space),
3182                    Token::Div,
3183                    Token::Word(Word {
3184                        value: "comment".to_string(),
3185                        quote_style: None,
3186                        keyword: Keyword::COMMENT,
3187                    }),
3188                    Token::Mul,
3189                    Token::Div,
3190                    Token::Number("1".to_string(), false),
3191                ],
3192            ),
3193            (
3194                "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
3195                vec![
3196                    Token::Number("0".to_string(), false),
3197                    Token::Whitespace(Whitespace::MultiLineComment(
3198                        "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
3199                    )),
3200                    Token::Number("1".to_string(), false),
3201                ],
3202            ),
3203            (
3204                "SELECT 1/* a /* b */ c */0",
3205                vec![
3206                    Token::make_keyword("SELECT"),
3207                    Token::Whitespace(Whitespace::Space),
3208                    Token::Number("1".to_string(), false),
3209                    Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
3210                    Token::Number("0".to_string(), false),
3211                ],
3212            ),
3213        ];
3214
3215        for (sql, expected) in test_cases {
3216            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3217            compare(expected, tokens);
3218        }
3219    }
3220
3221    #[test]
3222    fn tokenize_nested_multiline_comment_empty() {
3223        let sql = "select 1/*/**/*/0";
3224
3225        let dialect = GenericDialect {};
3226        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3227        let expected = vec![
3228            Token::make_keyword("select"),
3229            Token::Whitespace(Whitespace::Space),
3230            Token::Number("1".to_string(), false),
3231            Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
3232            Token::Number("0".to_string(), false),
3233        ];
3234
3235        compare(expected, tokens);
3236    }
3237
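    // Note: SQLite does not support nested block comments, so in the input below the first `*/`
    // closes the comment and the second `*/` tokenizes as Mul followed by Div.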
3238    #[test]
3239    fn tokenize_nested_comments_if_not_supported() {
3240        let dialect = SQLiteDialect {};
3241        let sql = "SELECT 1/*/* nested comment */*/0";
3242        let tokens = Tokenizer::new(&dialect, sql).tokenize();
3243        let expected = vec![
3244            Token::make_keyword("SELECT"),
3245            Token::Whitespace(Whitespace::Space),
3246            Token::Number("1".to_string(), false),
3247            Token::Whitespace(Whitespace::MultiLineComment(
3248                "/* nested comment ".to_string(),
3249            )),
3250            Token::Mul,
3251            Token::Div,
3252            Token::Number("0".to_string(), false),
3253        ];
3254
3255        compare(expected, tokens.unwrap());
3256    }
3257
3258    #[test]
3259    fn tokenize_multiline_comment_with_even_asterisks() {
3260        let sql = String::from("\n/** Comment **/\n");
3261
3262        let dialect = GenericDialect {};
3263        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3264        let expected = vec![
3265            Token::Whitespace(Whitespace::Newline),
3266            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
3267            Token::Whitespace(Whitespace::Newline),
3268        ];
3269        compare(expected, tokens);
3270    }
3271
3272    #[test]
3273    fn tokenize_unicode_whitespace() {
3274        let sql = String::from(" \u{2003}\n");
3275
3276        let dialect = GenericDialect {};
3277        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3278        let expected = vec![
3279            Token::Whitespace(Whitespace::Space),
3280            Token::Whitespace(Whitespace::Space),
3281            Token::Whitespace(Whitespace::Newline),
3282        ];
3283        compare(expected, tokens);
3284    }
3285
3286    #[test]
3287    fn tokenize_mismatched_quotes() {
3288        let sql = String::from("\"foo");
3289
3290        let dialect = GenericDialect {};
3291        let mut tokenizer = Tokenizer::new(&dialect, &sql);
3292        assert_eq!(
3293            tokenizer.tokenize(),
3294            Err(TokenizerError {
3295                message: "Expected close delimiter '\"' before EOF.".to_string(),
3296                location: Location { line: 1, column: 1 },
3297            })
3298        );
3299    }
3300
3301    #[test]
3302    fn tokenize_newlines() {
3303        let sql = String::from("line1\nline2\rline3\r\nline4\r");
3304
3305        let dialect = GenericDialect {};
3306        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3307        let expected = vec![
3308            Token::make_word("line1", None),
3309            Token::Whitespace(Whitespace::Newline),
3310            Token::make_word("line2", None),
3311            Token::Whitespace(Whitespace::Newline),
3312            Token::make_word("line3", None),
3313            Token::Whitespace(Whitespace::Newline),
3314            Token::make_word("line4", None),
3315            Token::Whitespace(Whitespace::Newline),
3316        ];
3317        compare(expected, tokens);
3318    }
3319
3320    #[test]
3321    fn tokenize_mssql_top() {
3322        let sql = "SELECT TOP 5 [bar] FROM foo";
3323        let dialect = MsSqlDialect {};
3324        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3325        let expected = vec![
3326            Token::make_keyword("SELECT"),
3327            Token::Whitespace(Whitespace::Space),
3328            Token::make_keyword("TOP"),
3329            Token::Whitespace(Whitespace::Space),
3330            Token::Number(String::from("5"), false),
3331            Token::Whitespace(Whitespace::Space),
3332            Token::make_word("bar", Some('[')),
3333            Token::Whitespace(Whitespace::Space),
3334            Token::make_keyword("FROM"),
3335            Token::Whitespace(Whitespace::Space),
3336            Token::make_word("foo", None),
3337        ];
3338        compare(expected, tokens);
3339    }
3340
3341    #[test]
3342    fn tokenize_pg_regex_match() {
3343        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
3344        let dialect = GenericDialect {};
3345        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3346        let expected = vec![
3347            Token::make_keyword("SELECT"),
3348            Token::Whitespace(Whitespace::Space),
3349            Token::make_word("col", None),
3350            Token::Whitespace(Whitespace::Space),
3351            Token::Tilde,
3352            Token::Whitespace(Whitespace::Space),
3353            Token::SingleQuotedString("^a".into()),
3354            Token::Comma,
3355            Token::Whitespace(Whitespace::Space),
3356            Token::make_word("col", None),
3357            Token::Whitespace(Whitespace::Space),
3358            Token::TildeAsterisk,
3359            Token::Whitespace(Whitespace::Space),
3360            Token::SingleQuotedString("^a".into()),
3361            Token::Comma,
3362            Token::Whitespace(Whitespace::Space),
3363            Token::make_word("col", None),
3364            Token::Whitespace(Whitespace::Space),
3365            Token::ExclamationMarkTilde,
3366            Token::Whitespace(Whitespace::Space),
3367            Token::SingleQuotedString("^a".into()),
3368            Token::Comma,
3369            Token::Whitespace(Whitespace::Space),
3370            Token::make_word("col", None),
3371            Token::Whitespace(Whitespace::Space),
3372            Token::ExclamationMarkTildeAsterisk,
3373            Token::Whitespace(Whitespace::Space),
3374            Token::SingleQuotedString("^a".into()),
3375        ];
3376        compare(expected, tokens);
3377    }
3378
3379    #[test]
3380    fn tokenize_pg_like_match() {
3381        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
3382        let dialect = GenericDialect {};
3383        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3384        let expected = vec![
3385            Token::make_keyword("SELECT"),
3386            Token::Whitespace(Whitespace::Space),
3387            Token::make_word("col", None),
3388            Token::Whitespace(Whitespace::Space),
3389            Token::DoubleTilde,
3390            Token::Whitespace(Whitespace::Space),
3391            Token::SingleQuotedString("_a%".into()),
3392            Token::Comma,
3393            Token::Whitespace(Whitespace::Space),
3394            Token::make_word("col", None),
3395            Token::Whitespace(Whitespace::Space),
3396            Token::DoubleTildeAsterisk,
3397            Token::Whitespace(Whitespace::Space),
3398            Token::SingleQuotedString("_a%".into()),
3399            Token::Comma,
3400            Token::Whitespace(Whitespace::Space),
3401            Token::make_word("col", None),
3402            Token::Whitespace(Whitespace::Space),
3403            Token::ExclamationMarkDoubleTilde,
3404            Token::Whitespace(Whitespace::Space),
3405            Token::SingleQuotedString("_a%".into()),
3406            Token::Comma,
3407            Token::Whitespace(Whitespace::Space),
3408            Token::make_word("col", None),
3409            Token::Whitespace(Whitespace::Space),
3410            Token::ExclamationMarkDoubleTildeAsterisk,
3411            Token::Whitespace(Whitespace::Space),
3412            Token::SingleQuotedString("_a%".into()),
3413        ];
3414        compare(expected, tokens);
3415    }
3416
3417    #[test]
3418    fn tokenize_quoted_identifier() {
3419        let sql = r#" "a "" b" "a """ "c """"" "#;
3420        let dialect = GenericDialect {};
3421        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3422        let expected = vec![
3423            Token::Whitespace(Whitespace::Space),
3424            Token::make_word(r#"a " b"#, Some('"')),
3425            Token::Whitespace(Whitespace::Space),
3426            Token::make_word(r#"a ""#, Some('"')),
3427            Token::Whitespace(Whitespace::Space),
3428            Token::make_word(r#"c """#, Some('"')),
3429            Token::Whitespace(Whitespace::Space),
3430        ];
3431        compare(expected, tokens);
3432    }
3433
3434    #[test]
3435    fn tokenize_snowflake_div() {
3436        let sql = r#"field/1000"#;
3437        let dialect = SnowflakeDialect {};
3438        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3439        let expected = vec![
3440            Token::make_word(r#"field"#, None),
3441            Token::Div,
3442            Token::Number("1000".to_string(), false),
3443        ];
3444        compare(expected, tokens);
3445    }
3446
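    // Note: compared to tokenize_quoted_identifier above, calling with_unescape(false) leaves
    // the doubled `""` sequences inside delimited identifiers untouched instead of collapsing
    // them to a single `"`.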
3447    #[test]
3448    fn tokenize_quoted_identifier_with_no_escape() {
3449        let sql = r#" "a "" b" "a """ "c """"" "#;
3450        let dialect = GenericDialect {};
3451        let tokens = Tokenizer::new(&dialect, sql)
3452            .with_unescape(false)
3453            .tokenize()
3454            .unwrap();
3455        let expected = vec![
3456            Token::Whitespace(Whitespace::Space),
3457            Token::make_word(r#"a "" b"#, Some('"')),
3458            Token::Whitespace(Whitespace::Space),
3459            Token::make_word(r#"a """#, Some('"')),
3460            Token::Whitespace(Whitespace::Space),
3461            Token::make_word(r#"c """""#, Some('"')),
3462            Token::Whitespace(Whitespace::Space),
3463        ];
3464        compare(expected, tokens);
3465    }
3466
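    // Note: tokenize_with_location attaches a (line, column) span to each token. As the
    // expectations below show, columns are 1-based, the end location points one past the last
    // character, and a newline advances the span to column 1 of the next line.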
3467    #[test]
3468    fn tokenize_with_location() {
3469        let sql = "SELECT a,\n b";
3470        let dialect = GenericDialect {};
3471        let tokens = Tokenizer::new(&dialect, sql)
3472            .tokenize_with_location()
3473            .unwrap();
3474        let expected = vec![
3475            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
3476            TokenWithSpan::at(
3477                Token::Whitespace(Whitespace::Space),
3478                (1, 7).into(),
3479                (1, 8).into(),
3480            ),
3481            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
3482            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
3483            TokenWithSpan::at(
3484                Token::Whitespace(Whitespace::Newline),
3485                (1, 10).into(),
3486                (2, 1).into(),
3487            ),
3488            TokenWithSpan::at(
3489                Token::Whitespace(Whitespace::Space),
3490                (2, 1).into(),
3491                (2, 2).into(),
3492            ),
3493            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
3494        ];
3495        compare(expected, tokens);
3496    }
3497
3498    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
3499        //println!("------------------------------");
3500        //println!("tokens   = {:?}", actual);
3501        //println!("expected = {:?}", expected);
3502        //println!("------------------------------");
3503        assert_eq!(expected, actual);
3504    }
3505
3506    fn check_unescape(s: &str, expected: Option<&str>) {
3507        let s = format!("'{s}'");
3508        let mut state = State {
3509            peekable: s.chars().peekable(),
3510            line: 0,
3511            col: 0,
3512        };
3513
3514        assert_eq!(
3515            unescape_single_quoted_string(&mut state),
3516            expected.map(|s| s.to_string())
3517        );
3518    }
3519
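    // Note: check_unescape above wraps the input in single quotes and feeds it to
    // unescape_single_quoted_string; an expected value of None means the escape sequence is
    // rejected and the whole literal fails to unescape.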
3520    #[test]
3521    fn test_unescape() {
3522        check_unescape(r"\b", Some("\u{0008}"));
3523        check_unescape(r"\f", Some("\u{000C}"));
3524        check_unescape(r"\t", Some("\t"));
3525        check_unescape(r"\r\n", Some("\r\n"));
3526        check_unescape(r"\/", Some("/"));
3527        check_unescape(r"/", Some("/"));
3528        check_unescape(r"\\", Some("\\"));
3529
3530        // 16-bit (\u) and 32-bit (\U) hexadecimal Unicode character values
3531        check_unescape(r"\u0001", Some("\u{0001}"));
3532        check_unescape(r"\u4c91", Some("\u{4c91}"));
3533        check_unescape(r"\u4c916", Some("\u{4c91}6"));
3534        check_unescape(r"\u4c", None);
3535        check_unescape(r"\u0000", None);
3536        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
3537        check_unescape(r"\U00110000", None);
3538        check_unescape(r"\U00000000", None);
3539        check_unescape(r"\u", None);
3540        check_unescape(r"\U", None);
3541        check_unescape(r"\U1010FFFF", None);
3542
3543        // hexadecimal byte value
3544        check_unescape(r"\x4B", Some("\u{004b}"));
3545        check_unescape(r"\x4", Some("\u{0004}"));
3546        check_unescape(r"\x4L", Some("\u{0004}L"));
3547        check_unescape(r"\x", Some("x"));
3548        check_unescape(r"\xP", Some("xP"));
3549        check_unescape(r"\x0", None);
3550        check_unescape(r"\xCAD", None);
3551        check_unescape(r"\xA9", None);
3552
3553        // octal byte value
3554        check_unescape(r"\1", Some("\u{0001}"));
3555        check_unescape(r"\12", Some("\u{000a}"));
3556        check_unescape(r"\123", Some("\u{0053}"));
3557        check_unescape(r"\1232", Some("\u{0053}2"));
3558        check_unescape(r"\4", Some("\u{0004}"));
3559        check_unescape(r"\45", Some("\u{0025}"));
3560        check_unescape(r"\450", Some("\u{0028}"));
3561        check_unescape(r"\603", None);
3562        check_unescape(r"\0", None);
3563        check_unescape(r"\080", None);
3564
3565        // others
3566        check_unescape(r"\9", Some("9"));
3567        check_unescape(r"''", Some("'"));
3568        check_unescape(
3569            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
3570            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
3571        );
3572        check_unescape(r"Hello\0", None);
3573        check_unescape(r"Hello\xCADRust", None);
3574    }
3575
3576    #[test]
3577    fn tokenize_numeric_prefix_trait() {
3578        #[derive(Debug)]
3579        struct NumericPrefixDialect;
3580
3581        impl Dialect for NumericPrefixDialect {
3582            fn is_identifier_start(&self, ch: char) -> bool {
3583                ch.is_ascii_lowercase()
3584                    || ch.is_ascii_uppercase()
3585                    || ch.is_ascii_digit()
3586                    || ch == '$'
3587            }
3588
3589            fn is_identifier_part(&self, ch: char) -> bool {
3590                ch.is_ascii_lowercase()
3591                    || ch.is_ascii_uppercase()
3592                    || ch.is_ascii_digit()
3593                    || ch == '_'
3594                    || ch == '$'
3595                    || ch == '{'
3596                    || ch == '}'
3597            }
3598
3599            fn supports_numeric_prefix(&self) -> bool {
3600                true
3601            }
3602        }
3603
3604        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
3605        tokenize_numeric_prefix_inner(&HiveDialect {});
3606        tokenize_numeric_prefix_inner(&MySqlDialect {});
3607    }
3608
3609    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
3610        let sql = r#"SELECT * FROM 1"#;
3611        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
3612        let expected = vec![
3613            Token::make_keyword("SELECT"),
3614            Token::Whitespace(Whitespace::Space),
3615            Token::Mul,
3616            Token::Whitespace(Whitespace::Space),
3617            Token::make_keyword("FROM"),
3618            Token::Whitespace(Whitespace::Space),
3619            Token::Number(String::from("1"), false),
3620        ];
3621        compare(expected, tokens);
3622    }
3623
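    // Note: each case below is tokenized twice with the Snowflake dialect: with_unescape(false)
    // keeps the backslash escapes verbatim in the SingleQuotedString token, while
    // with_unescape(true) decodes them (e.g. `\'` to `'` and `\\` to `\`).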
3624    #[test]
3625    fn tokenize_quoted_string_escape() {
3626        let dialect = SnowflakeDialect {};
3627        for (sql, expected, expected_unescaped) in [
3628            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
3629            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
3630            (r#"'\\'"#, r#"\\"#, r#"\"#),
3631            (
3632                r#"'\0\a\b\f\n\r\t\Z'"#,
3633                r#"\0\a\b\f\n\r\t\Z"#,
3634                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
3635            ),
3636            (r#"'\"'"#, r#"\""#, "\""),
3637            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
3638            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
3639            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
3640            (r#"'\q'"#, r#"\q"#, r#"q"#),
3641            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
3642            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
3643        ] {
3644            let tokens = Tokenizer::new(&dialect, sql)
3645                .with_unescape(false)
3646                .tokenize()
3647                .unwrap();
3648            let expected = vec![Token::SingleQuotedString(expected.to_string())];
3649            compare(expected, tokens);
3650
3651            let tokens = Tokenizer::new(&dialect, sql)
3652                .with_unescape(true)
3653                .tokenize()
3654                .unwrap();
3655            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
3656            compare(expected, tokens);
3657        }
3658
3659        for sql in [r#"'\'"#, r#"'ab\'"#] {
3660            let mut tokenizer = Tokenizer::new(&dialect, sql);
3661            assert_eq!(
3662                "Unterminated string literal",
3663                tokenizer.tokenize().unwrap_err().message.as_str(),
3664            );
3665        }
3666
3667        // Dialect that does not support backslash escapes
3668        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
3669            let dialect = GenericDialect {};
3670            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3671
3672            let expected = vec![Token::SingleQuotedString(expected.to_string())];
3673
3674            compare(expected, tokens);
3675        }
3676
3677        // MySQL special case for LIKE escapes
3678        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
3679            let dialect = MySqlDialect {};
3680            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3681
3682            let expected = vec![Token::SingleQuotedString(expected.to_string())];
3683
3684            compare(expected, tokens);
3685        }
3686    }
3687
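    // Note: the check helper below runs the same BigQuery triple-quote cases for both quote
    // characters: `q` is the quote under test, `r` is the other one, and quote_token selects the
    // matching TripleSingleQuotedString / TripleDoubleQuotedString constructor.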
3688    #[test]
3689    fn tokenize_triple_quoted_string() {
3690        fn check<F>(
3691            q: char, // The quote character to test.
3692            r: char, // An alternate quote character.
3693            quote_token: F,
3694        ) where
3695            F: Fn(String) -> Token,
3696        {
3697            let dialect = BigQueryDialect {};
3698
3699            for (sql, expected, expected_unescaped) in [
3700                // Empty string
3701                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
3702                // Should not count escaped quote as end of string.
3703                (
3704                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
3705                    format!(r#"ab{q}{q}\{q}{q}cd"#),
3706                    format!(r#"ab{q}{q}{q}{q}cd"#),
3707                ),
3708                // Simple string
3709                (
3710                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
3711                    "abc".into(),
3712                    "abc".into(),
3713                ),
3714                // Unescaped occurrences of the alternate quote character inside the string.
3715                (
3716                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
3717                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
3718                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
3719                ),
3720                // Escaped quote.
3721                (
3722                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
3723                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
3724                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
3725                ),
3726                // backslash-escaped quote characters.
3727                (
3728                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
3729                    r#"a\'\'b\'c\'d"#.into(),
3730                    r#"a''b'c'd"#.into(),
3731                ),
3732                // backslash-escaped characters
3733                (
3734                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
3735                    r#"abc\0\n\rdef"#.into(),
3736                    "abc\0\n\rdef".into(),
3737                ),
3738            ] {
3739                let tokens = Tokenizer::new(&dialect, sql.as_str())
3740                    .with_unescape(false)
3741                    .tokenize()
3742                    .unwrap();
3743                let expected = vec![quote_token(expected.to_string())];
3744                compare(expected, tokens);
3745
3746                let tokens = Tokenizer::new(&dialect, sql.as_str())
3747                    .with_unescape(true)
3748                    .tokenize()
3749                    .unwrap();
3750                let expected = vec![quote_token(expected_unescaped.to_string())];
3751                compare(expected, tokens);
3752            }
3753
3754            for sql in [
3755                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
3756                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
3757                format!(r#"{q}{q}{q}{q}"#),
3758                format!(r#"{q}{q}{q}{r}{r}"#),
3759                format!(r#"{q}{q}{q}abc{q}"#),
3760                format!(r#"{q}{q}{q}abc{q}{q}"#),
3761                format!(r#"{q}{q}{q}abc"#),
3762            ] {
3763                let dialect = BigQueryDialect {};
3764                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
3765                assert_eq!(
3766                    "Unterminated string literal",
3767                    tokenizer.tokenize().unwrap_err().message.as_str(),
3768                );
3769            }
3770        }
3771
3772        check('"', '\'', Token::TripleDoubleQuotedString);
3773
3774        check('\'', '"', Token::TripleSingleQuotedString);
3775
3776        let dialect = BigQueryDialect {};
3777
3778        let sql = r#"""''"#;
3779        let tokens = Tokenizer::new(&dialect, sql)
3780            .with_unescape(true)
3781            .tokenize()
3782            .unwrap();
3783        let expected = vec![
3784            Token::DoubleQuotedString("".to_string()),
3785            Token::SingleQuotedString("".to_string()),
3786        ];
3787        compare(expected, tokens);
3788
3789        let sql = r#"''"""#;
3790        let tokens = Tokenizer::new(&dialect, sql)
3791            .with_unescape(true)
3792            .tokenize()
3793            .unwrap();
3794        let expected = vec![
3795            Token::SingleQuotedString("".to_string()),
3796            Token::DoubleQuotedString("".to_string()),
3797        ];
3798        compare(expected, tokens);
3799
3800        // Dialect without triple-quoted string support
3801        let dialect = SnowflakeDialect {};
3802        let sql = r#"''''''"#;
3803        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3804        let expected = vec![Token::SingleQuotedString("''".to_string())];
3805        compare(expected, tokens);
3806    }
3807
3808    #[test]
3809    fn test_mysql_users_grantees() {
3810        let dialect = MySqlDialect {};
3811
3812        let sql = "CREATE USER `root`@`%`";
3813        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3814        let expected = vec![
3815            Token::make_keyword("CREATE"),
3816            Token::Whitespace(Whitespace::Space),
3817            Token::make_keyword("USER"),
3818            Token::Whitespace(Whitespace::Space),
3819            Token::make_word("root", Some('`')),
3820            Token::AtSign,
3821            Token::make_word("%", Some('`')),
3822        ];
3823        compare(expected, tokens);
3824    }
3825
3826    #[test]
3827    fn test_postgres_abs_without_space_and_string_literal() {
3828        let dialect = MySqlDialect {};
3829
3830        let sql = "SELECT @'1'";
3831        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3832        let expected = vec![
3833            Token::make_keyword("SELECT"),
3834            Token::Whitespace(Whitespace::Space),
3835            Token::AtSign,
3836            Token::SingleQuotedString("1".to_string()),
3837        ];
3838        compare(expected, tokens);
3839    }
3840
3841    #[test]
3842    fn test_postgres_abs_without_space_and_quoted_column() {
3843        let dialect = MySqlDialect {};
3844
3845        let sql = r#"SELECT @"bar" FROM foo"#;
3846        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3847        let expected = vec![
3848            Token::make_keyword("SELECT"),
3849            Token::Whitespace(Whitespace::Space),
3850            Token::AtSign,
3851            Token::DoubleQuotedString("bar".to_string()),
3852            Token::Whitespace(Whitespace::Space),
3853            Token::make_keyword("FROM"),
3854            Token::Whitespace(Whitespace::Space),
3855            Token::make_word("foo", None),
3856        ];
3857        compare(expected, tokens);
3858    }
3859
3860    #[test]
3861    fn test_national_strings_backslash_escape_not_supported() {
3862        all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
3863            .tokenizes_to(
3864                "select n'''''\\'",
3865                vec![
3866                    Token::make_keyword("select"),
3867                    Token::Whitespace(Whitespace::Space),
3868                    Token::NationalStringLiteral("''\\".to_string()),
3869                ],
3870            );
3871    }
3872
3873    #[test]
3874    fn test_national_strings_backslash_escape_supported() {
3875        all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
3876            .tokenizes_to(
3877                "select n'''''\\''",
3878                vec![
3879                    Token::make_keyword("select"),
3880                    Token::Whitespace(Whitespace::Space),
3881                    Token::NationalStringLiteral("'''".to_string()),
3882                ],
3883            );
3884    }
3885
3886    #[test]
3887    fn test_string_escape_constant_not_supported() {
3888        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
3889            "select e'...'",
3890            vec![
3891                Token::make_keyword("select"),
3892                Token::Whitespace(Whitespace::Space),
3893                Token::make_word("e", None),
3894                Token::SingleQuotedString("...".to_string()),
3895            ],
3896        );
3897
3898        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
3899            "select E'...'",
3900            vec![
3901                Token::make_keyword("select"),
3902                Token::Whitespace(Whitespace::Space),
3903                Token::make_word("E", None),
3904                Token::SingleQuotedString("...".to_string()),
3905            ],
3906        );
3907    }
3908
3909    #[test]
3910    fn test_string_escape_constant_supported() {
3911        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
3912            "select e'\\''",
3913            vec![
3914                Token::make_keyword("select"),
3915                Token::Whitespace(Whitespace::Space),
3916                Token::EscapedStringLiteral("'".to_string()),
3917            ],
3918        );
3919
3920        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
3921            "select E'\\''",
3922            vec![
3923                Token::make_keyword("select"),
3924                Token::Whitespace(Whitespace::Space),
3925                Token::EscapedStringLiteral("'".to_string()),
3926            ],
3927        );
3928    }
3929
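    // Note: for dialects where requires_single_line_comment_whitespace() is true, `--` only
    // starts a comment when followed by whitespace; otherwise (including at end of input) the
    // two dashes tokenize as two Minus tokens, as the cases below show.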
3930    #[test]
3931    fn test_whitespace_required_after_single_line_comment() {
3932        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
3933            .tokenizes_to(
3934                "SELECT --'abc'",
3935                vec![
3936                    Token::make_keyword("SELECT"),
3937                    Token::Whitespace(Whitespace::Space),
3938                    Token::Minus,
3939                    Token::Minus,
3940                    Token::SingleQuotedString("abc".to_string()),
3941                ],
3942            );
3943
3944        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
3945            .tokenizes_to(
3946                "SELECT -- 'abc'",
3947                vec![
3948                    Token::make_keyword("SELECT"),
3949                    Token::Whitespace(Whitespace::Space),
3950                    Token::Whitespace(Whitespace::SingleLineComment {
3951                        prefix: "--".to_string(),
3952                        comment: " 'abc'".to_string(),
3953                    }),
3954                ],
3955            );
3956
3957        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
3958            .tokenizes_to(
3959                "SELECT --",
3960                vec![
3961                    Token::make_keyword("SELECT"),
3962                    Token::Whitespace(Whitespace::Space),
3963                    Token::Minus,
3964                    Token::Minus,
3965                ],
3966            );
3967    }
3968
    #[test]
    fn test_whitespace_not_required_after_single_line_comment() {
        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "".to_string(),
                    }),
                ],
            );
    }

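    // With numeric-prefix support, tokens such as `123abc` and `1two3` are identifiers,
    // while `12e34` on its own is still a number; following a period it becomes a word.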
    #[test]
    fn test_tokenize_identifiers_numeric_prefix() {
        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        );
    }

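    // `table._col` tokenizes as word, period, word, while a bare `._123` or `._abc`
    // after the keyword is rejected by the tokenizer.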
    #[test]
    fn tokenize_period_underscore() {
        let sql = String::from("SELECT table._col");
        // a dialect that supports underscores in numeric literals
        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "table".to_string(),
                quote_style: None,
                keyword: Keyword::TABLE,
            }),
            Token::Period,
            Token::Word(Word {
                value: "_col".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
        ];

        compare(expected, tokens);

        let sql = String::from("SELECT ._123");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }

        let sql = String::from("SELECT ._abc");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }
    }
}