#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::iter::Peekable;
use core::num::NonZeroU8;
use core::str::Chars;
use core::{cmp, fmt};

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};

use crate::dialect::Dialect;
use crate::dialect::{
    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
    SnowflakeDialect,
};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use crate::{ast::DollarQuotedString, dialect::HiveDialect};

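/// A token produced by the SQL [`Tokenizer`].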
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    EOF,
    Word(Word),
    Number(String, bool),
    Char(char),
    SingleQuotedString(String),
    DoubleQuotedString(String),
    TripleSingleQuotedString(String),
    TripleDoubleQuotedString(String),
    DollarQuotedString(DollarQuotedString),
    SingleQuotedByteStringLiteral(String),
    DoubleQuotedByteStringLiteral(String),
    TripleSingleQuotedByteStringLiteral(String),
    TripleDoubleQuotedByteStringLiteral(String),
    SingleQuotedRawStringLiteral(String),
    DoubleQuotedRawStringLiteral(String),
    TripleSingleQuotedRawStringLiteral(String),
    TripleDoubleQuotedRawStringLiteral(String),
    NationalStringLiteral(String),
    EscapedStringLiteral(String),
    UnicodeStringLiteral(String),
    HexStringLiteral(String),
    Comma,
    Whitespace(Whitespace),
    DoubleEq,
    Eq,
    Neq,
    Lt,
    Gt,
    LtEq,
    GtEq,
    Spaceship,
    Plus,
    Minus,
    Mul,
    Div,
    DuckIntDiv,
    Mod,
    StringConcat,
    LParen,
    RParen,
    Period,
    Colon,
    DoubleColon,
    Assignment,
    SemiColon,
    Backslash,
    LBracket,
    RBracket,
    Ampersand,
    Pipe,
    Caret,
    LBrace,
    RBrace,
    RArrow,
    Sharp,
    DoubleSharp,
    Tilde,
    TildeAsterisk,
    ExclamationMarkTilde,
    ExclamationMarkTildeAsterisk,
    DoubleTilde,
    DoubleTildeAsterisk,
    ExclamationMarkDoubleTilde,
    ExclamationMarkDoubleTildeAsterisk,
    ShiftLeft,
    ShiftRight,
    Overlap,
    ExclamationMark,
    DoubleExclamationMark,
    AtSign,
    CaretAt,
    PGSquareRoot,
    PGCubeRoot,
    Placeholder(String),
    Arrow,
    LongArrow,
    HashArrow,
    AtDashAt,
    QuestionMarkDash,
    AmpersandLeftAngleBracket,
    AmpersandRightAngleBracket,
    AmpersandLeftAngleBracketVerticalBar,
    VerticalBarAmpersandRightAngleBracket,
    TwoWayArrow,
    LeftAngleBracketCaret,
    RightAngleBracketCaret,
    QuestionMarkSharp,
    QuestionMarkDashVerticalBar,
    QuestionMarkDoubleVerticalBar,
    TildeEqual,
    ShiftLeftVerticalBar,
    VerticalBarShiftRight,
    VerticalBarRightAngleBracket,
    HashLongArrow,
    AtArrow,
    ArrowAt,
    HashMinus,
    AtQuestion,
    AtAt,
    Question,
    QuestionAnd,
    QuestionPipe,
    CustomBinaryOperator(String),
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::Assignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::DoubleSharp => f.write_str("##"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::AtDashAt => f.write_str("@-@"),
            Token::QuestionMarkDash => f.write_str("?-"),
            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
            Token::AmpersandRightAngleBracket => f.write_str("&>"),
            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
            Token::VerticalBarRightAngleBracket => f.write_str("|>"),
            Token::TwoWayArrow => f.write_str("<->"),
            Token::LeftAngleBracketCaret => f.write_str("<^"),
            Token::RightAngleBracketCaret => f.write_str(">^"),
            Token::QuestionMarkSharp => f.write_str("?#"),
            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
            Token::TildeEqual => f.write_str("~="),
            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
            Token::VerticalBarShiftRight => f.write_str("|>>"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
            Token::Question => write!(f, "?"),
            Token::QuestionAnd => write!(f, "?&"),
            Token::QuestionPipe => write!(f, "?|"),
            Token::CustomBinaryOperator(s) => f.write_str(s),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

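    /// Makes a [`Token::Word`] from `word` and an optional `quote_style`.
    ///
    /// Only unquoted words are matched against the keyword list; a quoted
    /// word always gets [`Keyword::NoKeyword`]. An illustrative sketch of
    /// the expected behavior (not from the original docs):
    ///
    /// ```text
    /// Token::make_word("select", None)      // keyword: Keyword::SELECT
    /// Token::make_word("select", Some('"')) // keyword: Keyword::NoKeyword
    /// ```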
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The word's value, without the enclosing quotes and with escape
    /// sequences (if any) processed
    pub value: String,
    /// The quote character, if the identifier was quoted
    pub quote_style: Option<char>,
    /// The matched keyword, or [`Keyword::NoKeyword`] if the word was quoted
    /// or did not match any known keyword
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
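    /// Returns the closing quote character matching an opening quote:
    /// `[` closes with `]`, while `"` and `` ` `` close with themselves.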
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"',
            '[' => ']',
            '`' => '`',
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}

#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1
    pub line: u64,
    /// Line column, starting from 1
    pub column: u64,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        if self.line == 0 {
            return Ok(());
        }
        write!(f, " at Line: {}, Column: {}", self.line, self.column)
    }
}

impl fmt::Debug for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Location({},{})", self.line, self.column)
    }
}

impl Location {
    /// Return an "empty" / unknown location
    pub fn empty() -> Self {
        Self { line: 0, column: 0 }
    }

    /// Create a new `Location` for a given line and column
    pub fn new(line: u64, column: u64) -> Self {
        Self { line, column }
    }

    /// Create a new location for a given line and column.
    /// Alias for [`Self::new`]
    pub fn of(line: u64, column: u64) -> Self {
        Self::new(line, column)
    }

    /// Combine self and `end` into a new `Span`
    pub fn span_to(self, end: Self) -> Span {
        Span { start: self, end }
    }
}

impl From<(u64, u64)> for Location {
    fn from((line, column): (u64, u64)) -> Self {
        Self { line, column }
    }
}

#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    pub start: Location,
    pub end: Location,
}

impl fmt::Debug for Span {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Span({:?}..{:?})", self.start, self.end)
    }
}

impl Span {
    // An empty span (0, 0) -> (0, 0). A constant is needed (rather than the
    // `empty()` function) so it can be used in `match` patterns.
    const EMPTY: Span = Self::empty();

    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    /// Returns an empty span `(0, 0) -> (0, 0)`, representing e.g. elements
    /// added during parsing that have no true location in the source.
    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }

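    /// Returns the smallest [`Span`] that contains both `self` and `other`.
    /// If either span is empty, the other one is returned unchanged. An
    /// illustrative sketch:
    ///
    /// ```text
    /// // (1,1)..(1,5) union (2,3)..(2,9)  =>  (1,1)..(2,9)
    /// ```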
    pub fn union(&self, other: &Span) -> Span {
        match (self, other) {
            (&Span::EMPTY, _) => *other,
            (_, &Span::EMPTY) => *self,
            _ => Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            },
        }
    }

    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        match other {
            Some(other) => self.union(other),
            None => *self,
        }
    }

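    /// Returns the smallest [`Span`] containing every span yielded by the
    /// iterator, or [`Span::empty()`] if the iterator yields nothing.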
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        iter.into_iter()
            .reduce(|acc, item| acc.union(&item))
            .unwrap_or(Span::empty())
    }
}

#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;

/// A [`Token`] together with the [`Span`] it was found at in the source text
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    pub token: Token,
    pub span: Span,
}

impl TokenWithSpan {
    /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`]
    pub fn new(token: Token, span: Span) -> Self {
        Self { token, span }
    }

    /// Wrap a [`Token`] with an empty [`Span`]
    pub fn wrap(token: Token) -> Self {
        Self::new(token, Span::empty())
    }

    /// Wrap a [`Token`] with a [`Span`] from `start` to `end`
    pub fn at(token: Token, start: Location, end: Location) -> Self {
        Self::new(token, Span::new(start, end))
    }

    /// Return an EOF token with an empty [`Span`]
    pub fn new_eof() -> Self {
        Self::wrap(Token::EOF)
    }
}

impl PartialEq<Token> for TokenWithSpan {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithSpan> for Token {
    fn eq(&self, other: &TokenWithSpan) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithSpan {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

/// Tokenizer error
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}{}", self.message, self.location)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}

impl State<'_> {
    /// return the next character and advance the stream
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    /// return the next character but do not advance the stream
    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}

/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// e.g. `'abc'`, delimited by a single quote character
    One,
    /// e.g. `'''abc'''`, delimited by several quote characters
    Many(NonZeroU8),
}

/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
    /// The character used to quote the string
    quote_style: char,
    /// How many quote characters delimit the string
    num_quote_chars: NumStringQuoteChars,
    /// How many opening quote characters remain to be consumed before the
    /// string body starts (some may already have been consumed by the caller)
    num_opening_quotes_to_consume: u8,
    /// Whether backslash escape sequences are recognized inside the string
    backslash_escape: bool,
}

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    /// If true (the default), the tokenizer unescapes literal values
    /// (see [`Tokenizer::with_unescape`])
    unescape: bool,
}

impl<'a> Tokenizer<'a> {
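    /// Creates a new SQL tokenizer for the given dialect and statement.
    ///
    /// A minimal usage sketch (module paths assumed from this crate's
    /// public API):
    ///
    /// ```text
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 1").tokenize().unwrap();
    /// assert_eq!(tokens[0], Token::make_keyword("SELECT"));
    /// ```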
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }

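    /// Sets whether the tokenizer unescapes quoted values (true by default).
    /// When disabled, escape sequences and doubled quotes are preserved
    /// verbatim in the token value. An illustrative sketch:
    ///
    /// ```text
    /// // unescape = true (default): 'a''b'  =>  SingleQuotedString("a'b")
    /// // unescape = false:          'a''b'  =>  SingleQuotedString("a''b")
    /// ```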
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }

    /// Tokenize the statement and produce a vector of tokens
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;
        Ok(twl.into_iter().map(|t| t.token).collect())
    }

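    /// Tokenizes the statement and produces a vector of tokens with their
    /// source locations. Each token carries the [`Span`] it occupies, with
    /// 1-based line and column numbers.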
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
        let mut tokens: Vec<TokenWithSpan> = vec![];
        self.tokenize_with_location_into_buf(&mut tokens)
            .map(|_| tokens)
    }

    /// Tokenize the statement and produce tokens with locations, appending
    /// into the provided buffer instead of allocating a new one.
    pub fn tokenize_with_location_into_buf(
        &mut self,
        buf: &mut Vec<TokenWithSpan>,
    ) -> Result<(), TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let span = location.span_to(state.location());

            buf.push(TokenWithSpan { token, span });

            location = state.location();
        }
        Ok(())
    }

    // Tokenize the identifier or keyword in `ch`
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }

    /// Get the next token or return None
    fn next_token(
        &self,
        chars: &mut State,
        prev_token: Option<&Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Treat \r\n as a single newline token
                    chars.next(); // consume the '\r'
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
                {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '\'',
                                        false,
                                        Token::SingleQuotedByteStringLiteral,
                                        Token::TripleSingleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '"',
                                        false,
                                        Token::DoubleQuotedByteStringLiteral,
                                        Token::TripleDoubleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "b" or "B"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                false,
                                Token::SingleQuotedRawStringLiteral,
                                Token::TripleSingleQuotedRawStringLiteral,
                            ),
                        Some('\"') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                false,
                                Token::DoubleQuotedRawStringLiteral,
                                Token::TripleDoubleQuotedRawStringLiteral,
                            ),
                        _ => {
                            // regular identifier starting with "r" or "R"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a national character string literal
                            let backslash_escape =
                                self.dialect.supports_string_literal_backslash_escape();
                            let s =
                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "N"
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
                    let starting_loc = chars.location();
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // E'...' - a string literal with escapes
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "e" or "E"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
                    chars.next(); // consume, to check the next char
                    if chars.peek() == Some(&'&') {
                        // We cannot advance the main iterator here, as the '&' must
                        // only be consumed if a quote follows it.
                        let mut chars_clone = chars.peekable.clone();
                        chars_clone.next(); // consume the '&' in the clone
                        if chars_clone.peek() == Some(&'\'') {
                            chars.next(); // consume the '&' in the original iterator
                            let s = unescape_unicode_single_quoted_string(chars)?;
                            return Ok(Some(Token::UnicodeStringLiteral(s)));
                        }
                    }
                    let s = self.tokenize_word(x, chars);
                    Ok(Some(Token::make_word(&s, None)))
                }
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a hex string literal
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with "x" or "X"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                '\'' => {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::SingleQuotedString,
                                Token::TripleSingleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '\'',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::DoubleQuotedString,
                                Token::TripleDoubleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '"',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if self.dialect.is_delimited_identifier_start(ch) => {
                    let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                    Ok(Some(Token::make_word(&word, Some(quote_start))))
                }
                // potentially nested delimited (quoted) identifier
                quote_start
                    if self
                        .dialect
                        .is_nested_delimited_identifier_start(quote_start)
                        && self
                            .dialect
                            .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                            .is_some() =>
                {
                    let Some((quote_start, nested_quote_start)) = self
                        .dialect
                        .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                    else {
                        return self.tokenizer_error(
                            chars.location(),
                            format!("Expected nested delimiter '{quote_start}' before EOF."),
                        );
                    };

                    let Some(nested_quote_start) = nested_quote_start else {
                        let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                        return Ok(Some(Token::make_word(&word, Some(quote_start))));
                    };

                    let mut word = vec![];
                    let quote_end = Word::matching_end_quote(quote_start);
                    let nested_quote_end = Word::matching_end_quote(nested_quote_start);
                    let error_loc = chars.location();

                    chars.next(); // skip the first delimiter
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&nested_quote_start) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
                        );
                    }
                    word.push(nested_quote_start.into());
                    word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
                    word.push(nested_quote_end.into());
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&quote_end) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        );
                    }
                    chars.next(); // skip the close delimiter
                    Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
                }
                // numbers and period
                '0'..='9' | '.' => {
                    // Special case: if `._` is encountered after a word, the word is a
                    // table name and `_` starts the column name.
                    if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
                        if let Some(Token::Word(_)) = prev_token {
                            chars.next();
                            return Ok(Some(Token::Period));
                        }

                        return self.tokenizer_error(
                            chars.location(),
                            "Unexpected character '_'".to_string(),
                        );
                    }

                    // Some dialects support underscores as numeric separators; each
                    // underscore must be followed by another digit.
                    let is_number_separator = |ch: char, next_char: Option<char>| {
                        self.dialect.supports_numeric_literal_underscores()
                            && ch == '_'
                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
                    };

                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // match binary literal that starts with 0x
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
                        });
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }

                    // If the dialect supports identifiers with a numeric prefix and the
                    // previous token was a Word, a lone '.' is a compound-identifier
                    // separator rather than part of a decimal number.
                    if s == "." && self.dialect.supports_numeric_prefix() {
                        if let Some(Token::Word(_)) = prev_token {
                            return Ok(Some(Token::Period));
                        }
                    }

                    s += &peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // No number -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // Parse exponent as number
                    let mut exponent_part = String::new();
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        // Optional sign
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // Definitely an exponent: bring the original iterator
                            // up to speed and use it.
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            // Not an exponent: discard the work done on the clone.
                            _ => (),
                        }
                    }

                    // If the dialect supports identifiers with a numeric prefix,
                    // check whether this value is in fact an identifier and must
                    // be tokenized as a word.
                    if self.dialect.supports_numeric_prefix() {
                        if exponent_part.is_empty() {
                            // Not a number with an exponent; it may be an
                            // identifier starting with digits.
                            let word =
                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                            if !word.is_empty() {
                                s += word.as_str();
                                return Ok(Some(Token::make_word(s.as_str(), None)));
                            }
                        } else if prev_token == Some(&Token::Period) {
                            // A preceding period means this value is part of a
                            // compound identifier, not a number.
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }

                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'
                    match chars.peek() {
                        Some('-') => {
                            let mut is_comment = true;
                            if self.dialect.requires_single_line_comment_whitespace() {
                                is_comment = Some(' ') == chars.peekable.clone().nth(1);
                            }

                            if is_comment {
                                chars.next(); // consume the second '-'
                                let comment = self.tokenize_single_line_comment(chars);
                                return Ok(Some(Token::Whitespace(
                                    Whitespace::SingleLineComment {
                                        prefix: "--".to_owned(),
                                        comment,
                                    },
                                )));
                            }

                            self.start_binop(chars, "-", Token::Minus)
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
                                _ => self.start_binop(chars, "->", Token::Arrow),
                            }
                        }
                        // a regular '-' operator
                        _ => self.start_binop(chars, "-", Token::Minus),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => {
                    chars.next(); // consume the '%'
                    match chars.peek() {
                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "%", Token::Mod),
                    }
                }
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => {
                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
                                }
                                _ => self.start_binop(chars, "||", Token::StringConcat),
                            }
                        }
                        Some('&') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '&'
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|&>",
                                    Token::VerticalBarAmpersandRightAngleBracket,
                                ),
                                _ => self.start_binop_opt(chars, "|&", None),
                            }
                        }
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '>'
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|>>",
                                    Token::VerticalBarShiftRight,
                                ),
                                _ => self.start_binop_opt(chars, "|>", None),
                            }
                        }
                        Some('>') if self.dialect.supports_pipe_operator() => {
                            self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
                        }
                        // a regular '|' operator
                        _ => self.start_binop(chars, "|", Token::Pipe),
                    }
                }
                '=' => {
                    chars.next(); // consume the '='
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume the '!'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                Some('~') => {
                                    chars.next();
                                    match chars.peek() {
                                        Some('*') => self.consume_and_return(
                                            chars,
                                            Token::ExclamationMarkDoubleTildeAsterisk,
                                        ),
                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
                                    }
                                }
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
                                _ => self.start_binop(chars, "<=", Token::LtEq),
                            }
                        }
                        Some('|') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
                        }
                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('|') => self.consume_for_binop(
                                    chars,
                                    "<<|",
                                    Token::ShiftLeftVerticalBar,
                                ),
                                _ => self.start_binop(chars, "<<", Token::ShiftLeft),
                            }
                        }
                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
                                }
                                _ => self.start_binop_opt(chars, "<-", None),
                            }
                        }
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
                        }
                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
                        _ => self.start_binop(chars, "<", Token::Lt),
                    }
                }
                '>' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
                        }
                        _ => self.start_binop(chars, ">", Token::Gt),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::Assignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => {
                    chars.next(); // consume the '&'
                    match chars.peek() {
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
                        }
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::AmpersandLeftAngleBracketVerticalBar,
                                ),
                                _ => {
                                    self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
                                }
                            }
                        }
                        Some('&') => {
                            chars.next(); // consume the second '&'
                            self.start_binop(chars, "&&", Token::Overlap)
                        }
                        _ => self.start_binop(chars, "&", Token::Ampersand),
                    }
                }
                '^' => {
                    chars.next(); // consume the '^'
                    match chars.peek() {
                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
                        _ => Ok(Some(Token::Caret)),
                    }
                }
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
                {
                    chars.next(); // consume the '#', starting a single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
                        Some('=') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "~=", Token::TildeEqual)
                        }
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => {
                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
                                }
                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
                            }
                        }
                        _ => self.start_binop(chars, "~", Token::Tilde),
                    }
                }
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
                                }
                                _ => self.start_binop(chars, "#>", Token::HashArrow),
                            }
                        }
                        Some(' ') => Ok(Some(Token::Sharp)),
                        Some('#') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "##", Token::DoubleSharp)
                        }
                        Some(sch) if self.dialect.is_identifier_start('#') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "#", Token::Sharp),
                    }
                }
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('@') if self.dialect.supports_geometric_types() => {
                            self.consume_and_return(chars, Token::AtAt)
                        }
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            match chars.peek() {
                                Some('@') => self.consume_and_return(chars, Token::AtDashAt),
                                _ => self.start_binop_opt(chars, "@-", None),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => {
                            chars.next();
                            match chars.peek() {
                                Some(' ') => Ok(Some(Token::AtAt)),
                                Some(tch) if self.dialect.is_identifier_start('@') => {
                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
                                }
                                _ => Ok(Some(Token::AtAt)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::AtSign)),
                        // A quote right after '@' is not part of an identifier:
                        // emit a bare AtSign and let the quoted value be
                        // tokenized on its own.
                        Some('\'') => Ok(Some(Token::AtSign)),
                        Some('\"') => Ok(Some(Token::AtSign)),
                        Some('`') => Ok(Some(Token::AtSign)),
                        Some(sch) if self.dialect.is_identifier_start('@') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                '?' if self.dialect.supports_geometric_types() => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('|') => {
                            chars.next();
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::QuestionMarkDoubleVerticalBar,
                                ),
                                _ => Ok(Some(Token::QuestionPipe)),
                            }
                        }
                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
                        Some('-') => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('|') => self
                                    .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
                                _ => Ok(Some(Token::QuestionMarkDash)),
                            }
                        }
                        Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
                        _ => self.consume_and_return(chars, Token::Question),
                    }
                }
                '?' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
                }

                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    self.tokenize_identifier_or_keyword([ch], chars)
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                // whitespace check (including unicode chars) should be last as it covers some of the chars above
                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    fn consume_for_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        self.start_binop_opt(chars, prefix, Some(default))
    }

    fn start_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        self.start_binop_opt(chars, prefix, Some(default))
    }

    fn start_binop_opt(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Option<Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut custom = None;
        while let Some(&ch) = chars.peek() {
            if !self.dialect.is_custom_operator_part(ch) {
                break;
            }

            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
            chars.next();
        }
        match (custom, default) {
            (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
            (None, Some(tok)) => Ok(Some(tok)),
            (None, None) => self.tokenizer_error(
                chars.location(),
                format!("Expected a valid binary operator after '{prefix}'"),
            ),
        }
    }

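    /// Tokenizes a value that starts with `$`: either a dollar-quoted string
    /// or a placeholder, depending on the dialect. An illustrative sketch of
    /// the inputs this handles:
    ///
    /// ```text
    /// $$hello$$     =>  DollarQuotedString { value: "hello", tag: None }
    /// $fn$body$fn$  =>  DollarQuotedString { value: "body", tag: Some("fn") }
    /// $1            =>  Placeholder("$1")
    /// ```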
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        chars.next();

        // `$$` starts an untagged dollar-quoted string, unless the dialect
        // treats `$` as a placeholder character.
        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric()
                    || ch == '_'
                    // Allow '$' as a placeholder character if the dialect supports it
                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
            }));

            // A following '$' closes the tag of a dollar-quoted string, unless
            // the dialect treats '$' as a placeholder character.
            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
                chars.next();

                let mut temp = String::new();
                let end_delimiter = format!("${value}$");

                loop {
                    match chars.next() {
                        Some(ch) => {
                            temp.push(ch);

                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }
                        }
                        None => {
                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }

                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted, expected $",
                            );
                        }
                    }
                }
            } else {
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }

    fn tokenizer_error<R>(
        &self,
        loc: Location,
        message: impl Into<String>,
    ) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            location: loc,
        })
    }

    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
        let mut comment = peeking_take_while(chars, |ch| match ch {
            '\n' => false, // Always stop at \n
            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Also stop at \r for Postgres
            _ => true, // Keep consuming for other characters
        });

        if let Some(ch) = chars.next() {
            assert!(ch == '\n' || ch == '\r');
            comment.push(ch);
        }

        comment
    }

    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
        let mut s = first_chars.into();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

    fn tokenize_quoted_identifier(
        &self,
        quote_start: char,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        let error_loc = chars.location();
        chars.next(); // consume the opening quote
        let quote_end = Word::matching_end_quote(quote_start);
        let (s, last_char) = self.parse_quoted_ident(chars, quote_end);

        if last_char == Some(quote_end) {
            Ok(s)
        } else {
            self.tokenizer_error(
                error_loc,
                format!("Expected close delimiter '{quote_end}' before EOF."),
            )
        }
    }

    fn tokenize_escaped_single_quoted_string(
        &self,
        starting_loc: Location,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        if let Some(s) = unescape_single_quoted_string(chars) {
            return Ok(s);
        }

        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
    }

    fn tokenize_single_or_triple_quoted_string<F>(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
        single_quote_token: F,
        triple_quote_token: F,
    ) -> Result<Option<Token>, TokenizerError>
    where
        F: Fn(String) -> Token,
    {
        let error_loc = chars.location();

        let mut num_opening_quotes = 0u8;
        for _ in 0..3 {
            if Some(&quote_style) == chars.peek() {
                chars.next(); // consume the quote
                num_opening_quotes += 1;
            } else {
                break;
            }
        }

        let (token_fn, num_quote_chars) = match num_opening_quotes {
            1 => (single_quote_token, NumStringQuoteChars::One),
            2 => {
                // Exactly two quotes is an empty string, e.g. '' or ""
                return Ok(Some(single_quote_token("".into())));
            }
            3 => {
                let Some(num_quote_chars) = NonZeroU8::new(3) else {
                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
                };
                (
                    triple_quote_token,
                    NumStringQuoteChars::Many(num_quote_chars),
                )
            }
            _ => {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        };

        let settings = TokenizeQuotedStringSettings {
            quote_style,
            num_quote_chars,
            num_opening_quotes_to_consume: 0,
            backslash_escape,
        };

        self.tokenize_quoted_string(chars, settings)
            .map(token_fn)
            .map(Some)
    }

    fn tokenize_single_quoted_string(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
    ) -> Result<String, TokenizerError> {
        self.tokenize_quoted_string(
            chars,
            TokenizeQuotedStringSettings {
                quote_style,
                num_quote_chars: NumStringQuoteChars::One,
                num_opening_quotes_to_consume: 1,
                backslash_escape,
            },
        )
    }

    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume
                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // For a multi-quoted string, drop the previously
                        // accumulated quote characters from the string body.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    // consume the backslash
                    chars.next();

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape
                            || (self.dialect.ignores_wildcard_escapes()
                                && (*next == '%' || *next == '_'))
                        {
                            // In no-escape mode, the query is preserved verbatim,
                            // including the backslash.
                            s.push(ch);
                            s.push(*next);
                            chars.next(); // consume the escaped char
                        } else {
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next(); // consume the escaped char
                        }
                    }
                }
                ch => {
                    chars.next(); // consume
                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }

    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
        let supports_nested_comments = self.dialect.supports_nested_comments();

        loop {
            match chars.next() {
                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
                    chars.next(); // consume the '*'
                    s.push('/');
                    s.push('*');
                    nested += 1;
                }
                Some('*') if matches!(chars.peek(), Some('/')) => {
                    chars.next(); // consume the '/'
                    nested -= 1;
                    if nested == 0 {
                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                    }
                    s.push('*');
                    s.push('/');
                }
                Some(ch) => {
                    s.push(ch);
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    );
                }
            }
        }
    }

    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
        let mut last_char = None;
        let mut s = String::new();
        while let Some(ch) = chars.next() {
            if ch == quote_end {
                if chars.peek() == Some(&quote_end) {
                    chars.next();
                    s.push(ch);
                    if !self.unescape {
                        // In no-escape mode, the given query has to be saved completely
                        s.push(ch);
                    }
                } else {
                    last_char = Some(quote_end);
                    break;
                }
            } else {
                s.push(ch);
            }
        }
        (s, last_char)
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut State,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

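/// Reads characters from `chars` for as long as `predicate` returns true,
/// collecting them into a `String`. The first rejected character is left
/// in the stream. An illustrative sketch:
///
/// ```text
/// // input "abc123" with predicate |ch| ch.is_alphabetic()
/// // returns "abc" and leaves the stream at '1'
/// ```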
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

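/// Same as [`peeking_take_while`], but the predicate also receives a single
/// character of lookahead, which enables rules such as "an underscore in a
/// numeric literal must be followed by another digit".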
fn peeking_next_take_while(
    chars: &mut State,
    mut predicate: impl FnMut(char, Option<char>) -> bool,
) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        let next_char = chars.peekable.clone().nth(1);
        if predicate(ch, next_char) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
    Unescape::new(chars).unescape()
}

struct Unescape<'a: 'b, 'b> {
    chars: &'b mut State<'a>,
}

impl<'a: 'b, 'b> Unescape<'a, 'b> {
    fn new(chars: &'b mut State<'a>) -> Self {
        Self { chars }
    }

    fn unescape(mut self) -> Option<String> {
        let mut unescaped = String::new();

        self.chars.next(); // consume the opening quote

        while let Some(c) = self.chars.next() {
            if c == '\'' {
                // a doubled single quote is an escaped quote
                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                    self.chars.next();
                    unescaped.push('\'');
                    continue;
                }
                return Some(unescaped);
            }

            if c != '\\' {
                unescaped.push(c);
                continue;
            }

            let c = match self.chars.next()? {
                'b' => '\u{0008}',
                'f' => '\u{000C}',
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                'u' => self.unescape_unicode_16()?,
                'U' => self.unescape_unicode_32()?,
                'x' => self.unescape_hex()?,
                c if c.is_digit(8) => self.unescape_octal(c)?,
                c => c,
            };

            unescaped.push(Self::check_null(c)?);
        }

        None
    }

    #[inline]
    fn check_null(c: char) -> Option<char> {
        if c == '\0' {
            None
        } else {
            Some(c)
        }
    }

    #[inline]
    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
        match u32::from_str_radix(s, RADIX) {
            Err(_) => None,
            Ok(n) => {
                let n = n & 0xFF;
                if n <= 127 {
                    char::from_u32(n)
                } else {
                    None
                }
            }
        }
    }

    // Hexadecimal escape: up to 2 hex digits following "\x"
    fn unescape_hex(&mut self) -> Option<char> {
        let mut s = String::new();

        for _ in 0..2 {
            match self.next_hex_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        if s.is_empty() {
            // A bare "\x" with no hex digits is kept as a literal 'x'
            return Some('x');
        }

        Self::byte_to_char::<16>(&s)
    }

    #[inline]
    fn next_hex_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
            _ => None,
        }
    }

    // Octal escape: 1 to 3 octal digits following the backslash
    fn unescape_octal(&mut self, c: char) -> Option<char> {
        let mut s = String::new();

        s.push(c);
        for _ in 0..2 {
            match self.next_octal_digest() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        Self::byte_to_char::<8>(&s)
    }

    #[inline]
    fn next_octal_digest(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_digit(8) => self.chars.next(),
            _ => None,
        }
    }

    // Unicode escape: \u followed by exactly 4 hex digits
    fn unescape_unicode_16(&mut self) -> Option<char> {
        self.unescape_unicode::<4>()
    }

    // Unicode escape: \U followed by exactly 8 hex digits
    fn unescape_unicode_32(&mut self) -> Option<char> {
        self.unescape_unicode::<8>()
    }

    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
        let mut s = String::new();
        for _ in 0..NUM {
            s.push(self.chars.next()?);
        }
        match u32::from_str_radix(&s, 16) {
            Err(_) => None,
            Ok(n) => char::from_u32(n),
        }
    }
}

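/// Unescapes the body of a Unicode string literal (`U&'...'`), handling
/// doubled quotes, `\\`, 4-digit `\xxxx` and 6-digit `\+xxxxxx` escapes.
/// An illustrative sketch (PostgreSQL-style input):
///
/// ```text
/// U&'d\0061t\+000061'  =>  "data"
/// ```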
fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
    let mut unescaped = String::new();
    chars.next(); // consume the opening quote
    while let Some(c) = chars.next() {
        match c {
            '\'' => {
                if chars.peek() == Some(&'\'') {
                    chars.next();
                    unescaped.push('\'');
                } else {
                    return Ok(unescaped);
                }
            }
            '\\' => match chars.peek() {
                Some('\\') => {
                    chars.next();
                    unescaped.push('\\');
                }
                Some('+') => {
                    chars.next();
                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
                }
                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
            },
            _ => {
                unescaped.push(c);
            }
        }
    }
    Err(TokenizerError {
        message: "Unterminated unicode encoded string literal".to_string(),
        location: chars.location(),
    })
}

fn take_char_from_hex_digits(
    chars: &mut State<'_>,
    max_digits: usize,
) -> Result<char, TokenizerError> {
    let mut result = 0u32;
    for _ in 0..max_digits {
        let next_char = chars.next().ok_or_else(|| TokenizerError {
            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
                .to_string(),
            location: chars.location(),
        })?;
        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
            message: format!("Invalid hex digit in escaped unicode string: {next_char}"),
            location: chars.location(),
        })?;
        result = result * 16 + digit;
    }
    char::from_u32(result).ok_or_else(|| TokenizerError {
        message: format!("Invalid unicode character: {result:x}"),
        location: chars.location(),
    })
}

2416#[cfg(test)]
2417mod tests {
2418 use super::*;
2419 use crate::dialect::{
2420 BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
2421 };
2422 use crate::test_utils::{all_dialects_except, all_dialects_where};
2423 use core::fmt::Debug;
2424
    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_clickhouse_double_equal() {
        let sql = String::from("SELECT foo=='1'");
        let dialect = ClickHouseDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "foo".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::DoubleEq,
            Token::SingleQuotedString("1".to_string()),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_numeric_literal_underscore() {
        let dialect = GenericDialect {};
        let sql = String::from("SELECT 10_000");
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("10".to_string(), false),
            Token::make_word("_000", None),
        ];
        compare(expected, tokens);

        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
            "SELECT 10_000, _10_000, 10_00_, 10___0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_000".to_string(), false),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::make_word("_10_000", None),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_00".to_string(), false),
                Token::make_word("_", None),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10".to_string(), false),
                Token::make_word("___0", None),
            ],
        );
    }

    #[test]
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }

    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

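    // PostgreSQL-style dollar quoting: a literal is delimited by matching
    // `$tag$ ... $tag$` markers (or bare `$$ ... $$`), and everything between
    // them, including quotes and other `$` sequences, is kept verbatim. For
    // example, `$tag$it's $$ fine$tag$` yields the value `it's $$ fine`.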
    #[test]
    fn tokenize_dollar_quoted_string_tagged() {
        let test_cases = vec![
            (
                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
                        tag: Some("tag".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$x$ab$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "x$ab".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("0$abc$$abc$1"),
                vec![
                    Token::Number("0".into(), false),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    }),
                    Token::Number("1".into(), false),
                ]
            ),
            (
                String::from("$function$abc$q$data$q$$function$"),
                vec![
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "abc$q$data$q$".into(),
                        tag: Some("function".into()),
                    }),
                ]
            ),
        ];

        let dialect = GenericDialect {};
        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated() {
        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 91
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
        let sql = String::from("SELECT $abc$abc$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 17
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_placeholder() {
        let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
        let dialect = SQLiteDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$ABC$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC".into()),
            ]
        );
    }

    #[test]
    fn tokenize_nested_dollar_quoted_strings() {
        let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "dollar $nested$ string".into(),
                tag: Some("tag".into()),
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_empty() {
        let sql = String::from("SELECT $$$$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged() {
        let sql =
            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_unterminated() {
        let sql = String::from(
            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
        );
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted string".into(),
                location: Location {
                    line: 1,
                    column: 86
                }
            })
        );
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let test_cases = vec![
            (
                String::from("0--this is a comment\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                String::from("0--this is a comment\r1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r1".to_string(),
                    }),
                ],
            ),
            (
                String::from("0--this is a comment\r\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
        ];

        let dialect = GenericDialect {};

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_comment_postgres() {
        let sql = String::from("1--\r0");

        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "\r".to_string(),
            }),
            Token::Number("0".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

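    // Whether `/* ... /* ... */ ... */` nests is dialect-dependent; the cases
    // below run only against dialects whose `supports_nested_comments` flag
    // matches the behavior under test.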
    #[test]
    fn tokenize_nested_multiline_comment() {
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
                )),
                Token::Whitespace(Whitespace::Space),
                Token::Div,
                Token::Word(Word {
                    value: "comment".to_string(),
                    quote_style: None,
                    keyword: Keyword::COMMENT,
                }),
                Token::Mul,
                Token::Div,
                Token::Number("1".to_string(), false),
            ],
        );

        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
                )),
                Token::Number("1".to_string(), false),
            ],
        );

        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "SELECT 1/* a /* b */ c */0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
                Token::Number("0".to_string(), false),
            ],
        );
    }

    #[test]
    fn tokenize_nested_multiline_comment_empty() {
        all_dialects_where(|d| d.supports_nested_comments()).tokenizes_to(
            "select 1/*/**/*/0",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
                Token::Number("0".to_string(), false),
            ],
        );
    }

    #[test]
    fn tokenize_nested_comments_if_not_supported() {
        all_dialects_except(|d| d.supports_nested_comments()).tokenizes_to(
            "SELECT 1/*/* nested comment */*/0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "/* nested comment ".to_string(),
                )),
                Token::Mul,
                Token::Div,
                Token::Number("0".to_string(), false),
            ],
        );
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                location: Location { line: 1, column: 1 },
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_like_match() {
        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_snowflake_div() {
        let sql = r#"field/1000"#;
        let dialect = SnowflakeDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word(r#"field"#, None),
            Token::Div,
            Token::Number("1000".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (1, 7).into(),
                (1, 8).into(),
            ),
            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Newline),
                (1, 10).into(),
                (2, 1).into(),
            ),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (2, 1).into(),
                (2, 2).into(),
            ),
            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
        ];
        compare(expected, tokens);
    }

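    /// Asserts that `actual` matches `expected` exactly, token by token
    /// (whitespace tokens included).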
    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        assert_eq!(expected, actual);
    }

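    /// Wraps `s` in single quotes and feeds it to
    /// `unescape_single_quoted_string`; `expected` is `None` when the escape
    /// sequence should be rejected, and `Some` with the unescaped text otherwise.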
    fn check_unescape(s: &str, expected: Option<&str>) {
        let s = format!("'{s}'");
        let mut state = State {
            peekable: s.chars().peekable(),
            line: 0,
            col: 0,
        };

        assert_eq!(
            unescape_single_quoted_string(&mut state),
            expected.map(|s| s.to_string())
        );
    }

    #[test]
    fn test_unescape() {
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None);
        check_unescape(r"\u0000", None);
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
        check_unescape(r"\U00110000", None);
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}"));
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x"));
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}"));
        check_unescape(r"\1232", Some("\u{0053}2"));
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None);
        check_unescape(r"\080", None);

        check_unescape(r"\9", Some("9"));
        check_unescape(r"''", Some("'"));
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }

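    // Exercises dialects where identifiers may begin with a digit: the ad-hoc
    // dialect below opts into `supports_numeric_prefix` explicitly, and the
    // same checks also run against HiveDialect and MySqlDialect, which report
    // it natively.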
    #[test]
    fn tokenize_numeric_prefix_trait() {
        #[derive(Debug)]
        struct NumericPrefixDialect;

        impl Dialect for NumericPrefixDialect {
            fn is_identifier_start(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '$'
            }

            fn is_identifier_part(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '_'
                    || ch == '$'
                    || ch == '{'
                    || ch == '}'
            }

            fn supports_numeric_prefix(&self) -> bool {
                true
            }
        }

        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
        tokenize_numeric_prefix_inner(&HiveDialect {});
        tokenize_numeric_prefix_inner(&MySqlDialect {});
    }

    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
        let sql = r#"SELECT * FROM 1"#;
        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];
        compare(expected, tokens);
    }

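    // Backslash escapes in single-quoted strings (Snowflake dialect here):
    // with `with_unescape(false)` the raw text between the quotes is kept,
    // while `with_unescape(true)` resolves each escape sequence.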
    #[test]
    fn tokenize_quoted_string_escape() {
        let dialect = SnowflakeDialect {};
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
            (r#"'\\'"#, r#"\\"#, r#"\"#),
            (
                r#"'\0\a\b\f\n\r\t\Z'"#,
                r#"\0\a\b\f\n\r\t\Z"#,
                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
            ),
            (r#"'\"'"#, r#"\""#, "\""),
            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
            (r#"'\q'"#, r#"\q"#, r#"q"#),
            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
        ] {
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected.to_string())];
            compare(expected, tokens);

            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }

        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
            let dialect = GenericDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }

        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
            let dialect = MySqlDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }
    }

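    // Triple-quoted strings (BigQuery dialect): `check` is invoked once per
    // quote character, with `q` as the active quote and `r` as the opposite
    // one, so `'''...'''` and `"""..."""` share a single table of cases.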
    #[test]
    fn tokenize_triple_quoted_string() {
        fn check<F>(
            q: char,
            r: char,
            quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            for (sql, expected, expected_unescaped) in [
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let dialect = SnowflakeDialect {};
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn test_mysql_users_grantees() {
        let dialect = MySqlDialect {};

        let sql = "CREATE USER `root`@`%`";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("CREATE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("USER"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("root", Some('`')),
            Token::AtSign,
            Token::make_word("%", Some('`')),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_string_literal() {
        let dialect = MySqlDialect {};

        let sql = "SELECT @'1'";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::SingleQuotedString("1".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_quoted_column() {
        let dialect = MySqlDialect {};

        let sql = r#"SELECT @"bar" FROM foo"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::DoubleQuotedString("bar".to_string()),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_national_strings_backslash_escape_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\'",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("''\\".to_string()),
                ],
            );
    }

    #[test]
    fn test_national_strings_backslash_escape_supported() {
        all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\''",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("'''".to_string()),
                ],
            );
    }

    #[test]
    fn test_string_escape_constant_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("e", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );

        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("E", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );
    }

    #[test]
    fn test_string_escape_constant_supported() {
        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );
    }

    #[test]
    fn test_whitespace_required_after_single_line_comment() {
        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                    Token::SingleQuotedString("abc".to_string()),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                ],
            );
    }

    #[test]
    fn test_whitespace_not_required_after_single_line_comment() {
        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "".to_string(),
                    }),
                ],
            );
    }

    #[test]
    fn test_tokenize_identifiers_numeric_prefix() {
        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        );
    }

    #[test]
    fn tokenize_period_underscore() {
        let sql = String::from("SELECT table._col");
        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "table".to_string(),
                quote_style: None,
                keyword: Keyword::TABLE,
            }),
            Token::Period,
            Token::Word(Word {
                value: "_col".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
        ];

        compare(expected, tokens);

        let sql = String::from("SELECT ._123");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }

        let sql = String::from("SELECT ._abc");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }
    }
}