sqlparser/tokenizer.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! SQL Tokenizer
//!
//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
//!
//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).

#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::iter::Peekable;
use core::num::NonZeroU8;
use core::str::Chars;
use core::{cmp, fmt};

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use yachtsql_sqlparser_derive::{Visit, VisitMut};

use crate::dialect::Dialect;
use crate::dialect::{
    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
    SnowflakeDialect,
};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use crate::{ast::DollarQuotedString, dialect::HiveDialect};
/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted strings: Example '''abc'''
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedString(String),
    /// Triple double quoted strings: Example """abc"""
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string' (note that some backends, such as
    /// PostgreSQL, may treat this syntax as a bit string literal instead, i.e: b'10010101')
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedByteStringLiteral(String),
    /// Single quoted literal with raw string prefix. Example `R'abc'`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    SingleQuotedRawStringLiteral(String),
    /// Double quoted literal with raw string prefix. Example `R"abc"`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedRawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// "escaped" string literal, which is an extension to the SQL standard: i.e: e'first \n second' or E'first \n second'
    EscapedStringLiteral(String),
    /// Unicode string literal: i.e: U&'first \000A second'
    UnicodeStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Spaceship operator `<=>`
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Integer division operator `//` in DuckDB
    DuckIntDiv,
    /// Modulo Operator `%`
    Mod,
    /// String concatenation `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Assignment `:=` (used for keyword argument in DuckDB macros and some functions, and for variable declarations in DuckDB and Snowflake)
    Assignment,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
    /// Sharp `#` used for PostgreSQL Bitwise XOR operator, also PostgreSQL/Redshift geometrical unary/binary operator (Number of points in path or polygon/Intersection)
    Sharp,
    /// `##` PostgreSQL/Redshift geometrical binary operator (Point of closest proximity)
    DoubleSharp,
    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
    Tilde,
    /// `~*`, a case insensitive match regular expression operator in PostgreSQL
    TildeAsterisk,
    /// `!~`, a case sensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*`, a case insensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `~~`, a case sensitive match pattern operator in PostgreSQL
    DoubleTilde,
    /// `~~*`, a case insensitive match pattern operator in PostgreSQL
    DoubleTildeAsterisk,
    /// `!~~`, a case sensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTilde,
    /// `!~~*`, a case insensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTildeAsterisk,
    /// `<<`, a bitwise shift left operator in PostgreSQL
    ShiftLeft,
    /// `>>`, a bitwise shift right operator in PostgreSQL
    ShiftRight,
    /// `&&`, an overlap operator in PostgreSQL
    Overlap,
    /// Exclamation Mark `!` used for PostgreSQL factorial operator
    ExclamationMark,
    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// AtSign `@` used for PostgreSQL abs operator, also PostgreSQL/Redshift geometrical unary/binary operator (Center, Contained or on)
    AtSign,
    /// `^@`, a "starts with" string operator in PostgreSQL
    CaretAt,
    /// `|/`, a square root math operator in PostgreSQL
    PGSquareRoot,
    /// `||/`, a cube root math operator in PostgreSQL
    PGCubeRoot,
    /// `?` or `$`, a prepared statement arg placeholder
    Placeholder(String),
    /// `->`, used as an operator to extract json field in PostgreSQL
    Arrow,
    /// `->>`, used as an operator to extract json field as text in PostgreSQL
    LongArrow,
    /// `#>`, extracts JSON sub-object at the specified path
    HashArrow,
    /// `@-@` PostgreSQL/Redshift geometrical unary operator (Length or circumference)
    AtDashAt,
    /// `?-` PostgreSQL/Redshift geometrical unary/binary operator (Is horizontal?/Are horizontally aligned?)
    QuestionMarkDash,
    /// `&<` PostgreSQL/Redshift geometrical binary operator (Overlaps to left?)
    AmpersandLeftAngleBracket,
    /// `&>` PostgreSQL/Redshift geometrical binary operator (Overlaps to right?)
    AmpersandRightAngleBracket,
    /// `&<|` PostgreSQL/Redshift geometrical binary operator (Does not extend above?)
    AmpersandLeftAngleBracketVerticalBar,
    /// `|&>` PostgreSQL/Redshift geometrical binary operator (Does not extend below?)
    VerticalBarAmpersandRightAngleBracket,
    /// `<->` PostgreSQL/Redshift geometrical binary operator (Distance between)
    TwoWayArrow,
    /// `<^` PostgreSQL/Redshift geometrical binary operator (Is below?)
    LeftAngleBracketCaret,
    /// `>^` PostgreSQL/Redshift geometrical binary operator (Is above?)
    RightAngleBracketCaret,
    /// `?#` PostgreSQL/Redshift geometrical binary operator (Intersects or overlaps)
    QuestionMarkSharp,
    /// `?-|` PostgreSQL/Redshift geometrical binary operator (Is perpendicular?)
    QuestionMarkDashVerticalBar,
    /// `?||` PostgreSQL/Redshift geometrical binary operator (Are parallel?)
    QuestionMarkDoubleVerticalBar,
    /// `~=` PostgreSQL/Redshift geometrical binary operator (Same as)
    TildeEqual,
    /// `<<|` PostgreSQL/Redshift geometrical binary operator (Is strictly below?)
    ShiftLeftVerticalBar,
    /// `|>>` PostgreSQL/Redshift geometrical binary operator (Is strictly above?)
    VerticalBarShiftRight,
    /// `|>` BigQuery pipe operator
    VerticalBarRightAngleBracket,
    /// `#>>`, extracts JSON sub-object at the specified path as text
    HashLongArrow,
    /// jsonb @> jsonb -> boolean: Test whether left json contains the right json
    AtArrow,
    /// jsonb <@ jsonb -> boolean: Test whether right json contains the left json
    ArrowAt,
    /// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified
    /// path, where path elements can be either field keys or array indexes.
    HashMinus,
    /// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified
    /// JSON value?
    AtQuestion,
    /// jsonb @@ jsonpath -> boolean: Returns the result of a JSON path predicate check
    /// for the specified JSON value. Only the first item of the result is taken into
    /// account. If the result is not Boolean, then NULL is returned.
    AtAt,
    /// jsonb ? text -> boolean: Checks whether the string exists as a top-level key within the
    /// jsonb object
    Question,
    /// jsonb ?& text[] -> boolean: Check whether all members of the text array exist as top-level
    /// keys within the jsonb object
    QuestionAnd,
    /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level
    /// keys within the jsonb object
    QuestionPipe,
    /// Custom binary operator
    /// This is used to represent any custom binary operator that is not part of the SQL standard.
    /// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
    CustomBinaryOperator(String),
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::Assignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::DoubleSharp => f.write_str("##"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::AtDashAt => f.write_str("@-@"),
            Token::QuestionMarkDash => f.write_str("?-"),
            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
            Token::AmpersandRightAngleBracket => f.write_str("&>"),
            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
            Token::VerticalBarRightAngleBracket => f.write_str("|>"),
            Token::TwoWayArrow => f.write_str("<->"),
            Token::LeftAngleBracketCaret => f.write_str("<^"),
            Token::RightAngleBracketCaret => f.write_str(">^"),
            Token::QuestionMarkSharp => f.write_str("?#"),
            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
            Token::TildeEqual => f.write_str("~="),
            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
            Token::VerticalBarShiftRight => f.write_str("|>>"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
            Token::Question => write!(f, "?"),
            Token::QuestionAnd => write!(f, "?&"),
            Token::QuestionPipe => write!(f, "?|"),
            Token::CustomBinaryOperator(s) => f.write_str(s),
        }
    }
}

impl Token {
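    /// Create a keyword token (e.g. `SELECT`) from its string form.
    ///
    /// A minimal usage sketch; the `sqlparser` crate path below follows the
    /// other doc examples in this module, and `Keyword::SELECT` comes from
    /// the public `keywords` module:
    /// ```
    /// # use sqlparser::tokenizer::Token;
    /// # use sqlparser::keywords::Keyword;
    /// if let Token::Word(w) = Token::make_keyword("SELECT") {
    ///     // An unquoted word that matches a known keyword gets that keyword
    ///     assert_eq!(w.keyword, Keyword::SELECT);
    ///     assert_eq!(w.quote_style, None);
    /// }
    /// ```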
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with the
    /// escape sequences (if any) processed (TODO: escapes are not handled)
    pub value: String,
    /// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this will have one of the values from dialect::keywords, otherwise `Keyword::NoKeyword`
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"', // ANSI and most dialects
            '[' => ']', // MS SQL
            '`' => '`', // MySQL
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}

/// Location in input string
///
/// # Create an "empty" (unknown) `Location`
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::empty();
/// ```
///
/// # Create a `Location` from a line and column
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::new(1, 1);
/// ```
///
/// # Create a `Location` from a pair
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::from((1, 1));
/// ```
#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1.
    ///
    /// Note: Line 0 is used for empty spans
    pub line: u64,
    /// Line column, starting from 1.
    ///
    /// Note: Column 0 is used for empty spans
    pub column: u64,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        if self.line == 0 {
            return Ok(());
        }
        write!(f, " at Line: {}, Column: {}", self.line, self.column)
    }
}

impl fmt::Debug for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Location({},{})", self.line, self.column)
    }
}

impl Location {
    /// Return an "empty" / unknown location
    pub fn empty() -> Self {
        Self { line: 0, column: 0 }
    }

    /// Create a new `Location` for a given line and column
    pub fn new(line: u64, column: u64) -> Self {
        Self { line, column }
    }

    /// Create a new location for a given line and column
    ///
    /// Alias for [`Self::new`]
    // TODO: remove / deprecate in favor of `new` for consistency?
    pub fn of(line: u64, column: u64) -> Self {
        Self::new(line, column)
    }

    /// Combine self and `end` into a new `Span`
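    ///
    /// A small usage sketch:
    /// ```
    /// # use sqlparser::tokenizer::{Location, Span};
    /// let span = Location::new(1, 1).span_to(Location::new(1, 5));
    /// assert_eq!(span, Span::new(Location::new(1, 1), Location::new(1, 5)));
    /// ```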
    pub fn span_to(self, end: Self) -> Span {
        Span { start: self, end }
    }
}

impl From<(u64, u64)> for Location {
    fn from((line, column): (u64, u64)) -> Self {
        Self { line, column }
    }
}

/// A span represents a linear portion of the input string (start, end)
///
/// See [Spanned](crate::ast::Spanned) for more information.
#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    pub start: Location,
    pub end: Location,
}

impl fmt::Debug for Span {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Span({:?}..{:?})", self.start, self.end)
    }
}

impl Span {
    // An empty span (0, 0) -> (0, 0)
    // We need a const instance for pattern matching
    const EMPTY: Span = Self::empty();

    /// Create a new span from a start and end [`Location`]
    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    /// Returns an empty span `(0, 0) -> (0, 0)`
    ///
    /// Empty spans represent no knowledge of source location
    /// See [Spanned](crate::ast::Spanned) for more information.
    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }

    /// Returns the smallest Span that contains both `self` and `other`
    /// If either span is [Span::empty], the other span is returned
    ///
    /// # Examples
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// // line 1, column 1 -> line 2, column 5
    /// let span1 = Span::new(Location::new(1, 1), Location::new(2, 5));
    /// // line 2, column 3 -> line 3, column 7
    /// let span2 = Span::new(Location::new(2, 3), Location::new(3, 7));
    /// // Union of the two is the min/max of the two spans
    /// // line 1, column 1 -> line 3, column 7
    /// let union = span1.union(&span2);
    /// assert_eq!(union, Span::new(Location::new(1, 1), Location::new(3, 7)));
    /// ```
    pub fn union(&self, other: &Span) -> Span {
        // If either span is empty, return the other
        // this prevents propagating (0, 0) through the tree
        match (self, other) {
            (&Span::EMPTY, _) => *other,
            (_, &Span::EMPTY) => *self,
            _ => Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            },
        }
    }

    /// Same as [Span::union] for `Option<Span>`
    ///
    /// If `other` is `None`, `self` is returned
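    ///
    /// # Example
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// let span = Span::new(Location::new(1, 1), Location::new(1, 5));
    /// // Unioning with `None` leaves the span unchanged
    /// assert_eq!(span.union_opt(&None), span);
    /// ```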
    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        match other {
            Some(other) => self.union(other),
            None => *self,
        }
    }

    /// Return the [Span::union] of all spans in the iterator
    ///
    /// If the iterator is empty, an empty span is returned
    ///
    /// # Example
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// let spans = vec![
    ///     Span::new(Location::new(1, 1), Location::new(2, 5)),
    ///     Span::new(Location::new(2, 3), Location::new(3, 7)),
    ///     Span::new(Location::new(3, 1), Location::new(4, 2)),
    /// ];
    /// // line 1, column 1 -> line 4, column 2
    /// assert_eq!(
    ///   Span::union_iter(spans),
    ///   Span::new(Location::new(1, 1), Location::new(4, 2))
    /// );
    /// ```
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        iter.into_iter()
            .reduce(|acc, item| acc.union(&item))
            .unwrap_or(Span::empty())
    }
}

/// Backwards compatibility struct for [`TokenWithSpan`]
#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;

/// A [Token] with [Span] attached to it
///
/// This is used to track the location of a token in the input string
///
/// # Examples
/// ```
/// # use sqlparser::tokenizer::{Location, Span, Token, TokenWithSpan};
/// // commas @ line 1, column 10
/// let tok1 = TokenWithSpan::new(
///   Token::Comma,
///   Span::new(Location::new(1, 10), Location::new(1, 11)),
/// );
/// assert_eq!(tok1, Token::Comma); // can compare the token
///
/// // commas @ line 2, column 20
/// let tok2 = TokenWithSpan::new(
///   Token::Comma,
///   Span::new(Location::new(2, 20), Location::new(2, 21)),
/// );
/// // same token but different locations are not equal
/// assert_ne!(tok1, tok2);
/// ```
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    pub token: Token,
    pub span: Span,
}

impl TokenWithSpan {
    /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`]
    pub fn new(token: Token, span: Span) -> Self {
        Self { token, span }
    }

    /// Wrap a token with an empty span
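    ///
    /// A small usage sketch:
    /// ```
    /// # use sqlparser::tokenizer::{Span, Token, TokenWithSpan};
    /// let tok = TokenWithSpan::wrap(Token::Comma);
    /// assert_eq!(tok.span, Span::empty());
    /// ```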
    pub fn wrap(token: Token) -> Self {
        Self::new(token, Span::empty())
    }

    /// Wrap a token with a location from `start` to `end`
    pub fn at(token: Token, start: Location, end: Location) -> Self {
        Self::new(token, Span::new(start, end))
    }

    /// Return an EOF token with no location
    pub fn new_eof() -> Self {
        Self::wrap(Token::EOF)
    }
}

impl PartialEq<Token> for TokenWithSpan {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithSpan> for Token {
    fn eq(&self, other: &TokenWithSpan) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithSpan {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

/// Tokenizer error
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}{}", self.message, self.location)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}

impl State<'_> {
    /// return the next character and advance the stream
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    /// return the next character but do not advance the stream
    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}

/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// e.g. `"abc"`, `'abc'`, `r'abc'`
    One,
    /// e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''`
    Many(NonZeroU8),
}

/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
    /// The character used to quote the string.
    quote_style: char,
    /// Represents how many quote characters enclose the string literal.
    num_quote_chars: NumStringQuoteChars,
    /// The number of opening quotes left to consume, before parsing
    /// the remaining string literal.
    /// For example: given the initial string `"""abc"""`, if the caller has
    /// already parsed the first quote for some reason, then this value
    /// is set to 1, flagging that only 2 leading quotes remain to be consumed.
    num_opening_quotes_to_consume: u8,
    /// True if the string uses backslash escaping of special characters
    /// e.g. `'abc\ndef\'ghi'`
    backslash_escape: bool,
}

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    /// If true (the default), the tokenizer will un-escape literal
    /// SQL strings. See [`Tokenizer::with_unescape`] for more details.
    unescape: bool,
}

impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let query = r#"SELECT 'foo'"#;
    ///
    /// // Parsing the query
    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
    ///
    /// assert_eq!(tokens, vec![
    ///   Token::make_word("SELECT", None),
    ///   Token::Whitespace(Whitespace::Space),
    ///   Token::SingleQuotedString("foo".to_string()),
    /// ]);
    /// ```
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }

    /// Set unescape mode
    ///
    /// When true (default) the tokenizer unescapes literal values
    /// (for example, `""` in SQL is unescaped to the literal `"`).
    ///
    /// When false, the tokenizer provides the raw strings as provided
    /// in the query.  This can be helpful for programs that wish to
    /// recover the *exact* original query text without normalizing
    /// the escaping.
    ///
    /// # Example
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let query = r#""Foo "" Bar""#;
    /// let unescaped = Token::make_word(r#"Foo " Bar"#, Some('"'));
    /// let original  = Token::make_word(r#"Foo "" Bar"#, Some('"'));
    ///
    /// // Parsing with unescaping (default)
    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
    /// assert_eq!(tokens, vec![unescaped]);
    ///
    /// // Parsing with unescape = false
    /// let tokens = Tokenizer::new(&dialect, &query)
    ///    .with_unescape(false)
    ///    .tokenize().unwrap();
    /// assert_eq!(tokens, vec![original]);
    /// ```
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }

    /// Tokenize the statement and produce a vector of tokens
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;
        Ok(twl.into_iter().map(|t| t.token).collect())
    }

    /// Tokenize the statement and produce a vector of tokens with location information
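    ///
    /// A minimal sketch of reading back the spans (assuming the
    /// `GenericDialect`, as in the other examples in this module):
    /// ```
    /// # use sqlparser::tokenizer::Tokenizer;
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let tokens = Tokenizer::new(&dialect, "SELECT a")
    ///     .tokenize_with_location()
    ///     .unwrap();
    /// // The first token ("SELECT") starts at line 1, column 1
    /// assert_eq!(tokens[0].span.start.line, 1);
    /// assert_eq!(tokens[0].span.start.column, 1);
    /// ```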
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
        let mut tokens: Vec<TokenWithSpan> = vec![];
        self.tokenize_with_location_into_buf(&mut tokens)
            .map(|_| tokens)
    }

    /// Tokenize the statement and append tokens with location information into the provided buffer.
    /// If an error is returned, the buffer will contain all tokens that were successfully parsed before the error.
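    ///
    /// A usage sketch (again assuming the `GenericDialect`):
    /// ```
    /// # use sqlparser::tokenizer::{Tokenizer, TokenWithSpan};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let mut buf: Vec<TokenWithSpan> = vec![];
    /// Tokenizer::new(&dialect, "SELECT 1")
    ///     .tokenize_with_location_into_buf(&mut buf)
    ///     .unwrap();
    /// assert!(!buf.is_empty());
    /// ```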
    pub fn tokenize_with_location_into_buf(
        &mut self,
        buf: &mut Vec<TokenWithSpan>,
    ) -> Result<(), TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let span = location.span_to(state.location());

            buf.push(TokenWithSpan { token, span });

            location = state.location();
        }
        Ok(())
    }

    // Tokenize the identifier or keywords in `ch`
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // TODO: implement parsing of exponent here
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }

    /// Get the next token or return None
    fn next_token(
        &self,
        chars: &mut State,
        prev_token: Option<&Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // BigQuery and MySQL use b or B for byte string literal, Postgres for bit strings
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
                {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '\'',
                                        false,
                                        Token::SingleQuotedByteStringLiteral,
                                        Token::TripleSingleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '"',
                                        false,
                                        Token::DoubleQuotedByteStringLiteral,
                                        Token::TripleDoubleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with a "b" or "B"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // BigQuery uses r or R for raw string literal
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                false,
                                Token::SingleQuotedRawStringLiteral,
                                Token::TripleSingleQuotedRawStringLiteral,
                            ),
                        Some('\"') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                false,
                                Token::DoubleQuotedRawStringLiteral,
                                Token::TripleDoubleQuotedRawStringLiteral,
                            ),
                        _ => {
                            // regular identifier starting with an "r" or "R"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Redshift uses lower case n for national string literal
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let backslash_escape =
                                self.dialect.supports_string_literal_backslash_escape();
                            let s =
                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
                x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
                    let starting_loc = chars.location();
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "E" or "e"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
                    chars.next(); // consume, to check the next char
                    if chars.peek() == Some(&'&') {
                        // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
                        let mut chars_clone = chars.peekable.clone();
                        chars_clone.next(); // consume the '&' in the clone
                        if chars_clone.peek() == Some(&'\'') {
                            chars.next(); // consume the '&' in the original iterator
                            let s = unescape_unicode_single_quoted_string(chars)?;
                            return Ok(Some(Token::UnicodeStringLiteral(s)));
                        }
                    }
                    // regular identifier starting with a "U" or "u"
                    let s = self.tokenize_word(x, chars);
                    Ok(Some(Token::make_word(&s, None)))
                }
                // The spec only allows an uppercase 'X' to introduce a hex
                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a <binary string literal>
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "X"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // single quoted string
                '\'' => {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::SingleQuotedString,
                                Token::TripleSingleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '\'',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // double quoted string
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::DoubleQuotedString,
                                Token::TripleDoubleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '"',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if self.dialect.is_delimited_identifier_start(ch) => {
                    let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                    Ok(Some(Token::make_word(&word, Some(quote_start))))
                }
                // Potentially nested delimited (quoted) identifier
                quote_start
                    if self
                        .dialect
                        .is_nested_delimited_identifier_start(quote_start)
                        && self
                            .dialect
                            .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                            .is_some() =>
                {
                    let Some((quote_start, nested_quote_start)) = self
                        .dialect
                        .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                    else {
                        return self.tokenizer_error(
                            chars.location(),
                            format!("Expected nested delimiter '{quote_start}' before EOF."),
                        );
                    };

                    let Some(nested_quote_start) = nested_quote_start else {
                        let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                        return Ok(Some(Token::make_word(&word, Some(quote_start))));
                    };

                    let mut word = vec![];
                    let quote_end = Word::matching_end_quote(quote_start);
                    let nested_quote_end = Word::matching_end_quote(nested_quote_start);
                    let error_loc = chars.location();

                    chars.next(); // skip the first delimiter
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&nested_quote_start) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
                        );
                    }
                    word.push(nested_quote_start.into());
                    word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
                    word.push(nested_quote_end.into());
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&quote_end) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        );
                    }
                    chars.next(); // skip close delimiter

                    Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
                }
                // numbers and period
                '0'..='9' | '.' => {
                    // Special case: if `._` is encountered after a word, that word
                    // is a table name and the `_` starts the column name.
                    // If the previous token is not a word, this is not valid SQL
                    // as either a word or a number.
1198                    if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
1199                        if let Some(Token::Word(_)) = prev_token {
1200                            chars.next();
1201                            return Ok(Some(Token::Period));
1202                        }
1203
1204                        return self.tokenizer_error(
1205                            chars.location(),
1206                            "Unexpected character '_'".to_string(),
1207                        );
1208                    }
1209
1210                    // Some dialects support underscore as number separator
1211                    // There can only be one at a time and it must be followed by another digit
1212                    let is_number_separator = |ch: char, next_char: Option<char>| {
1213                        self.dialect.supports_numeric_literal_underscores()
1214                            && ch == '_'
1215                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
1216                    };
1217
1218                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
1219                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1220                    });
1221
1222                    // match binary literal that starts with 0x
1223                    if s == "0" && chars.peek() == Some(&'x') {
1224                        chars.next();
1225                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
1226                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
1227                        });
1228                        return Ok(Some(Token::HexStringLiteral(s2)));
1229                    }
1230
1231                    // match one period
1232                    if let Some('.') = chars.peek() {
1233                        s.push('.');
1234                        chars.next();
1235                    }
1236
1237                    // If the dialect supports identifiers that start with a numeric prefix
1238                    // and we have now consumed a dot, check if the previous token was a Word.
1239                    // If so, what follows is definitely not part of a decimal number and
1240                    // we should yield the dot as a dedicated token so compound identifiers
1241                    // starting with digits can be parsed correctly.
1242                    if s == "." && self.dialect.supports_numeric_prefix() {
1243                        if let Some(Token::Word(_)) = prev_token {
1244                            return Ok(Some(Token::Period));
1245                        }
1246                    }
1247
1248                    // Consume fractional digits.
1249                    s += &peeking_next_take_while(chars, |ch, next_ch| {
1250                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1251                    });
1252
1253                    // No fraction -> Token::Period
1254                    if s == "." {
1255                        return Ok(Some(Token::Period));
1256                    }
1257
1258                    // Parse exponent as number
1259                    let mut exponent_part = String::new();
1260                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
1261                        let mut char_clone = chars.peekable.clone();
1262                        exponent_part.push(char_clone.next().unwrap());
1263
1264                        // Optional sign
1265                        match char_clone.peek() {
1266                            Some(&c) if matches!(c, '+' | '-') => {
1267                                exponent_part.push(c);
1268                                char_clone.next();
1269                            }
1270                            _ => (),
1271                        }
1272
1273                        match char_clone.peek() {
1274                            // Definitely an exponent, get original iterator up to speed and use it
1275                            Some(&c) if c.is_ascii_digit() => {
1276                                for _ in 0..exponent_part.len() {
1277                                    chars.next();
1278                                }
1279                                exponent_part +=
1280                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
1281                                s += exponent_part.as_str();
1282                            }
1283                            // Not an exponent, discard the work done
1284                            _ => (),
1285                        }
1286                    }
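                    // Lookahead example (sketch): `1e-10` becomes Number("1e-10"),
                    // while for `1ea` no digit follows the sign position, so the
                    // cloned iterator is discarded and `e` is left for the word
                    // tokenizer, yielding Number("1") then Word("ea").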
1287
1288                    // If the dialect supports identifiers that start with a numeric prefix,
1289                    // we need to check if the value is in fact an identifier and must thus
1290                    // be tokenized as a word.
1291                    if self.dialect.supports_numeric_prefix() {
1292                        if exponent_part.is_empty() {
1293                            // If it is not a number with an exponent, it may be
1294                            // an identifier starting with digits.
1295                            let word =
1296                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
1297
1298                            if !word.is_empty() {
1299                                s += word.as_str();
1300                                return Ok(Some(Token::make_word(s.as_str(), None)));
1301                            }
1302                        } else if prev_token == Some(&Token::Period) {
1303                            // If the previous token was a period, thus not belonging to a number,
1304                            // the value we have is part of an identifier.
1305                            return Ok(Some(Token::make_word(s.as_str(), None)));
1306                        }
1307                    }
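                    // E.g. with numeric prefixes enabled, `123abc` is re-joined here
                    // into Word("123abc") rather than Number("123") plus Word("abc")
                    // (illustrative; `1.2e3` still stays a plain Number).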
1308
1309                    let long = if chars.peek() == Some(&'L') {
1310                        chars.next();
1311                        true
1312                    } else {
1313                        false
1314                    };
1315                    Ok(Some(Token::Number(s, long)))
1316                }
1317                // punctuation
1318                '(' => self.consume_and_return(chars, Token::LParen),
1319                ')' => self.consume_and_return(chars, Token::RParen),
1320                ',' => self.consume_and_return(chars, Token::Comma),
1321                // operators
1322                '-' => {
1323                    chars.next(); // consume the '-'
1324
1325                    match chars.peek() {
1326                        Some('-') => {
1327                            let mut is_comment = true;
1328                            if self.dialect.requires_single_line_comment_whitespace() {
1329                                is_comment = Some(' ') == chars.peekable.clone().nth(1);
1330                            }
1331
1332                            if is_comment {
1333                                chars.next(); // consume second '-'
1334                                let comment = self.tokenize_single_line_comment(chars);
1335                                return Ok(Some(Token::Whitespace(
1336                                    Whitespace::SingleLineComment {
1337                                        prefix: "--".to_owned(),
1338                                        comment,
1339                                    },
1340                                )));
1341                            }
1342
1343                            self.start_binop(chars, "-", Token::Minus)
1344                        }
1345                        Some('>') => {
1346                            chars.next();
1347                            match chars.peek() {
1348                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
1349                                _ => self.start_binop(chars, "->", Token::Arrow),
1350                            }
1351                        }
1352                        // a regular '-' operator
1353                        _ => self.start_binop(chars, "-", Token::Minus),
1354                    }
1355                }
1356                '/' => {
1357                    chars.next(); // consume the '/'
1358                    match chars.peek() {
1359                        Some('*') => {
1360                            chars.next(); // consume the '*', starting a multi-line comment
1361                            self.tokenize_multiline_comment(chars)
1362                        }
1363                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
1364                            chars.next(); // consume the second '/', starting a snowflake single-line comment
1365                            let comment = self.tokenize_single_line_comment(chars);
1366                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1367                                prefix: "//".to_owned(),
1368                                comment,
1369                            })))
1370                        }
1371                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
1372                            self.consume_and_return(chars, Token::DuckIntDiv)
1373                        }
1374                        // a regular '/' operator
1375                        _ => Ok(Some(Token::Div)),
1376                    }
1377                }
1378                '+' => self.consume_and_return(chars, Token::Plus),
1379                '*' => self.consume_and_return(chars, Token::Mul),
1380                '%' => {
1381                    chars.next(); // advance past '%'
1382                    match chars.peek() {
1383                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
1384                        Some(sch) if self.dialect.is_identifier_start('%') => {
1385                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1386                        }
1387                        _ => self.start_binop(chars, "%", Token::Mod),
1388                    }
1389                }
1390                '|' => {
1391                    chars.next(); // consume the '|'
1392                    match chars.peek() {
1393                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
1394                        Some('|') => {
1395                            chars.next(); // consume the second '|'
1396                            match chars.peek() {
1397                                Some('/') => {
1398                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
1399                                }
1400                                _ => self.start_binop(chars, "||", Token::StringConcat),
1401                            }
1402                        }
1403                        Some('&') if self.dialect.supports_geometric_types() => {
1404                            chars.next(); // consume
1405                            match chars.peek() {
1406                                Some('>') => self.consume_for_binop(
1407                                    chars,
1408                                    "|&>",
1409                                    Token::VerticalBarAmpersandRightAngleBracket,
1410                                ),
1411                                _ => self.start_binop_opt(chars, "|&", None),
1412                            }
1413                        }
1414                        Some('>') if self.dialect.supports_geometric_types() => {
1415                            chars.next(); // consume
1416                            match chars.peek() {
1417                                Some('>') => self.consume_for_binop(
1418                                    chars,
1419                                    "|>>",
1420                                    Token::VerticalBarShiftRight,
1421                                ),
1422                                _ => self.start_binop_opt(chars, "|>", None),
1423                            }
1424                        }
1425                        Some('>') if self.dialect.supports_pipe_operator() => {
1426                            self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
1427                        }
1428                        // Bitwise '|' operator
1429                        _ => self.start_binop(chars, "|", Token::Pipe),
1430                    }
1431                }
1432                '=' => {
1433                    chars.next(); // consume
1434                    match chars.peek() {
1435                        Some('>') => self.consume_and_return(chars, Token::RArrow),
1436                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
1437                        _ => Ok(Some(Token::Eq)),
1438                    }
1439                }
1440                '!' => {
1441                    chars.next(); // consume
1442                    match chars.peek() {
1443                        Some('=') => self.consume_and_return(chars, Token::Neq),
1444                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
1445                        Some('~') => {
1446                            chars.next();
1447                            match chars.peek() {
1448                                Some('*') => self
1449                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
1450                                Some('~') => {
1451                                    chars.next();
1452                                    match chars.peek() {
1453                                        Some('*') => self.consume_and_return(
1454                                            chars,
1455                                            Token::ExclamationMarkDoubleTildeAsterisk,
1456                                        ),
1457                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
1458                                    }
1459                                }
1460                                _ => Ok(Some(Token::ExclamationMarkTilde)),
1461                            }
1462                        }
1463                        _ => Ok(Some(Token::ExclamationMark)),
1464                    }
1465                }
1466                '<' => {
1467                    chars.next(); // consume
1468                    match chars.peek() {
1469                        Some('=') => {
1470                            chars.next();
1471                            match chars.peek() {
1472                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
1473                                _ => self.start_binop(chars, "<=", Token::LtEq),
1474                            }
1475                        }
1479                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
1480                        Some('<') if self.dialect.supports_geometric_types() => {
1481                            chars.next(); // consume
1482                            match chars.peek() {
1483                                Some('|') => self.consume_for_binop(
1484                                    chars,
1485                                    "<<|",
1486                                    Token::ShiftLeftVerticalBar,
1487                                ),
1488                                _ => self.start_binop(chars, "<<", Token::ShiftLeft),
1489                            }
1490                        }
1491                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
1492                        Some('-') if self.dialect.supports_geometric_types() => {
1493                            chars.next(); // consume
1494                            match chars.peek() {
1495                                Some('>') => {
1496                                    self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
1497                                }
1498                                _ => self.start_binop_opt(chars, "<-", None),
1499                            }
1500                        }
1501                        Some('^') if self.dialect.supports_geometric_types() => {
1502                            self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
1503                        }
1504                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
1505                        _ => self.start_binop(chars, "<", Token::Lt),
1506                    }
1507                }
1508                '>' => {
1509                    chars.next(); // consume
1510                    match chars.peek() {
1511                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
1512                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
1513                        Some('^') if self.dialect.supports_geometric_types() => {
1514                            self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
1515                        }
1516                        _ => self.start_binop(chars, ">", Token::Gt),
1517                    }
1518                }
1519                ':' => {
1520                    chars.next();
1521                    match chars.peek() {
1522                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
1523                        Some('=') => self.consume_and_return(chars, Token::Assignment),
1524                        _ => Ok(Some(Token::Colon)),
1525                    }
1526                }
1527                ';' => self.consume_and_return(chars, Token::SemiColon),
1528                '\\' => self.consume_and_return(chars, Token::Backslash),
1529                '[' => self.consume_and_return(chars, Token::LBracket),
1530                ']' => self.consume_and_return(chars, Token::RBracket),
1531                '&' => {
1532                    chars.next(); // consume the '&'
1533                    match chars.peek() {
1534                        Some('>') if self.dialect.supports_geometric_types() => {
1535                            chars.next(); // consume the '>'
1536                            self.start_binop(chars, "&>", Token::AmpersandRightAngleBracket)
1537                        }
1538                        Some('<') if self.dialect.supports_geometric_types() => {
1539                            chars.next(); // consume
1540                            match chars.peek() {
1541                                Some('|') => self.consume_and_return(
1542                                    chars,
1543                                    Token::AmpersandLeftAngleBracketVerticalBar,
1544                                ),
1545                                _ => {
1546                                    self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
1547                                }
1548                            }
1549                        }
1550                        Some('&') => {
1551                            chars.next(); // consume the second '&'
1552                            self.start_binop(chars, "&&", Token::Overlap)
1553                        }
1554                        // Bitwise '&' operator
1555                        _ => self.start_binop(chars, "&", Token::Ampersand),
1556                    }
1557                }
1558                '^' => {
1559                    chars.next(); // consume the '^'
1560                    match chars.peek() {
1561                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
1562                        _ => Ok(Some(Token::Caret)),
1563                    }
1564                }
1565                '{' => self.consume_and_return(chars, Token::LBrace),
1566                '}' => self.consume_and_return(chars, Token::RBrace),
1567                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
1568                {
1569                    chars.next(); // consume the '#', starting a single-line comment in these dialects
1570                    let comment = self.tokenize_single_line_comment(chars);
1571                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1572                        prefix: "#".to_owned(),
1573                        comment,
1574                    })))
1575                }
1576                '~' => {
1577                    chars.next(); // consume
1578                    match chars.peek() {
1579                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
1580                        Some('=') if self.dialect.supports_geometric_types() => {
1581                            self.consume_for_binop(chars, "~=", Token::TildeEqual)
1582                        }
1583                        Some('~') => {
1584                            chars.next();
1585                            match chars.peek() {
1586                                Some('*') => {
1587                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
1588                                }
1589                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
1590                            }
1591                        }
1592                        _ => self.start_binop(chars, "~", Token::Tilde),
1593                    }
1594                }
1595                '#' => {
1596                    chars.next();
1597                    match chars.peek() {
1598                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
1599                        Some('>') => {
1600                            chars.next();
1601                            match chars.peek() {
1602                                Some('>') => {
1603                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
1604                                }
1605                                _ => self.start_binop(chars, "#>", Token::HashArrow),
1606                            }
1607                        }
1608                        Some(' ') => Ok(Some(Token::Sharp)),
1609                        Some('#') if self.dialect.supports_geometric_types() => {
1610                            self.consume_for_binop(chars, "##", Token::DoubleSharp)
1611                        }
1612                        Some(sch) if self.dialect.is_identifier_start('#') => {
1613                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1614                        }
1615                        _ => self.start_binop(chars, "#", Token::Sharp),
1616                    }
1617                }
1618                '@' => {
1619                    chars.next();
1620                    match chars.peek() {
1621                        Some('@') if self.dialect.supports_geometric_types() => {
1622                            self.consume_and_return(chars, Token::AtAt)
1623                        }
1624                        Some('-') if self.dialect.supports_geometric_types() => {
1625                            chars.next();
1626                            match chars.peek() {
1627                                Some('@') => self.consume_and_return(chars, Token::AtDashAt),
1628                                _ => self.start_binop_opt(chars, "@-", None),
1629                            }
1630                        }
1631                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
1632                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
1633                        Some('@') => {
1634                            chars.next();
1635                            match chars.peek() {
1636                                Some(' ') => Ok(Some(Token::AtAt)),
1637                                Some(tch) if self.dialect.is_identifier_start('@') => {
1638                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
1639                                }
1640                                _ => Ok(Some(Token::AtAt)),
1641                            }
1642                        }
1643                        Some(' ') => Ok(Some(Token::AtSign)),
1644                        // We break on quotes here, because no dialect allows identifiers starting
1645                        // with @ and containing quotation marks (e.g. `@'foo'`) unless they are
1646                        // quoted, which is tokenized as a quoted string, not here (e.g.
1647                        // `"@'foo'"`). Further, at least two dialects parse `@` followed by a
1648                        // quoted string as two separate tokens, which this allows. For example,
1649                        // Postgres parses `@'1'` as the absolute value of '1' which is implicitly
1650                        // cast to a numeric type. And when parsing MySQL-style grantees (e.g.
1651                        // `GRANT ALL ON *.* to 'root'@'localhost'`), we also want separate tokens
1652                        // for the user, the `@`, and the host.
1653                        Some('\'') => Ok(Some(Token::AtSign)),
1654                        Some('\"') => Ok(Some(Token::AtSign)),
1655                        Some('`') => Ok(Some(Token::AtSign)),
1656                        Some(sch) if self.dialect.is_identifier_start('@') => {
1657                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1658                        }
1659                        _ => Ok(Some(Token::AtSign)),
1660                    }
1661                }
1662                // Postgres uses ? for jsonb operators, not prepared statements
1663                '?' if self.dialect.supports_geometric_types() => {
1664                    chars.next(); // consume
1665                    match chars.peek() {
1666                        Some('|') => {
1667                            chars.next();
1668                            match chars.peek() {
1669                                Some('|') => self.consume_and_return(
1670                                    chars,
1671                                    Token::QuestionMarkDoubleVerticalBar,
1672                                ),
1673                                _ => Ok(Some(Token::QuestionPipe)),
1674                            }
1675                        }
1676
1677                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
1678                        Some('-') => {
1679                            chars.next(); // consume
1680                            match chars.peek() {
1681                                Some('|') => self
1682                                    .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
1683                                _ => Ok(Some(Token::QuestionMarkDash)),
1684                            }
1685                        }
1686                        Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
1687                        _ => Ok(Some(Token::Question)),
1688                    }
1689                }
1690                '?' => {
1691                    chars.next();
1692                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
1693                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
1694                }
1695
1696                // identifier or keyword
1697                ch if self.dialect.is_identifier_start(ch) => {
1698                    self.tokenize_identifier_or_keyword([ch], chars)
1699                }
1700                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
1701
1702                // whitespace check (including unicode chars) should be last as it covers some of the chars above
1703                ch if ch.is_whitespace() => {
1704                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
1705                }
1706                other => self.consume_and_return(chars, Token::Char(other)),
1707            },
1708            None => Ok(None),
1709        }
1710    }
1711
1712    /// Consume the next character, then parse a custom binary operator. The consumed character must already be included in `prefix`.
1713    fn consume_for_binop(
1714        &self,
1715        chars: &mut State,
1716        prefix: &str,
1717        default: Token,
1718    ) -> Result<Option<Token>, TokenizerError> {
1719        chars.next(); // consume the first char
1720        self.start_binop_opt(chars, prefix, Some(default))
1721    }
1722
1723    /// Parse a custom binary operator, falling back to `default` when no custom operator follows.
1724    fn start_binop(
1725        &self,
1726        chars: &mut State,
1727        prefix: &str,
1728        default: Token,
1729    ) -> Result<Option<Token>, TokenizerError> {
1730        self.start_binop_opt(chars, prefix, Some(default))
1731    }
1732
1733    /// Parse a custom binary operator. If no custom operator follows and `default` is `None`, report a tokenizer error.
1734    fn start_binop_opt(
1735        &self,
1736        chars: &mut State,
1737        prefix: &str,
1738        default: Option<Token>,
1739    ) -> Result<Option<Token>, TokenizerError> {
1740        let mut custom = None;
1741        while let Some(&ch) = chars.peek() {
1742            if !self.dialect.is_custom_operator_part(ch) {
1743                break;
1744            }
1745
1746            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
1747            chars.next();
1748        }
1749        match (custom, default) {
1750            (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
1751            (None, Some(tok)) => Ok(Some(tok)),
1752            (None, None) => self.tokenizer_error(
1753                chars.location(),
1754                format!("Expected a valid binary operator after '{prefix}'"),
1755            ),
1756        }
1757    }
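
    // Illustration of the custom-operator fallback above (a sketch, assuming a
    // dialect whose `is_custom_operator_part` accepts '>'): for `SELECT 1 <@> 2`,
    // the tokenizer first consumes `<@` (the `ArrowAt` prefix), then the loop
    // above sees '>' and extends it, yielding Token::CustomBinaryOperator("<@>")
    // instead of the default token.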
1758
1759    /// Tokenize dollar preceded value (i.e: a string/placeholder)
1760    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
1761        let mut s = String::new();
1762        let mut value = String::new();
1763
1764        chars.next(); // consume the initial '$'
1765
1766        // `$$` starts a dollar-quoted string, unless the dialect treats `$` as a placeholder prefix.
1767        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
1768            chars.next();
1769
1770            let mut is_terminated = false;
1771            let mut prev: Option<char> = None;
1772
1773            while let Some(&ch) = chars.peek() {
1774                if prev == Some('$') {
1775                    if ch == '$' {
1776                        chars.next();
1777                        is_terminated = true;
1778                        break;
1779                    } else {
1780                        s.push('$');
1781                        s.push(ch);
1782                    }
1783                } else if ch != '$' {
1784                    s.push(ch);
1785                }
1786
1787                prev = Some(ch);
1788                chars.next();
1789            }
1790
1791            return if chars.peek().is_none() && !is_terminated {
1792                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
1793            } else {
1794                Ok(Token::DollarQuotedString(DollarQuotedString {
1795                    value: s,
1796                    tag: None,
1797                }))
1798            };
1799        } else {
1800            value.push_str(&peeking_take_while(chars, |ch| {
1801                ch.is_alphanumeric()
1802                    || ch == '_'
1803                    // Allow $ as a placeholder character if the dialect supports it
1804                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
1805            }));
1806
1807            // If the dialect treats `$` as a placeholder prefix instead, don't look for an end delimiter.
1808            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
1809                chars.next();
1810
1811                let mut temp = String::new();
1812                let end_delimiter = format!("${value}$");
1813
1814                loop {
1815                    match chars.next() {
1816                        Some(ch) => {
1817                            temp.push(ch);
1818
1819                            if temp.ends_with(&end_delimiter) {
1820                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
1821                                    s.push_str(temp);
1822                                }
1823                                break;
1824                            }
1825                        }
1826                        None => {
1827                            if temp.ends_with(&end_delimiter) {
1828                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
1829                                    s.push_str(temp);
1830                                }
1831                                break;
1832                            }
1833
1834                            return self.tokenizer_error(
1835                                chars.location(),
1836                                "Unterminated dollar-quoted string, expected $",
1837                            );
1838                        }
1839                    }
1840                }
1841            } else {
1842                return Ok(Token::Placeholder(String::from("$") + &value));
1843            }
1844        }
1845
1846        Ok(Token::DollarQuotedString(DollarQuotedString {
1847            value: s,
1848            tag: if value.is_empty() { None } else { Some(value) },
1849        }))
1850    }
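
    // The three outcomes above, by example (illustrative): `$$abc$$` yields
    // DollarQuotedString { value: "abc", tag: None }; `$tag$abc$tag$` yields
    // DollarQuotedString { value: "abc", tag: Some("tag") }; and `$1` (or
    // `$name$` under a dollar-placeholder dialect) yields a Token::Placeholder.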
1851
1852    fn tokenizer_error<R>(
1853        &self,
1854        loc: Location,
1855        message: impl Into<String>,
1856    ) -> Result<R, TokenizerError> {
1857        Err(TokenizerError {
1858            message: message.into(),
1859            location: loc,
1860        })
1861    }
1862
1863    // Consume characters until newline
1864    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
1865        let mut comment = peeking_take_while(chars, |ch| match ch {
1866            '\n' => false,                                           // Always stop at \n
1867            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres
1868            _ => true, // Keep consuming for other characters
1869        });
1870
1871        if let Some(ch) = chars.next() {
1872            assert!(ch == '\n' || ch == '\r');
1873            comment.push(ch);
1874        }
1875
1876        comment
1877    }
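
    // For example, `-- hi\nSELECT ...` produces a single-line comment whose text
    // is " hi\n": the terminating newline is consumed and kept as part of the
    // comment, while a comment that runs to EOF has no trailing newline.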
1878
1879    /// Tokenize an identifier or keyword, after the first char is already consumed.
1880    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
1881        let mut s = first_chars.into();
1882        s.push_str(&peeking_take_while(chars, |ch| {
1883            self.dialect.is_identifier_part(ch)
1884        }));
1885        s
1886    }
1887
1888    /// Read a quoted identifier
1889    fn tokenize_quoted_identifier(
1890        &self,
1891        quote_start: char,
1892        chars: &mut State,
1893    ) -> Result<String, TokenizerError> {
1894        let error_loc = chars.location();
1895        chars.next(); // consume the opening quote
1896        let quote_end = Word::matching_end_quote(quote_start);
1897        let (s, last_char) = self.parse_quoted_ident(chars, quote_end);
1898
1899        if last_char == Some(quote_end) {
1900            Ok(s)
1901        } else {
1902            self.tokenizer_error(
1903                error_loc,
1904                format!("Expected close delimiter '{quote_end}' before EOF."),
1905            )
1906        }
1907    }
1908
1909    /// Read a single quoted string, starting with the opening quote.
1910    fn tokenize_escaped_single_quoted_string(
1911        &self,
1912        starting_loc: Location,
1913        chars: &mut State,
1914    ) -> Result<String, TokenizerError> {
1915        if let Some(s) = unescape_single_quoted_string(chars) {
1916            return Ok(s);
1917        }
1918
1919        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
1920    }
1921
1922    /// Reads a string literal quoted by one or three quote characters.
1923    /// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
1924    fn tokenize_single_or_triple_quoted_string<F>(
1925        &self,
1926        chars: &mut State,
1927        quote_style: char,
1928        backslash_escape: bool,
1929        single_quote_token: F,
1930        triple_quote_token: F,
1931    ) -> Result<Option<Token>, TokenizerError>
1932    where
1933        F: Fn(String) -> Token,
1934    {
1935        let error_loc = chars.location();
1936
1937        let mut num_opening_quotes = 0u8;
1938        for _ in 0..3 {
1939            if Some(&quote_style) == chars.peek() {
1940                chars.next(); // Consume quote.
1941                num_opening_quotes += 1;
1942            } else {
1943                break;
1944            }
1945        }
1946
1947        let (token_fn, num_quote_chars) = match num_opening_quotes {
1948            1 => (single_quote_token, NumStringQuoteChars::One),
1949            2 => {
1950                // If we matched exactly two quotes, this is an empty string.
1951                return Ok(Some(single_quote_token("".into())));
1952            }
1953            3 => {
1954                let Some(num_quote_chars) = NonZeroU8::new(3) else {
1955                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
1956                };
1957                (
1958                    triple_quote_token,
1959                    NumStringQuoteChars::Many(num_quote_chars),
1960                )
1961            }
1962            _ => {
1963                return self.tokenizer_error(error_loc, "invalid string literal opening");
1964            }
1965        };
1966
1967        let settings = TokenizeQuotedStringSettings {
1968            quote_style,
1969            num_quote_chars,
1970            num_opening_quotes_to_consume: 0,
1971            backslash_escape,
1972        };
1973
1974        self.tokenize_quoted_string(chars, settings)
1975            .map(token_fn)
1976            .map(Some)
1977    }
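
    // Quote-counting examples (illustrative): `'abc'` matches one opening quote
    // and tokenizes normally; `''` matches exactly two and returns an empty
    // single-quoted token immediately; `'''abc'''` matches three and is read as
    // a triple-quoted string.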
1978
1979    /// Reads a string literal quoted by a single quote character.
1980    fn tokenize_single_quoted_string(
1981        &self,
1982        chars: &mut State,
1983        quote_style: char,
1984        backslash_escape: bool,
1985    ) -> Result<String, TokenizerError> {
1986        self.tokenize_quoted_string(
1987            chars,
1988            TokenizeQuotedStringSettings {
1989                quote_style,
1990                num_quote_chars: NumStringQuoteChars::One,
1991                num_opening_quotes_to_consume: 1,
1992                backslash_escape,
1993            },
1994        )
1995    }
1996
1997    /// Read a quoted string.
1998    fn tokenize_quoted_string(
1999        &self,
2000        chars: &mut State,
2001        settings: TokenizeQuotedStringSettings,
2002    ) -> Result<String, TokenizerError> {
2003        let mut s = String::new();
2004        let error_loc = chars.location();
2005
2006        // Consume any opening quotes.
2007        for _ in 0..settings.num_opening_quotes_to_consume {
2008            if Some(settings.quote_style) != chars.next() {
2009                return self.tokenizer_error(error_loc, "invalid string literal opening");
2010            }
2011        }
2012
2013        let mut num_consecutive_quotes = 0;
2014        while let Some(&ch) = chars.peek() {
2015            let pending_final_quote = match settings.num_quote_chars {
2016                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
2017                n @ NumStringQuoteChars::Many(count)
2018                    if num_consecutive_quotes + 1 == count.get() =>
2019                {
2020                    Some(n)
2021                }
2022                NumStringQuoteChars::Many(_) => None,
2023            };
2024
2025            match ch {
2026                char if char == settings.quote_style && pending_final_quote.is_some() => {
2027                    chars.next(); // consume
2028
2029                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
2030                        // For an initial string like `"""abc"""`, at this point we have
2031                        // `abc""` in the buffer and have now matched the final `"`.
2032                        // However, the string to return is simply `abc`, so we strip off
2033                        // the trailing quotes before returning.
2034                        let mut buf = s.chars();
2035                        for _ in 1..count.get() {
2036                            buf.next_back();
2037                        }
2038                        return Ok(buf.as_str().to_string());
2039                    } else if chars
2040                        .peek()
2041                        .map(|c| *c == settings.quote_style)
2042                        .unwrap_or(false)
2043                    {
2044                        s.push(ch);
2045                        if !self.unescape {
2046                            // In no-escape mode, the given query has to be saved completely
2047                            s.push(ch);
2048                        }
2049                        chars.next();
2050                    } else {
2051                        return Ok(s);
2052                    }
2053                }
2054                '\\' if settings.backslash_escape => {
2055                    // consume backslash
2056                    chars.next();
2057
2058                    num_consecutive_quotes = 0;
2059
2060                    if let Some(next) = chars.peek() {
2061                        if !self.unescape
2062                            || (self.dialect.ignores_wildcard_escapes()
2063                                && (*next == '%' || *next == '_'))
2064                        {
2065                            // In no-escape mode, the given query has to be saved completely
2066                            // including backslashes. Similarly, when the dialect ignores
2067                            // LIKE wildcard escapes, the backslash is not stripped.
2068                            s.push(ch);
2069                            s.push(*next);
2070                            chars.next(); // consume next
2071                        } else if *next == 'u' {
2072                            // Unicode escape \uXXXX (4 hex digits)
2073                            chars.next(); // consume 'u'
2074                            let hex: String = (0..4)
2075                                .filter_map(|_| chars.next())
2076                                .collect();
2077                            if hex.len() == 4 {
2078                                if let Ok(code_point) = u32::from_str_radix(&hex, 16) {
2079                                    if let Some(unicode_char) = char::from_u32(code_point) {
2080                                        s.push(unicode_char);
2081                                    } else {
2082                                        s.push_str("\\u");
2083                                        s.push_str(&hex);
2084                                    }
2085                                } else {
2086                                    s.push_str("\\u");
2087                                    s.push_str(&hex);
2088                                }
2089                            } else {
2090                                s.push_str("\\u");
2091                                s.push_str(&hex);
2092                            }
2093                        } else if *next == 'U' {
2094                            // Unicode escape \UXXXXXXXX (8 hex digits)
2095                            chars.next(); // consume 'U'
2096                            let hex: String = (0..8)
2097                                .filter_map(|_| chars.next())
2098                                .collect();
2099                            if hex.len() == 8 {
2100                                if let Ok(code_point) = u32::from_str_radix(&hex, 16) {
2101                                    if let Some(unicode_char) = char::from_u32(code_point) {
2102                                        s.push(unicode_char);
2103                                    } else {
2104                                        s.push_str("\\U");
2105                                        s.push_str(&hex);
2106                                    }
2107                                } else {
2108                                    s.push_str("\\U");
2109                                    s.push_str(&hex);
2110                                }
2111                            } else {
2112                                s.push_str("\\U");
2113                                s.push_str(&hex);
2114                            }
2115                        } else {
2116                            let n = match next {
2117                                '0' => '\0',
2118                                'a' => '\u{7}',
2119                                'b' => '\u{8}',
2120                                'f' => '\u{c}',
2121                                'n' => '\n',
2122                                'r' => '\r',
2123                                't' => '\t',
2124                                'Z' => '\u{1a}',
2125                                _ => *next,
2126                            };
2127                            s.push(n);
2128                            chars.next(); // consume next
2129                        }
2130                    }
2131                }
2132                ch => {
2133                    chars.next(); // consume ch
2134
2135                    if ch == settings.quote_style {
2136                        num_consecutive_quotes += 1;
2137                    } else {
2138                        num_consecutive_quotes = 0;
2139                    }
2140
2141                    s.push(ch);
2142                }
2143            }
2144        }
2145        self.tokenizer_error(error_loc, "Unterminated string literal")
2146    }
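
    // Escaped-quote example for the loop above (illustrative): with `unescape`
    // enabled, the remainder of `'It''s'` yields the string `It's`; in no-escape
    // mode the doubled quote is preserved verbatim as `It''s`.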
2147
2148    fn tokenize_multiline_comment(
2149        &self,
2150        chars: &mut State,
2151    ) -> Result<Option<Token>, TokenizerError> {
2152        let mut s = String::new();
2153        let mut nested = 1;
2154        let supports_nested_comments = self.dialect.supports_nested_comments();
2155
2156        loop {
2157            match chars.next() {
2158                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
2159                    chars.next(); // consume the '*'
2160                    s.push('/');
2161                    s.push('*');
2162                    nested += 1;
2163                }
2164                Some('*') if matches!(chars.peek(), Some('/')) => {
2165                    chars.next(); // consume the '/'
2166                    nested -= 1;
2167                    if nested == 0 {
2168                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
2169                    }
2170                    s.push('*');
2171                    s.push('/');
2172                }
2173                Some(ch) => {
2174                    s.push(ch);
2175                }
2176                None => {
2177                    break self.tokenizer_error(
2178                        chars.location(),
2179                        "Unexpected EOF while in a multi-line comment",
2180                    );
2181                }
2182            }
2183        }
2184    }
2185
2186    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
2187        let mut last_char = None;
2188        let mut s = String::new();
2189        while let Some(ch) = chars.next() {
2190            if ch == quote_end {
2191                if chars.peek() == Some(&quote_end) {
2192                    chars.next();
2193                    s.push(ch);
2194                    if !self.unescape {
2195                        // In no-escape mode, the given query has to be saved completely
2196                        s.push(ch);
2197                    }
2198                } else {
2199                    last_char = Some(quote_end);
2200                    break;
2201                }
2202            } else {
2203                s.push(ch);
2204            }
2205        }
2206        (s, last_char)
2207    }
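
    // For example, `"a""b"` unescapes to the identifier `a"b` (the doubled quote
    // collapses to one); with `unescape` disabled it is kept as `a""b`. If input
    // ends before the closing quote, `last_char` is None and the caller reports
    // an "expected close delimiter" error.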
2208
2209    #[allow(clippy::unnecessary_wraps)]
2210    fn consume_and_return(
2211        &self,
2212        chars: &mut State,
2213        t: Token,
2214    ) -> Result<Option<Token>, TokenizerError> {
2215        chars.next();
2216        Ok(Some(t))
2217    }
2218}
2219
2220/// Read from `chars` until `predicate` returns `false` or EOF is hit.
2221/// Return the characters read as String, and keep the first non-matching
2222/// char available as `chars.next()`.
2223fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
2224    let mut s = String::new();
2225    while let Some(&ch) = chars.peek() {
2226        if predicate(ch) {
2227            chars.next(); // consume
2228            s.push(ch);
2229        } else {
2230            break;
2231        }
2232    }
2233    s
2234}
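
// Usage sketch: on input `123abc`, `peeking_take_while(chars, |c| c.is_ascii_digit())`
// returns "123" and leaves 'a' as the next peekable character.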
2235
2236/// Same as peeking_take_while, but also passes the next character to the predicate.
2237fn peeking_next_take_while(
2238    chars: &mut State,
2239    mut predicate: impl FnMut(char, Option<char>) -> bool,
2240) -> String {
2241    let mut s = String::new();
2242    while let Some(&ch) = chars.peek() {
2243        let next_char = chars.peekable.clone().nth(1);
2244        if predicate(ch, next_char) {
2245            chars.next(); // consume
2246            s.push(ch);
2247        } else {
2248            break;
2249        }
2250    }
2251    s
2252}
2253
2254fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
2255    Unescape::new(chars).unescape()
2256}
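
// For example (illustrative), `'a\nb'` with a literal backslash-n unescapes to
// "a", a newline, then "b", and `'it''s'` unescapes to `it's`.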
2257
2258struct Unescape<'a: 'b, 'b> {
2259    chars: &'b mut State<'a>,
2260}
2261
2262impl<'a: 'b, 'b> Unescape<'a, 'b> {
2263    fn new(chars: &'b mut State<'a>) -> Self {
2264        Self { chars }
2265    }
2266    fn unescape(mut self) -> Option<String> {
2267        let mut unescaped = String::new();
2268
2269        self.chars.next(); // consume the opening quote
2270
2271        while let Some(c) = self.chars.next() {
2272            if c == '\'' {
2273                // case: ''''
2274                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
2275                    self.chars.next();
2276                    unescaped.push('\'');
2277                    continue;
2278                }
2279                return Some(unescaped);
2280            }
2281
2282            if c != '\\' {
2283                unescaped.push(c);
2284                continue;
2285            }
2286
2287            let c = match self.chars.next()? {
2288                'b' => '\u{0008}',
2289                'f' => '\u{000C}',
2290                'n' => '\n',
2291                'r' => '\r',
2292                't' => '\t',
2293                'u' => self.unescape_unicode_16()?,
2294                'U' => self.unescape_unicode_32()?,
2295                'x' => self.unescape_hex()?,
2296                c if c.is_digit(8) => self.unescape_octal(c)?,
2297                c => c,
2298            };
2299
2300            unescaped.push(Self::check_null(c)?);
2301        }
2302
2303        None
2304    }
2305
2306    #[inline]
2307    fn check_null(c: char) -> Option<char> {
2308        if c == '\0' {
2309            None
2310        } else {
2311            Some(c)
2312        }
2313    }
2314
2315    #[inline]
2316    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
2317        // u32 is used here because Pg wraps the value on overflow rather than raising an error.
2318        match u32::from_str_radix(s, RADIX) {
2319            Err(_) => None,
2320            Ok(n) => {
2321                let n = n & 0xFF;
2322                if n <= 127 {
2323                    char::from_u32(n)
2324                } else {
2325                    None
2326                }
2327            }
2328        }
2329    }
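
    // E.g. `byte_to_char::<16>("41")` is Some('A'), while "ff" (0xFF > 127)
    // yields None: as in Pg, only ASCII results are accepted here.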
2330
2331    // Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
2332    fn unescape_hex(&mut self) -> Option<char> {
2333        let mut s = String::new();
2334
2335        for _ in 0..2 {
2336            match self.next_hex_digit() {
2337                Some(c) => s.push(c),
2338                None => break,
2339            }
2340        }
2341
2342        if s.is_empty() {
2343            return Some('x');
2344        }
2345
2346        Self::byte_to_char::<16>(&s)
2347    }
2348
2349    #[inline]
2350    fn next_hex_digit(&mut self) -> Option<char> {
2351        match self.chars.peek() {
2352            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
2353            _ => None,
2354        }
2355    }
2356
2357    // Octal byte value. \o, \oo, \ooo (o = 0–7)
2358    fn unescape_octal(&mut self, c: char) -> Option<char> {
2359        let mut s = String::new();
2360
2361        s.push(c);
2362        for _ in 0..2 {
2363            match self.next_octal_digit() {
2364                Some(c) => s.push(c),
2365                None => break,
2366            }
2367        }
2368
2369        Self::byte_to_char::<8>(&s)
2370    }
2371
2372    #[inline]
2373    fn next_octal_digit(&mut self) -> Option<char> {
2374        match self.chars.peek() {
2375            Some(c) if c.is_digit(8) => self.chars.next(),
2376            _ => None,
2377        }
2378    }
2379
2380    // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
2381    fn unescape_unicode_16(&mut self) -> Option<char> {
2382        self.unescape_unicode::<4>()
2383    }
2384
2385    // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
2386    fn unescape_unicode_32(&mut self) -> Option<char> {
2387        self.unescape_unicode::<8>()
2388    }
2389
2390    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
2391        let mut s = String::new();
2392        for _ in 0..NUM {
2393            s.push(self.chars.next()?);
2394        }
2395        match u32::from_str_radix(&s, 16) {
2396            Err(_) => None,
2397            Ok(n) => char::from_u32(n),
2398        }
2399    }
2400}
2401
2402fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
2403    let mut unescaped = String::new();
2404    chars.next(); // consume the opening quote
2405    while let Some(c) = chars.next() {
2406        match c {
2407            '\'' => {
2408                if chars.peek() == Some(&'\'') {
2409                    chars.next();
2410                    unescaped.push('\'');
2411                } else {
2412                    return Ok(unescaped);
2413                }
2414            }
2415            '\\' => match chars.peek() {
2416                Some('\\') => {
2417                    chars.next();
2418                    unescaped.push('\\');
2419                }
2420                Some('+') => {
2421                    chars.next();
2422                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
2423                }
2424                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
2425            },
2426            _ => {
2427                unescaped.push(c);
2428            }
2429        }
2430    }
2431    Err(TokenizerError {
2432        message: "Unterminated unicode encoded string literal".to_string(),
2433        location: chars.location(),
2434    })
2435}
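
// Example (PostgreSQL-style Unicode strings): the body of U&'d\0061t\+000061'
// unescapes to "data" -- `\0061` consumes four hex digits and `\+000061` six.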
2436
2437fn take_char_from_hex_digits(
2438    chars: &mut State<'_>,
2439    max_digits: usize,
2440) -> Result<char, TokenizerError> {
2441    let mut result = 0u32;
2442    for _ in 0..max_digits {
2443        let next_char = chars.next().ok_or_else(|| TokenizerError {
2444            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
2445                .to_string(),
2446            location: chars.location(),
2447        })?;
2448        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
2449            message: format!("Invalid hex digit in escaped unicode string: {next_char}"),
2450            location: chars.location(),
2451        })?;
2452        result = result * 16 + digit;
2453    }
2454    char::from_u32(result).ok_or_else(|| TokenizerError {
2455        message: format!("Invalid unicode character: {result:x}"),
2456        location: chars.location(),
2457    })
2458}
2459
2460#[cfg(test)]
2461mod tests {
2462    use super::*;
2463    use crate::dialect::{
2464        BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
2465    };
2466    use crate::test_utils::all_dialects_where;
2467    use core::fmt::Debug;
2468
2469    #[test]
2470    fn tokenizer_error_impl() {
2471        let err = TokenizerError {
2472            message: "test".into(),
2473            location: Location { line: 1, column: 1 },
2474        };
2475        #[cfg(feature = "std")]
2476        {
2477            use std::error::Error;
2478            assert!(err.source().is_none());
2479        }
2480        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
2481    }
2482
2483    #[test]
2484    fn tokenize_select_1() {
2485        let sql = String::from("SELECT 1");
2486        let dialect = GenericDialect {};
2487        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2488
2489        let expected = vec![
2490            Token::make_keyword("SELECT"),
2491            Token::Whitespace(Whitespace::Space),
2492            Token::Number(String::from("1"), false),
2493        ];
2494
2495        compare(expected, tokens);
2496    }
2497
2498    #[test]
2499    fn tokenize_select_float() {
2500        let sql = String::from("SELECT .1");
2501        let dialect = GenericDialect {};
2502        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2503
2504        let expected = vec![
2505            Token::make_keyword("SELECT"),
2506            Token::Whitespace(Whitespace::Space),
2507            Token::Number(String::from(".1"), false),
2508        ];
2509
2510        compare(expected, tokens);
2511    }
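
    // An additional check (a sketch grounded in the `0x` branch of the number
    // tokenizer): digits after a `0x` prefix are collected into a hex literal.
    #[test]
    fn tokenize_hex_prefixed_literal() {
        let sql = String::from("SELECT 0x1f");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::HexStringLiteral(String::from("1f")),
        ];

        compare(expected, tokens);
    }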
2512
2513    #[test]
2514    fn tokenize_clickhouse_double_equal() {
2515        let sql = String::from("SELECT foo=='1'");
2516        let dialect = ClickHouseDialect {};
2517        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2518        let tokens = tokenizer.tokenize().unwrap();
2519
2520        let expected = vec![
2521            Token::make_keyword("SELECT"),
2522            Token::Whitespace(Whitespace::Space),
2523            Token::Word(Word {
2524                value: "foo".to_string(),
2525                quote_style: None,
2526                keyword: Keyword::NoKeyword,
2527            }),
2528            Token::DoubleEq,
2529            Token::SingleQuotedString("1".to_string()),
2530        ];
2531
2532        compare(expected, tokens);
2533    }
2534
2535    #[test]
2536    fn tokenize_numeric_literal_underscore() {
2537        let dialect = GenericDialect {};
2538        let sql = String::from("SELECT 10_000");
2539        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2540        let tokens = tokenizer.tokenize().unwrap();
2541        let expected = vec![
2542            Token::make_keyword("SELECT"),
2543            Token::Whitespace(Whitespace::Space),
2544            Token::Number("10".to_string(), false),
2545            Token::make_word("_000", None),
2546        ];
2547        compare(expected, tokens);
2548
2549        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
2550            "SELECT 10_000, _10_000, 10_00_, 10___0",
2551            vec![
2552                Token::make_keyword("SELECT"),
2553                Token::Whitespace(Whitespace::Space),
2554                Token::Number("10_000".to_string(), false),
2555                Token::Comma,
2556                Token::Whitespace(Whitespace::Space),
2557                Token::make_word("_10_000", None), // a leading underscore turns the whole token into a word (parsed as a column identifier)
2558                Token::Comma,
2559                Token::Whitespace(Whitespace::Space),
2560                Token::Number("10_00".to_string(), false),
2561                Token::make_word("_", None), // a trailing underscore is excluded from the number and tokenizes as a word (a syntax error in some dialects)
2562                Token::Comma,
2563                Token::Whitespace(Whitespace::Space),
2564                Token::Number("10".to_string(), false),
2565                Token::make_word("___0", None), // consecutive underscores end the number and the remainder tokenizes as a word (a syntax error in some dialects)
2566            ],
2567        );
2568    }
2569
2570    #[test]
2571    fn tokenize_select_exponent() {
2572        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
2573        let dialect = GenericDialect {};
2574        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2575
2576        let expected = vec![
2577            Token::make_keyword("SELECT"),
2578            Token::Whitespace(Whitespace::Space),
2579            Token::Number(String::from("1e10"), false),
2580            Token::Comma,
2581            Token::Whitespace(Whitespace::Space),
2582            Token::Number(String::from("1e-10"), false),
2583            Token::Comma,
2584            Token::Whitespace(Whitespace::Space),
2585            Token::Number(String::from("1e+10"), false),
2586            Token::Comma,
2587            Token::Whitespace(Whitespace::Space),
2588            Token::Number(String::from("1"), false),
2589            Token::make_word("ea", None),
2590            Token::Comma,
2591            Token::Whitespace(Whitespace::Space),
2592            Token::Number(String::from("1e-10"), false),
2593            Token::make_word("a", None),
2594            Token::Comma,
2595            Token::Whitespace(Whitespace::Space),
2596            Token::Number(String::from("1e-10"), false),
2597            Token::Minus,
2598            Token::Number(String::from("10"), false),
2599        ];
2600
2601        compare(expected, tokens);
2602    }
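
    // Sketch assuming the exponent marker is case-insensitive and kept verbatim
    // in the token text, mirroring the lowercase cases above.
    #[test]
    fn tokenize_select_exponent_uppercase() {
        let sql = String::from("SELECT 1E10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1E10"), false),
        ];

        compare(expected, tokens);
    }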
2603
2604    #[test]
2605    fn tokenize_scalar_function() {
2606        let sql = String::from("SELECT sqrt(1)");
2607        let dialect = GenericDialect {};
2608        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2609
2610        let expected = vec![
2611            Token::make_keyword("SELECT"),
2612            Token::Whitespace(Whitespace::Space),
2613            Token::make_word("sqrt", None),
2614            Token::LParen,
2615            Token::Number(String::from("1"), false),
2616            Token::RParen,
2617        ];
2618
2619        compare(expected, tokens);
2620    }
2621
2622    #[test]
2623    fn tokenize_string_string_concat() {
2624        let sql = String::from("SELECT 'a' || 'b'");
2625        let dialect = GenericDialect {};
2626        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2627
2628        let expected = vec![
2629            Token::make_keyword("SELECT"),
2630            Token::Whitespace(Whitespace::Space),
2631            Token::SingleQuotedString(String::from("a")),
2632            Token::Whitespace(Whitespace::Space),
2633            Token::StringConcat,
2634            Token::Whitespace(Whitespace::Space),
2635            Token::SingleQuotedString(String::from("b")),
2636        ];
2637
2638        compare(expected, tokens);
2639    }

2640    #[test]
2641    fn tokenize_bitwise_op() {
2642        let sql = String::from("SELECT one | two ^ three");
2643        let dialect = GenericDialect {};
2644        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2645
2646        let expected = vec![
2647            Token::make_keyword("SELECT"),
2648            Token::Whitespace(Whitespace::Space),
2649            Token::make_word("one", None),
2650            Token::Whitespace(Whitespace::Space),
2651            Token::Pipe,
2652            Token::Whitespace(Whitespace::Space),
2653            Token::make_word("two", None),
2654            Token::Whitespace(Whitespace::Space),
2655            Token::Caret,
2656            Token::Whitespace(Whitespace::Space),
2657            Token::make_word("three", None),
2658        ];
2659        compare(expected, tokens);
2660    }
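
    // Companion sketch for bitwise AND; assumes `&` surfaces as Token::Ampersand,
    // in line with the Pipe and Caret handling above.
    #[test]
    fn tokenize_bitwise_and() {
        let sql = String::from("SELECT one & two");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Ampersand,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
        ];
        compare(expected, tokens);
    }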
2661
2662    #[test]
2663    fn tokenize_logical_xor() {
2664        let sql =
2665            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
2666        let dialect = GenericDialect {};
2667        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2668
2669        let expected = vec![
2670            Token::make_keyword("SELECT"),
2671            Token::Whitespace(Whitespace::Space),
2672            Token::make_keyword("true"),
2673            Token::Whitespace(Whitespace::Space),
2674            Token::make_keyword("XOR"),
2675            Token::Whitespace(Whitespace::Space),
2676            Token::make_keyword("true"),
2677            Token::Comma,
2678            Token::Whitespace(Whitespace::Space),
2679            Token::make_keyword("false"),
2680            Token::Whitespace(Whitespace::Space),
2681            Token::make_keyword("XOR"),
2682            Token::Whitespace(Whitespace::Space),
2683            Token::make_keyword("false"),
2684            Token::Comma,
2685            Token::Whitespace(Whitespace::Space),
2686            Token::make_keyword("true"),
2687            Token::Whitespace(Whitespace::Space),
2688            Token::make_keyword("XOR"),
2689            Token::Whitespace(Whitespace::Space),
2690            Token::make_keyword("false"),
2691            Token::Comma,
2692            Token::Whitespace(Whitespace::Space),
2693            Token::make_keyword("false"),
2694            Token::Whitespace(Whitespace::Space),
2695            Token::make_keyword("XOR"),
2696            Token::Whitespace(Whitespace::Space),
2697            Token::make_keyword("true"),
2698        ];
2699        compare(expected, tokens);
2700    }
2701
2702    #[test]
2703    fn tokenize_simple_select() {
2704        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
2705        let dialect = GenericDialect {};
2706        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2707
2708        let expected = vec![
2709            Token::make_keyword("SELECT"),
2710            Token::Whitespace(Whitespace::Space),
2711            Token::Mul,
2712            Token::Whitespace(Whitespace::Space),
2713            Token::make_keyword("FROM"),
2714            Token::Whitespace(Whitespace::Space),
2715            Token::make_word("customer", None),
2716            Token::Whitespace(Whitespace::Space),
2717            Token::make_keyword("WHERE"),
2718            Token::Whitespace(Whitespace::Space),
2719            Token::make_word("id", None),
2720            Token::Whitespace(Whitespace::Space),
2721            Token::Eq,
2722            Token::Whitespace(Whitespace::Space),
2723            Token::Number(String::from("1"), false),
2724            Token::Whitespace(Whitespace::Space),
2725            Token::make_keyword("LIMIT"),
2726            Token::Whitespace(Whitespace::Space),
2727            Token::Number(String::from("5"), false),
2728        ];
2729
2730        compare(expected, tokens);
2731    }
2732
2733    #[test]
2734    fn tokenize_explain_select() {
2735        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
2736        let dialect = GenericDialect {};
2737        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2738
2739        let expected = vec![
2740            Token::make_keyword("EXPLAIN"),
2741            Token::Whitespace(Whitespace::Space),
2742            Token::make_keyword("SELECT"),
2743            Token::Whitespace(Whitespace::Space),
2744            Token::Mul,
2745            Token::Whitespace(Whitespace::Space),
2746            Token::make_keyword("FROM"),
2747            Token::Whitespace(Whitespace::Space),
2748            Token::make_word("customer", None),
2749            Token::Whitespace(Whitespace::Space),
2750            Token::make_keyword("WHERE"),
2751            Token::Whitespace(Whitespace::Space),
2752            Token::make_word("id", None),
2753            Token::Whitespace(Whitespace::Space),
2754            Token::Eq,
2755            Token::Whitespace(Whitespace::Space),
2756            Token::Number(String::from("1"), false),
2757        ];
2758
2759        compare(expected, tokens);
2760    }
2761
2762    #[test]
2763    fn tokenize_explain_analyze_select() {
2764        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
2765        let dialect = GenericDialect {};
2766        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2767
2768        let expected = vec![
2769            Token::make_keyword("EXPLAIN"),
2770            Token::Whitespace(Whitespace::Space),
2771            Token::make_keyword("ANALYZE"),
2772            Token::Whitespace(Whitespace::Space),
2773            Token::make_keyword("SELECT"),
2774            Token::Whitespace(Whitespace::Space),
2775            Token::Mul,
2776            Token::Whitespace(Whitespace::Space),
2777            Token::make_keyword("FROM"),
2778            Token::Whitespace(Whitespace::Space),
2779            Token::make_word("customer", None),
2780            Token::Whitespace(Whitespace::Space),
2781            Token::make_keyword("WHERE"),
2782            Token::Whitespace(Whitespace::Space),
2783            Token::make_word("id", None),
2784            Token::Whitespace(Whitespace::Space),
2785            Token::Eq,
2786            Token::Whitespace(Whitespace::Space),
2787            Token::Number(String::from("1"), false),
2788        ];
2789
2790        compare(expected, tokens);
2791    }
2792
2793    #[test]
2794    fn tokenize_string_predicate() {
2795        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
2796        let dialect = GenericDialect {};
2797        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2798
2799        let expected = vec![
2800            Token::make_keyword("SELECT"),
2801            Token::Whitespace(Whitespace::Space),
2802            Token::Mul,
2803            Token::Whitespace(Whitespace::Space),
2804            Token::make_keyword("FROM"),
2805            Token::Whitespace(Whitespace::Space),
2806            Token::make_word("customer", None),
2807            Token::Whitespace(Whitespace::Space),
2808            Token::make_keyword("WHERE"),
2809            Token::Whitespace(Whitespace::Space),
2810            Token::make_word("salary", None),
2811            Token::Whitespace(Whitespace::Space),
2812            Token::Neq,
2813            Token::Whitespace(Whitespace::Space),
2814            Token::SingleQuotedString(String::from("Not Provided")),
2815        ];
2816
2817        compare(expected, tokens);
2818    }
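
    // Sketch assuming the `<>` spelling of inequality maps to the same
    // Token::Neq as the `!=` in the test above.
    #[test]
    fn tokenize_angle_bracket_neq() {
        let sql = String::from("a <> b");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("b", None),
        ];

        compare(expected, tokens);
    }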
2819
2820    #[test]
2821    fn tokenize_invalid_string() {
2822        let sql = String::from("\n💝مصطفىh");
2823
2824        let dialect = GenericDialect {};
2825        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2826        // println!("tokens: {:#?}", tokens);
2827        let expected = vec![
2828            Token::Whitespace(Whitespace::Newline),
2829            Token::Char('💝'),
2830            Token::make_word("مصطفىh", None),
2831        ];
2832        compare(expected, tokens);
2833    }
2834
2835    #[test]
2836    fn tokenize_newline_in_string_literal() {
2837        let sql = String::from("'foo\r\nbar\nbaz'");
2838
2839        let dialect = GenericDialect {};
2840        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2841        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
2842        compare(expected, tokens);
2843    }
2844
2845    #[test]
2846    fn tokenize_unterminated_string_literal() {
2847        let sql = String::from("select 'foo");
2848
2849        let dialect = GenericDialect {};
2850        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2851        assert_eq!(
2852            tokenizer.tokenize(),
2853            Err(TokenizerError {
2854                message: "Unterminated string literal".to_string(),
2855                location: Location { line: 1, column: 8 },
2856            })
2857        );
2858    }
2859
2860    #[test]
2861    fn tokenize_unterminated_string_literal_utf8() {
2862        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");
2863
2864        let dialect = GenericDialect {};
2865        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2866        assert_eq!(
2867            tokenizer.tokenize(),
2868            Err(TokenizerError {
2869                message: "Unterminated string literal".to_string(),
2870                location: Location {
2871                    line: 1,
2872                    column: 35
2873                }
2874            })
2875        );
2876    }
2877
2878    #[test]
2879    fn tokenize_invalid_string_cols() {
2880        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");
2881
2882        let dialect = GenericDialect {};
2883        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2884        // println!("tokens: {:#?}", tokens);
2885        let expected = vec![
2886            Token::Whitespace(Whitespace::Newline),
2887            Token::Whitespace(Whitespace::Newline),
2888            Token::make_keyword("SELECT"),
2889            Token::Whitespace(Whitespace::Space),
2890            Token::Mul,
2891            Token::Whitespace(Whitespace::Space),
2892            Token::make_keyword("FROM"),
2893            Token::Whitespace(Whitespace::Space),
2894            Token::make_keyword("table"),
2895            Token::Whitespace(Whitespace::Tab),
2896            Token::Char('💝'),
2897            Token::make_word("مصطفىh", None),
2898        ];
2899        compare(expected, tokens);
2900    }
2901
2902    #[test]
2903    fn tokenize_dollar_quoted_string_tagged() {
2904        let test_cases = vec![
2905            (
2906                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
2907                vec![
2908                    Token::make_keyword("SELECT"),
2909                    Token::Whitespace(Whitespace::Space),
2910                    Token::DollarQuotedString(DollarQuotedString {
2911                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
2912                        tag: Some("tag".into()),
2913                    })
2914                ]
2915            ),
2916            (
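                // The inner `$ab$` is not a terminator: only the exact opening
                // tag `$abc$` closes the string.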
2917                String::from("SELECT $abc$x$ab$abc$"),
2918                vec![
2919                    Token::make_keyword("SELECT"),
2920                    Token::Whitespace(Whitespace::Space),
2921                    Token::DollarQuotedString(DollarQuotedString {
2922                        value: "x$ab".into(),
2923                        tag: Some("abc".into()),
2924                    })
2925                ]
2926            ),
2927            (
2928                String::from("SELECT $abc$$abc$"),
2929                vec![
2930                    Token::make_keyword("SELECT"),
2931                    Token::Whitespace(Whitespace::Space),
2932                    Token::DollarQuotedString(DollarQuotedString {
2933                        value: "".into(),
2934                        tag: Some("abc".into()),
2935                    })
2936                ]
2937            ),
2938            (
2939                String::from("0$abc$$abc$1"),
2940                vec![
2941                    Token::Number("0".into(), false),
2942                    Token::DollarQuotedString(DollarQuotedString {
2943                        value: "".into(),
2944                        tag: Some("abc".into()),
2945                    }),
2946                    Token::Number("1".into(), false),
2947                ]
2948            ),
2949            (
2950                String::from("$function$abc$q$data$q$$function$"),
2951                vec![
2952                    Token::DollarQuotedString(DollarQuotedString {
2953                        value: "abc$q$data$q$".into(),
2954                        tag: Some("function".into()),
2955                    }),
2956                ]
2957            ),
2958        ];
2959
2960        let dialect = GenericDialect {};
2961        for (sql, expected) in test_cases {
2962            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2963            compare(expected, tokens);
2964        }
2965    }
2966
2967    #[test]
2968    fn tokenize_dollar_quoted_string_tagged_unterminated() {
2969        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
2970        let dialect = GenericDialect {};
2971        assert_eq!(
2972            Tokenizer::new(&dialect, &sql).tokenize(),
2973            Err(TokenizerError {
2974                message: "Unterminated dollar-quoted, expected $".into(),
2975                location: Location {
2976                    line: 1,
2977                    column: 91
2978                }
2979            })
2980        );
2981    }
2982
2983    #[test]
2984    fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
2985        let sql = String::from("SELECT $abc$abc$");
2986        let dialect = GenericDialect {};
2987        assert_eq!(
2988            Tokenizer::new(&dialect, &sql).tokenize(),
2989            Err(TokenizerError {
2990                message: "Unterminated dollar-quoted, expected $".into(),
2991                location: Location {
2992                    line: 1,
2993                    column: 17
2994                }
2995            })
2996        );
2997    }
2998
2999    #[test]
3000    fn tokenize_dollar_placeholder() {
3001        let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
3002        let dialect = SQLiteDialect {};
3003        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3004        assert_eq!(
3005            tokens,
3006            vec![
3007                Token::make_keyword("SELECT"),
3008                Token::Whitespace(Whitespace::Space),
3009                Token::Placeholder("$$".into()),
3010                Token::Comma,
3011                Token::Whitespace(Whitespace::Space),
3012                Token::Placeholder("$$ABC$$".into()),
3013                Token::Comma,
3014                Token::Whitespace(Whitespace::Space),
3015                Token::Placeholder("$ABC$".into()),
3016                Token::Comma,
3017                Token::Whitespace(Whitespace::Space),
3018                Token::Placeholder("$ABC".into()),
3019            ]
3020        );
3021    }
3022
3023    #[test]
3024    fn tokenize_nested_dollar_quoted_strings() {
3025        let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
3026        let dialect = GenericDialect {};
3027        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3028        let expected = vec![
3029            Token::make_keyword("SELECT"),
3030            Token::Whitespace(Whitespace::Space),
3031            Token::DollarQuotedString(DollarQuotedString {
3032                value: "dollar $nested$ string".into(),
3033                tag: Some("tag".into()),
3034            }),
3035        ];
3036        compare(expected, tokens);
3037    }
3038
3039    #[test]
3040    fn tokenize_dollar_quoted_string_untagged_empty() {
3041        let sql = String::from("SELECT $$$$");
3042        let dialect = GenericDialect {};
3043        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3044        let expected = vec![
3045            Token::make_keyword("SELECT"),
3046            Token::Whitespace(Whitespace::Space),
3047            Token::DollarQuotedString(DollarQuotedString {
3048                value: "".into(),
3049                tag: None,
3050            }),
3051        ];
3052        compare(expected, tokens);
3053    }
3054
3055    #[test]
3056    fn tokenize_dollar_quoted_string_untagged() {
3057        let sql =
3058            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
3059        let dialect = GenericDialect {};
3060        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3061        let expected = vec![
3062            Token::make_keyword("SELECT"),
3063            Token::Whitespace(Whitespace::Space),
3064            Token::DollarQuotedString(DollarQuotedString {
3065                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
3066                tag: None,
3067            }),
3068        ];
3069        compare(expected, tokens);
3070    }
3071
3072    #[test]
3073    fn tokenize_dollar_quoted_string_untagged_unterminated() {
3074        let sql = String::from(
3075            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
3076        );
3077        let dialect = GenericDialect {};
3078        assert_eq!(
3079            Tokenizer::new(&dialect, &sql).tokenize(),
3080            Err(TokenizerError {
3081                message: "Unterminated dollar-quoted string".into(),
3082                location: Location {
3083                    line: 1,
3084                    column: 86
3085                }
3086            })
3087        );
3088    }
3089
3090    #[test]
3091    fn tokenize_right_arrow() {
3092        let sql = String::from("FUNCTION(key=>value)");
3093        let dialect = GenericDialect {};
3094        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3095        let expected = vec![
3096            Token::make_word("FUNCTION", None),
3097            Token::LParen,
3098            Token::make_word("key", None),
3099            Token::RArrow,
3100            Token::make_word("value", None),
3101            Token::RParen,
3102        ];
3103        compare(expected, tokens);
3104    }
3105
3106    #[test]
3107    fn tokenize_is_null() {
3108        let sql = String::from("a IS NULL");
3109        let dialect = GenericDialect {};
3110        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3111
3112        let expected = vec![
3113            Token::make_word("a", None),
3114            Token::Whitespace(Whitespace::Space),
3115            Token::make_keyword("IS"),
3116            Token::Whitespace(Whitespace::Space),
3117            Token::make_keyword("NULL"),
3118        ];
3119
3120        compare(expected, tokens);
3121    }
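
    // Sketch of the negated form: NOT is a keyword, so `IS NOT NULL` is simply
    // three keyword tokens separated by whitespace.
    #[test]
    fn tokenize_is_not_null() {
        let sql = String::from("a IS NOT NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NOT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }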
3122
3123    #[test]
3124    fn tokenize_comment() {
3125        let test_cases = vec![
3126            (
3127                String::from("0--this is a comment\n1"),
3128                vec![
3129                    Token::Number("0".to_string(), false),
3130                    Token::Whitespace(Whitespace::SingleLineComment {
3131                        prefix: "--".to_string(),
3132                        comment: "this is a comment\n".to_string(),
3133                    }),
3134                    Token::Number("1".to_string(), false),
3135                ],
3136            ),
3137            (
3138                String::from("0--this is a comment\r1"),
3139                vec![
3140                    Token::Number("0".to_string(), false),
3141                    Token::Whitespace(Whitespace::SingleLineComment {
3142                        prefix: "--".to_string(),
3143                        comment: "this is a comment\r1".to_string(),
3144                    }),
3145                ],
3146            ),
3147            (
3148                String::from("0--this is a comment\r\n1"),
3149                vec![
3150                    Token::Number("0".to_string(), false),
3151                    Token::Whitespace(Whitespace::SingleLineComment {
3152                        prefix: "--".to_string(),
3153                        comment: "this is a comment\r\n".to_string(),
3154                    }),
3155                    Token::Number("1".to_string(), false),
3156                ],
3157            ),
3158        ];
3159
3160        let dialect = GenericDialect {};
3161
3162        for (sql, expected) in test_cases {
3163            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3164            compare(expected, tokens);
3165        }
3166    }
3167
3168    #[test]
3169    fn tokenize_comment_postgres() {
3170        let sql = String::from("1--\r0");
3171
3172        let dialect = PostgreSqlDialect {};
3173        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3174        let expected = vec![
3175            Token::Number("1".to_string(), false),
3176            Token::Whitespace(Whitespace::SingleLineComment {
3177                prefix: "--".to_string(),
3178                comment: "\r".to_string(),
3179            }),
3180            Token::Number("0".to_string(), false),
3181        ];
3182        compare(expected, tokens);
3183    }
3184
3185    #[test]
3186    fn tokenize_comment_at_eof() {
3187        let sql = String::from("--this is a comment");
3188
3189        let dialect = GenericDialect {};
3190        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3191        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
3192            prefix: "--".to_string(),
3193            comment: "this is a comment".to_string(),
3194        })];
3195        compare(expected, tokens);
3196    }
3197
3198    #[test]
3199    fn tokenize_multiline_comment() {
3200        let sql = String::from("0/*multi-line\n* /comment*/1");
3201
3202        let dialect = GenericDialect {};
3203        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3204        let expected = vec![
3205            Token::Number("0".to_string(), false),
3206            Token::Whitespace(Whitespace::MultiLineComment(
3207                "multi-line\n* /comment".to_string(),
3208            )),
3209            Token::Number("1".to_string(), false),
3210        ];
3211        compare(expected, tokens);
3212    }
3213
3214    #[test]
3215    fn tokenize_nested_multiline_comment() {
3216        let dialect = GenericDialect {};
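        // GenericDialect supports nested block comments: every inner `/*` must
        // be matched by its own `*/` before the outer comment can close.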
3217        let test_cases = vec![
3218            (
3219                "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
3220                vec![
3221                    Token::Number("0".to_string(), false),
3222                    Token::Whitespace(Whitespace::MultiLineComment(
3223                        "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
3224                    )),
3225                    Token::Whitespace(Whitespace::Space),
3226                    Token::Div,
3227                    Token::Word(Word {
3228                        value: "comment".to_string(),
3229                        quote_style: None,
3230                        keyword: Keyword::COMMENT,
3231                    }),
3232                    Token::Mul,
3233                    Token::Div,
3234                    Token::Number("1".to_string(), false),
3235                ],
3236            ),
3237            (
3238                "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
3239                vec![
3240                    Token::Number("0".to_string(), false),
3241                    Token::Whitespace(Whitespace::MultiLineComment(
3242                        "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
3243                    )),
3244                    Token::Number("1".to_string(), false),
3245                ],
3246            ),
3247            (
3248                "SELECT 1/* a /* b */ c */0",
3249                vec![
3250                    Token::make_keyword("SELECT"),
3251                    Token::Whitespace(Whitespace::Space),
3252                    Token::Number("1".to_string(), false),
3253                    Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
3254                    Token::Number("0".to_string(), false),
3255                ],
3256            ),
3257        ];
3258
3259        for (sql, expected) in test_cases {
3260            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3261            compare(expected, tokens);
3262        }
3263    }
3264
3265    #[test]
3266    fn tokenize_nested_multiline_comment_empty() {
3267        let sql = "select 1/*/**/*/0";
3268
3269        let dialect = GenericDialect {};
3270        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3271        let expected = vec![
3272            Token::make_keyword("select"),
3273            Token::Whitespace(Whitespace::Space),
3274            Token::Number("1".to_string(), false),
3275            Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
3276            Token::Number("0".to_string(), false),
3277        ];
3278
3279        compare(expected, tokens);
3280    }
3281
3282    #[test]
3283    fn tokenize_nested_comments_if_not_supported() {
3284        let dialect = SQLiteDialect {};
3285        let sql = "SELECT 1/*/* nested comment */*/0";
3286        let tokens = Tokenizer::new(&dialect, sql).tokenize();
3287        let expected = vec![
3288            Token::make_keyword("SELECT"),
3289            Token::Whitespace(Whitespace::Space),
3290            Token::Number("1".to_string(), false),
3291            Token::Whitespace(Whitespace::MultiLineComment(
3292                "/* nested comment ".to_string(),
3293            )),
3294            Token::Mul,
3295            Token::Div,
3296            Token::Number("0".to_string(), false),
3297        ];
3298
3299        compare(expected, tokens.unwrap());
3300    }
3301
3302    #[test]
3303    fn tokenize_multiline_comment_with_even_asterisks() {
3304        let sql = String::from("\n/** Comment **/\n");
3305
3306        let dialect = GenericDialect {};
3307        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3308        let expected = vec![
3309            Token::Whitespace(Whitespace::Newline),
3310            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
3311            Token::Whitespace(Whitespace::Newline),
3312        ];
3313        compare(expected, tokens);
3314    }
3315
3316    #[test]
3317    fn tokenize_unicode_whitespace() {
3318        let sql = String::from(" \u{2003}\n");
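        // U+2003 is an em space; non-ASCII Unicode whitespace is folded into a
        // plain Space token, hence two Space entries below.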
3319
3320        let dialect = GenericDialect {};
3321        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3322        let expected = vec![
3323            Token::Whitespace(Whitespace::Space),
3324            Token::Whitespace(Whitespace::Space),
3325            Token::Whitespace(Whitespace::Newline),
3326        ];
3327        compare(expected, tokens);
3328    }
3329
3330    #[test]
3331    fn tokenize_mismatched_quotes() {
3332        let sql = String::from("\"foo");
3333
3334        let dialect = GenericDialect {};
3335        let mut tokenizer = Tokenizer::new(&dialect, &sql);
3336        assert_eq!(
3337            tokenizer.tokenize(),
3338            Err(TokenizerError {
3339                message: "Expected close delimiter '\"' before EOF.".to_string(),
3340                location: Location { line: 1, column: 1 },
3341            })
3342        );
3343    }
3344
3345    #[test]
3346    fn tokenize_newlines() {
3347        let sql = String::from("line1\nline2\rline3\r\nline4\r");
3348
3349        let dialect = GenericDialect {};
3350        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3351        let expected = vec![
3352            Token::make_word("line1", None),
3353            Token::Whitespace(Whitespace::Newline),
3354            Token::make_word("line2", None),
3355            Token::Whitespace(Whitespace::Newline),
3356            Token::make_word("line3", None),
3357            Token::Whitespace(Whitespace::Newline),
3358            Token::make_word("line4", None),
3359            Token::Whitespace(Whitespace::Newline),
3360        ];
3361        compare(expected, tokens);
3362    }
3363
3364    #[test]
3365    fn tokenize_mssql_top() {
3366        let sql = "SELECT TOP 5 [bar] FROM foo";
3367        let dialect = MsSqlDialect {};
3368        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3369        let expected = vec![
3370            Token::make_keyword("SELECT"),
3371            Token::Whitespace(Whitespace::Space),
3372            Token::make_keyword("TOP"),
3373            Token::Whitespace(Whitespace::Space),
3374            Token::Number(String::from("5"), false),
3375            Token::Whitespace(Whitespace::Space),
3376            Token::make_word("bar", Some('[')),
3377            Token::Whitespace(Whitespace::Space),
3378            Token::make_keyword("FROM"),
3379            Token::Whitespace(Whitespace::Space),
3380            Token::make_word("foo", None),
3381        ];
3382        compare(expected, tokens);
3383    }
3384
3385    #[test]
3386    fn tokenize_pg_regex_match() {
3387        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
3388        let dialect = GenericDialect {};
3389        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3390        let expected = vec![
3391            Token::make_keyword("SELECT"),
3392            Token::Whitespace(Whitespace::Space),
3393            Token::make_word("col", None),
3394            Token::Whitespace(Whitespace::Space),
3395            Token::Tilde,
3396            Token::Whitespace(Whitespace::Space),
3397            Token::SingleQuotedString("^a".into()),
3398            Token::Comma,
3399            Token::Whitespace(Whitespace::Space),
3400            Token::make_word("col", None),
3401            Token::Whitespace(Whitespace::Space),
3402            Token::TildeAsterisk,
3403            Token::Whitespace(Whitespace::Space),
3404            Token::SingleQuotedString("^a".into()),
3405            Token::Comma,
3406            Token::Whitespace(Whitespace::Space),
3407            Token::make_word("col", None),
3408            Token::Whitespace(Whitespace::Space),
3409            Token::ExclamationMarkTilde,
3410            Token::Whitespace(Whitespace::Space),
3411            Token::SingleQuotedString("^a".into()),
3412            Token::Comma,
3413            Token::Whitespace(Whitespace::Space),
3414            Token::make_word("col", None),
3415            Token::Whitespace(Whitespace::Space),
3416            Token::ExclamationMarkTildeAsterisk,
3417            Token::Whitespace(Whitespace::Space),
3418            Token::SingleQuotedString("^a".into()),
3419        ];
3420        compare(expected, tokens);
3421    }
3422
3423    #[test]
3424    fn tokenize_pg_like_match() {
3425        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
3426        let dialect = GenericDialect {};
3427        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3428        let expected = vec![
3429            Token::make_keyword("SELECT"),
3430            Token::Whitespace(Whitespace::Space),
3431            Token::make_word("col", None),
3432            Token::Whitespace(Whitespace::Space),
3433            Token::DoubleTilde,
3434            Token::Whitespace(Whitespace::Space),
3435            Token::SingleQuotedString("_a%".into()),
3436            Token::Comma,
3437            Token::Whitespace(Whitespace::Space),
3438            Token::make_word("col", None),
3439            Token::Whitespace(Whitespace::Space),
3440            Token::DoubleTildeAsterisk,
3441            Token::Whitespace(Whitespace::Space),
3442            Token::SingleQuotedString("_a%".into()),
3443            Token::Comma,
3444            Token::Whitespace(Whitespace::Space),
3445            Token::make_word("col", None),
3446            Token::Whitespace(Whitespace::Space),
3447            Token::ExclamationMarkDoubleTilde,
3448            Token::Whitespace(Whitespace::Space),
3449            Token::SingleQuotedString("_a%".into()),
3450            Token::Comma,
3451            Token::Whitespace(Whitespace::Space),
3452            Token::make_word("col", None),
3453            Token::Whitespace(Whitespace::Space),
3454            Token::ExclamationMarkDoubleTildeAsterisk,
3455            Token::Whitespace(Whitespace::Space),
3456            Token::SingleQuotedString("_a%".into()),
3457        ];
3458        compare(expected, tokens);
3459    }
3460
3461    #[test]
3462    fn tokenize_quoted_identifier() {
3463        let sql = r#" "a "" b" "a """ "c """"" "#;
3464        let dialect = GenericDialect {};
3465        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3466        let expected = vec![
3467            Token::Whitespace(Whitespace::Space),
3468            Token::make_word(r#"a " b"#, Some('"')),
3469            Token::Whitespace(Whitespace::Space),
3470            Token::make_word(r#"a ""#, Some('"')),
3471            Token::Whitespace(Whitespace::Space),
3472            Token::make_word(r#"c """#, Some('"')),
3473            Token::Whitespace(Whitespace::Space),
3474        ];
3475        compare(expected, tokens);
3476    }
3477
3478    #[test]
3479    fn tokenize_snowflake_div() {
3480        let sql = r#"field/1000"#;
3481        let dialect = SnowflakeDialect {};
3482        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3483        let expected = vec![
3484            Token::make_word(r#"field"#, None),
3485            Token::Div,
3486            Token::Number("1000".to_string(), false),
3487        ];
3488        compare(expected, tokens);
3489    }
3490
3491    #[test]
3492    fn tokenize_quoted_identifier_with_no_escape() {
3493        let sql = r#" "a "" b" "a """ "c """"" "#;
3494        let dialect = GenericDialect {};
3495        let tokens = Tokenizer::new(&dialect, sql)
3496            .with_unescape(false)
3497            .tokenize()
3498            .unwrap();
3499        let expected = vec![
3500            Token::Whitespace(Whitespace::Space),
3501            Token::make_word(r#"a "" b"#, Some('"')),
3502            Token::Whitespace(Whitespace::Space),
3503            Token::make_word(r#"a """#, Some('"')),
3504            Token::Whitespace(Whitespace::Space),
3505            Token::make_word(r#"c """""#, Some('"')),
3506            Token::Whitespace(Whitespace::Space),
3507        ];
3508        compare(expected, tokens);
3509    }
3510
3511    #[test]
3512    fn tokenize_with_location() {
3513        let sql = "SELECT a,\n b";
3514        let dialect = GenericDialect {};
3515        let tokens = Tokenizer::new(&dialect, sql)
3516            .tokenize_with_location()
3517            .unwrap();
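        // Locations are 1-based (line, column) pairs, and a token's end location
        // points one past its final character: "SELECT" spans columns 1..7.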
3518        let expected = vec![
3519            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
3520            TokenWithSpan::at(
3521                Token::Whitespace(Whitespace::Space),
3522                (1, 7).into(),
3523                (1, 8).into(),
3524            ),
3525            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
3526            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
3527            TokenWithSpan::at(
3528                Token::Whitespace(Whitespace::Newline),
3529                (1, 10).into(),
3530                (2, 1).into(),
3531            ),
3532            TokenWithSpan::at(
3533                Token::Whitespace(Whitespace::Space),
3534                (2, 1).into(),
3535                (2, 2).into(),
3536            ),
3537            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
3538        ];
3539        compare(expected, tokens);
3540    }
3541
3542    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
3543        //println!("------------------------------");
3544        //println!("tokens   = {:?}", actual);
3545        //println!("expected = {:?}", expected);
3546        //println!("------------------------------");
3547        assert_eq!(expected, actual);
3548    }
3549
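    // Wraps `s` in single quotes and runs it through the tokenizer's
    // single-quoted-string unescaping; `expected == None` means the escape
    // sequence is rejected.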
3550    fn check_unescape(s: &str, expected: Option<&str>) {
3551        let s = format!("'{s}'");
3552        let mut state = State {
3553            peekable: s.chars().peekable(),
3554            line: 0,
3555            col: 0,
3556        };
3557
3558        assert_eq!(
3559            unescape_single_quoted_string(&mut state),
3560            expected.map(|s| s.to_string())
3561        );
3562    }
3563
3564    #[test]
3565    fn test_unescape() {
3566        check_unescape(r"\b", Some("\u{0008}"));
3567        check_unescape(r"\f", Some("\u{000C}"));
3568        check_unescape(r"\t", Some("\t"));
3569        check_unescape(r"\r\n", Some("\r\n"));
3570        check_unescape(r"\/", Some("/"));
3571        check_unescape(r"/", Some("/"));
3572        check_unescape(r"\\", Some("\\"));
3573
3574        // 16-bit (\u) and 32-bit (\U) hexadecimal Unicode character values
3575        check_unescape(r"\u0001", Some("\u{0001}"));
3576        check_unescape(r"\u4c91", Some("\u{4c91}"));
3577        check_unescape(r"\u4c916", Some("\u{4c91}6"));
3578        check_unescape(r"\u4c", None);
3579        check_unescape(r"\u0000", None);
3580        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
3581        check_unescape(r"\U00110000", None);
3582        check_unescape(r"\U00000000", None);
3583        check_unescape(r"\u", None);
3584        check_unescape(r"\U", None);
3585        check_unescape(r"\U1010FFFF", None);
3586
3587        // hexadecimal byte value
3588        check_unescape(r"\x4B", Some("\u{004b}"));
3589        check_unescape(r"\x4", Some("\u{0004}"));
3590        check_unescape(r"\x4L", Some("\u{0004}L"));
3591        check_unescape(r"\x", Some("x"));
3592        check_unescape(r"\xP", Some("xP"));
3593        check_unescape(r"\x0", None);
3594        check_unescape(r"\xCAD", None);
3595        check_unescape(r"\xA9", None);
3596
3597        // octal byte value
3598        check_unescape(r"\1", Some("\u{0001}"));
3599        check_unescape(r"\12", Some("\u{000a}"));
3600        check_unescape(r"\123", Some("\u{0053}"));
3601        check_unescape(r"\1232", Some("\u{0053}2"));
3602        check_unescape(r"\4", Some("\u{0004}"));
3603        check_unescape(r"\45", Some("\u{0025}"));
3604        check_unescape(r"\450", Some("\u{0028}"));
3605        check_unescape(r"\603", None);
3606        check_unescape(r"\0", None);
3607        check_unescape(r"\080", None);
3608
3609        // others
3610        check_unescape(r"\9", Some("9"));
3611        check_unescape(r"''", Some("'"));
3612        check_unescape(
3613            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
3614            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
3615        );
3616        check_unescape(r"Hello\0", None);
3617        check_unescape(r"Hello\xCADRust", None);
3618    }
3619
3620    #[test]
3621    fn tokenize_numeric_prefix_trait() {
3622        #[derive(Debug)]
3623        struct NumericPrefixDialect;
3624
3625        impl Dialect for NumericPrefixDialect {
3626            fn is_identifier_start(&self, ch: char) -> bool {
3627                ch.is_ascii_lowercase()
3628                    || ch.is_ascii_uppercase()
3629                    || ch.is_ascii_digit()
3630                    || ch == '$'
3631            }
3632
3633            fn is_identifier_part(&self, ch: char) -> bool {
3634                ch.is_ascii_lowercase()
3635                    || ch.is_ascii_uppercase()
3636                    || ch.is_ascii_digit()
3637                    || ch == '_'
3638                    || ch == '$'
3639                    || ch == '{'
3640                    || ch == '}'
3641            }
3642
3643            fn supports_numeric_prefix(&self) -> bool {
3644                true
3645            }
3646        }
3647
3648        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
3649        tokenize_numeric_prefix_inner(&HiveDialect {});
3650        tokenize_numeric_prefix_inner(&MySqlDialect {});
3651    }
3652
3653    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
3654        let sql = r#"SELECT * FROM 1"#;
3655        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
3656        let expected = vec![
3657            Token::make_keyword("SELECT"),
3658            Token::Whitespace(Whitespace::Space),
3659            Token::Mul,
3660            Token::Whitespace(Whitespace::Space),
3661            Token::make_keyword("FROM"),
3662            Token::Whitespace(Whitespace::Space),
3663            Token::Number(String::from("1"), false),
3664        ];
3665        compare(expected, tokens);
3666    }
3667
3668    #[test]
3669    fn tokenize_quoted_string_escape() {
3670        let dialect = SnowflakeDialect {};
3671        for (sql, expected, expected_unescaped) in [
3672            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
3673            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
3674            (r#"'\\'"#, r#"\\"#, r#"\"#),
3675            (
3676                r#"'\0\a\b\f\n\r\t\Z'"#,
3677                r#"\0\a\b\f\n\r\t\Z"#,
3678                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
3679            ),
3680            (r#"'\"'"#, r#"\""#, "\""),
3681            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
3682            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
3683            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
3684            (r#"'\q'"#, r#"\q"#, r#"q"#),
3685            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
3686            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
3687        ] {
3688            let tokens = Tokenizer::new(&dialect, sql)
3689                .with_unescape(false)
3690                .tokenize()
3691                .unwrap();
3692            let expected = vec![Token::SingleQuotedString(expected.to_string())];
3693            compare(expected, tokens);
3694
3695            let tokens = Tokenizer::new(&dialect, sql)
3696                .with_unescape(true)
3697                .tokenize()
3698                .unwrap();
3699            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
3700            compare(expected, tokens);
3701        }
3702
3703        for sql in [r#"'\'"#, r#"'ab\'"#] {
3704            let mut tokenizer = Tokenizer::new(&dialect, sql);
3705            assert_eq!(
3706                "Unterminated string literal",
3707                tokenizer.tokenize().unwrap_err().message.as_str(),
3708            );
3709        }
3710
3711        // Non-escape dialect
3712        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
3713            let dialect = GenericDialect {};
3714            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3715
3716            let expected = vec![Token::SingleQuotedString(expected.to_string())];
3717
3718            compare(expected, tokens);
3719        }
3720
3721        // MySQL special case for LIKE escapes
3722        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
3723            let dialect = MySqlDialect {};
3724            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3725
3726            let expected = vec![Token::SingleQuotedString(expected.to_string())];
3727
3728            compare(expected, tokens);
3729        }
3730    }
3731
3732    #[test]
3733    fn tokenize_triple_quoted_string() {
3734        fn check<F>(
3735            q: char, // The quote character to test
3736            r: char, // An alternate quote character.
3737            quote_token: F,
3738        ) where
3739            F: Fn(String) -> Token,
3740        {
3741            let dialect = BigQueryDialect {};
3742
3743            for (sql, expected, expected_unescaped) in [
3744                // Empty string
3745                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
3746                // Should not count escaped quote as end of string.
3747                (
3748                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
3749                    format!(r#"ab{q}{q}\{q}{q}cd"#),
3750                    format!(r#"ab{q}{q}{q}{q}cd"#),
3751                ),
3752                // Simple string
3753                (
3754                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
3755                    "abc".into(),
3756                    "abc".into(),
3757                ),
3758                // Mixed single and double quotes, left unescaped.
3759                (
3760                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
3761                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
3762                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
3763                ),
3764                // Escaped quote.
3765                (
3766                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
3767                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
3768                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
3769                ),
3770                // backslash-escaped quote characters.
3771                (
3772                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
3773                    r#"a\'\'b\'c\'d"#.into(),
3774                    r#"a''b'c'd"#.into(),
3775                ),
3776                // backslash-escaped characters
3777                (
3778                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
3779                    r#"abc\0\n\rdef"#.into(),
3780                    "abc\0\n\rdef".into(),
3781                ),
3782            ] {
3783                let tokens = Tokenizer::new(&dialect, sql.as_str())
3784                    .with_unescape(false)
3785                    .tokenize()
3786                    .unwrap();
3787                let expected = vec![quote_token(expected.to_string())];
3788                compare(expected, tokens);
3789
3790                let tokens = Tokenizer::new(&dialect, sql.as_str())
3791                    .with_unescape(true)
3792                    .tokenize()
3793                    .unwrap();
3794                let expected = vec![quote_token(expected_unescaped.to_string())];
3795                compare(expected, tokens);
3796            }
3797
3798            for sql in [
3799                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
3800                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
3801                format!(r#"{q}{q}{q}{q}"#),
3802                format!(r#"{q}{q}{q}{r}{r}"#),
3803                format!(r#"{q}{q}{q}abc{q}"#),
3804                format!(r#"{q}{q}{q}abc{q}{q}"#),
3805                format!(r#"{q}{q}{q}abc"#),
3806            ] {
3807                let dialect = BigQueryDialect {};
3808                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
3809                assert_eq!(
3810                    "Unterminated string literal",
3811                    tokenizer.tokenize().unwrap_err().message.as_str(),
3812                );
3813            }
3814        }
3815
3816        check('"', '\'', Token::TripleDoubleQuotedString);
3817
3818        check('\'', '"', Token::TripleSingleQuotedString);
3819
3820        let dialect = BigQueryDialect {};
3821
3822        let sql = r#"""''"#;
3823        let tokens = Tokenizer::new(&dialect, sql)
3824            .with_unescape(true)
3825            .tokenize()
3826            .unwrap();
3827        let expected = vec![
3828            Token::DoubleQuotedString("".to_string()),
3829            Token::SingleQuotedString("".to_string()),
3830        ];
3831        compare(expected, tokens);
3832
3833        let sql = r#"''"""#;
3834        let tokens = Tokenizer::new(&dialect, sql)
3835            .with_unescape(true)
3836            .tokenize()
3837            .unwrap();
3838        let expected = vec![
3839            Token::SingleQuotedString("".to_string()),
3840            Token::DoubleQuotedString("".to_string()),
3841        ];
3842        compare(expected, tokens);
3843
3844        // Non-triple quoted string dialect
3845        let dialect = SnowflakeDialect {};
3846        let sql = r#"''''''"#;
3847        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3848        let expected = vec![Token::SingleQuotedString("''".to_string())];
3849        compare(expected, tokens);
3850    }
3851
3852    #[test]
3853    fn test_mysql_users_grantees() {
3854        let dialect = MySqlDialect {};
3855
3856        let sql = "CREATE USER `root`@`%`";
3857        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3858        let expected = vec![
3859            Token::make_keyword("CREATE"),
3860            Token::Whitespace(Whitespace::Space),
3861            Token::make_keyword("USER"),
3862            Token::Whitespace(Whitespace::Space),
3863            Token::make_word("root", Some('`')),
3864            Token::AtSign,
3865            Token::make_word("%", Some('`')),
3866        ];
3867        compare(expected, tokens);
3868    }
3869
3870    #[test]
3871    fn test_mysql_at_sign_without_space_and_string_literal() {
3872        let dialect = MySqlDialect {};
3873
3874        let sql = "SELECT @'1'";
3875        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3876        let expected = vec![
3877            Token::make_keyword("SELECT"),
3878            Token::Whitespace(Whitespace::Space),
3879            Token::AtSign,
3880            Token::SingleQuotedString("1".to_string()),
3881        ];
3882        compare(expected, tokens);
3883    }
3884
3885    #[test]
3886    fn test_mysql_at_sign_without_space_and_quoted_column() {
3887        let dialect = MySqlDialect {};
3888
3889        let sql = r#"SELECT @"bar" FROM foo"#;
3890        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3891        let expected = vec![
3892            Token::make_keyword("SELECT"),
3893            Token::Whitespace(Whitespace::Space),
3894            Token::AtSign,
3895            Token::DoubleQuotedString("bar".to_string()),
3896            Token::Whitespace(Whitespace::Space),
3897            Token::make_keyword("FROM"),
3898            Token::Whitespace(Whitespace::Space),
3899            Token::make_word("foo", None),
3900        ];
3901        compare(expected, tokens);
3902    }
3903
3904    #[test]
3905    fn test_national_strings_backslash_escape_not_supported() {
3906        all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
3907            .tokenizes_to(
3908                "select n'''''\\'",
3909                vec![
3910                    Token::make_keyword("select"),
3911                    Token::Whitespace(Whitespace::Space),
3912                    Token::NationalStringLiteral("''\\".to_string()),
3913                ],
3914            );
3915    }
3916
3917    #[test]
3918    fn test_national_strings_backslash_escape_supported() {
3919        all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
3920            .tokenizes_to(
3921                "select n'''''\\''",
3922                vec![
3923                    Token::make_keyword("select"),
3924                    Token::Whitespace(Whitespace::Space),
3925                    Token::NationalStringLiteral("'''".to_string()),
3926                ],
3927            );
3928    }
3929
3930    #[test]
3931    fn test_string_escape_constant_not_supported() {
3932        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
3933            "select e'...'",
3934            vec![
3935                Token::make_keyword("select"),
3936                Token::Whitespace(Whitespace::Space),
3937                Token::make_word("e", None),
3938                Token::SingleQuotedString("...".to_string()),
3939            ],
3940        );
3941
3942        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
3943            "select E'...'",
3944            vec![
3945                Token::make_keyword("select"),
3946                Token::Whitespace(Whitespace::Space),
3947                Token::make_word("E", None),
3948                Token::SingleQuotedString("...".to_string()),
3949            ],
3950        );
3951    }
3952
3953    #[test]
3954    fn test_string_escape_constant_supported() {
        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );
    }

    #[test]
    fn test_whitespace_required_after_single_line_comment() {
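        // For these dialects `--` only starts a comment when followed by
        // whitespace: `--'abc'` and a bare trailing `--` both tokenize as two
        // `Minus` tokens, while `-- 'abc'` is a genuine single-line comment.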
        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                    Token::SingleQuotedString("abc".to_string()),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                ],
            );
    }

    #[test]
    fn test_whitespace_not_required_after_single_line_comment() {
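        // Permissive dialects accept `--` immediately followed by text, and
        // even a bare `--` at end of input, as a single-line comment (with an
        // empty comment body in the latter case).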
        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "".to_string(),
                    }),
                ],
            );
    }

    #[test]
    fn test_tokenize_identifiers_numeric_prefix() {
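        // `supports_numeric_prefix` lets identifiers begin with digits
        // (`123abc`). A standalone `12e34` still tokenizes as a number in
        // scientific notation, but after a period it is treated as an
        // identifier part of a compound name.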
        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        );
    }

    #[test]
    fn tokenize_period_underscore() {
        let sql = String::from("SELECT table._col");
        // a dialect that supports underscores in numeric literals
        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "table".to_string(),
                quote_style: None,
                keyword: Keyword::TABLE,
            }),
            Token::Period,
            Token::Word(Word {
                value: "_col".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
        ];

        compare(expected, tokens);

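        // Both inputs below must fail to tokenize: presumably a bare `.`
        // followed by `_` can neither continue into a numeric fraction (even
        // though this dialect allows `_` inside numeric literals) nor begin
        // any other valid token.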
        let sql = String::from("SELECT ._123");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }

        let sql = String::from("SELECT ._abc");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }
    }
}