// sqltk_parser/tokenizer.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! SQL Tokenizer
19//!
20//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
21//!
22//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
23
24#[cfg(not(feature = "std"))]
25use alloc::{
26    borrow::ToOwned,
27    format,
28    string::{String, ToString},
29    vec,
30    vec::Vec,
31};
32use core::iter::Peekable;
33use core::num::NonZeroU8;
34use core::str::Chars;
35use core::{cmp, fmt};
36
37#[cfg(feature = "serde")]
38use serde::{Deserialize, Serialize};
39
40#[cfg(feature = "visitor")]
41use sqltk_parser_derive::{Visit, VisitMut};
42
43use crate::dialect::Dialect;
44use crate::dialect::{
45    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
46    SnowflakeDialect,
47};
48use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
49use crate::{ast::DollarQuotedString, dialect::HiveDialect};
50
/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the `bool` is rendered by `Display` as a
    /// trailing `L` (long) suffix when true
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted strings: Example '''abc'''
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedString(String),
    /// Triple double quoted strings: Example """abc"""
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string' (note that some backends, such as
    /// PostgreSQL, may treat this syntax as a bit string literal instead, i.e: b'10010101')
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedByteStringLiteral(String),
    /// Single quoted literal with raw string prefix. Example `R'abc'`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    SingleQuotedRawStringLiteral(String),
    /// Double quoted literal with raw string prefix. Example `R"abc"`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedRawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
    EscapedStringLiteral(String),
    /// Unicode string literal: i.e: U&'first \000A second'
    UnicodeStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Spaceship operator `<=>`
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Integer division operator `//` in DuckDB
    DuckIntDiv,
    /// Modulo Operator `%`
    Mod,
    /// String concatenation `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Assignment `:=` (used for keyword argument in DuckDB macros and some functions, and for variable declarations in DuckDB and Snowflake)
    Assignment,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
    /// Sharp `#` used for PostgreSQL Bitwise XOR operator, also PostgreSQL/Redshift geometrical unary/binary operator (Number of points in path or polygon/Intersection)
    Sharp,
    /// `##` PostgreSQL/Redshift geometrical binary operator (Point of closest proximity)
    DoubleSharp,
    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
    Tilde,
    /// `~*` , a case insensitive match regular expression operator in PostgreSQL
    TildeAsterisk,
    /// `!~` , a case sensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*` , a case insensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `~~`, a case sensitive match pattern operator in PostgreSQL
    DoubleTilde,
    /// `~~*`, a case insensitive match pattern operator in PostgreSQL
    DoubleTildeAsterisk,
    /// `!~~`, a case sensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTilde,
    /// `!~~*`, a case insensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTildeAsterisk,
    /// `<<`, a bitwise shift left operator in PostgreSQL
    ShiftLeft,
    /// `>>`, a bitwise shift right operator in PostgreSQL
    ShiftRight,
    /// `&&`, an overlap operator in PostgreSQL
    Overlap,
    /// Exclamation Mark `!` used for PostgreSQL factorial operator
    ExclamationMark,
    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// AtSign `@` used for PostgreSQL abs operator, also PostgreSQL/Redshift geometrical unary/binary operator (Center, Contained or on)
    AtSign,
    /// `^@`, a "starts with" string operator in PostgreSQL
    CaretAt,
    /// `|/`, a square root math operator in PostgreSQL
    PGSquareRoot,
    /// `||/`, a cube root math operator in PostgreSQL
    PGCubeRoot,
    /// `?` or `$` , a prepared statement arg placeholder
    Placeholder(String),
    /// `->`, used as a operator to extract json field in PostgreSQL
    Arrow,
    /// `->>`, used as a operator to extract json field as text in PostgreSQL
    LongArrow,
    /// `#>`, extracts JSON sub-object at the specified path
    HashArrow,
    /// `@-@` PostgreSQL/Redshift geometrical unary operator (Length or circumference)
    AtDashAt,
    /// `?-` PostgreSQL/Redshift geometrical unary/binary operator (Is horizontal?/Are horizontally aligned?)
    QuestionMarkDash,
    /// `&<` PostgreSQL/Redshift geometrical binary operator (Overlaps to left?)
    AmpersandLeftAngleBracket,
    /// `&>` PostgreSQL/Redshift geometrical binary operator (Overlaps to right?)
    AmpersandRightAngleBracket,
    /// `&<|` PostgreSQL/Redshift geometrical binary operator (Does not extend above?)
    AmpersandLeftAngleBracketVerticalBar,
    /// `|&>` PostgreSQL/Redshift geometrical binary operator (Does not extend below?)
    VerticalBarAmpersandRightAngleBracket,
    /// `<->` PostgreSQL/Redshift geometrical binary operator (Distance between)
    TwoWayArrow,
    /// `<^` PostgreSQL/Redshift geometrical binary operator (Is below?)
    LeftAngleBracketCaret,
    /// `>^` PostgreSQL/Redshift geometrical binary operator (Is above?)
    RightAngleBracketCaret,
    /// `?#` PostgreSQL/Redshift geometrical binary operator (Intersects or overlaps)
    QuestionMarkSharp,
    /// `?-|` PostgreSQL/Redshift geometrical binary operator (Is perpendicular?)
    QuestionMarkDashVerticalBar,
    /// `?||` PostgreSQL/Redshift geometrical binary operator (Are parallel?)
    QuestionMarkDoubleVerticalBar,
    /// `~=` PostgreSQL/Redshift geometrical binary operator (Same as)
    TildeEqual,
    /// `<<|` PostgreSQL/Redshift geometrical binary operator (Is strictly below?)
    ShiftLeftVerticalBar,
    /// `|>>` PostgreSQL/Redshift geometrical binary operator (Is strictly above?)
    VerticalBarShiftRight,
    /// `|>` BigQuery pipe operator
    VerticalBarRightAngleBracket,
    /// `#>>`, extracts JSON sub-object at the specified path as text
    HashLongArrow,
    /// jsonb @> jsonb -> boolean: Test whether left json contains the right json
    AtArrow,
    /// jsonb <@ jsonb -> boolean: Test whether right json contains the left json
    ArrowAt,
    /// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified
    /// path, where path elements can be either field keys or array indexes.
    HashMinus,
    /// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified
    /// JSON value?
    AtQuestion,
    /// jsonb @@ jsonpath → boolean: Returns the result of a JSON path predicate check
    /// for the specified JSON value. Only the first item of the result is taken into
    /// account. If the result is not Boolean, then NULL is returned.
    AtAt,
    /// jsonb ? text -> boolean: Checks whether the string exists as a top-level key within the
    /// jsonb object
    Question,
    /// jsonb ?& text[] -> boolean: Check whether all members of the text array exist as top-level
    /// keys within the jsonb object
    QuestionAnd,
    /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level
    /// keys within the jsonb object
    QuestionPipe,
    /// Custom binary operator
    /// This is used to represent any custom binary operator that is not part of the SQL standard.
    /// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
    CustomBinaryOperator(String),
}
281
282impl fmt::Display for Token {
283    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
284        match self {
285            Token::EOF => f.write_str("EOF"),
286            Token::Word(ref w) => write!(f, "{w}"),
287            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
288            Token::Char(ref c) => write!(f, "{c}"),
289            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
290            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
291            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
292            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
293            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
294            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
295            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
296            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
297            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
298            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
299            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
300            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
301            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
302            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
303            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
304            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
305            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
306            Token::Comma => f.write_str(","),
307            Token::Whitespace(ws) => write!(f, "{ws}"),
308            Token::DoubleEq => f.write_str("=="),
309            Token::Spaceship => f.write_str("<=>"),
310            Token::Eq => f.write_str("="),
311            Token::Neq => f.write_str("<>"),
312            Token::Lt => f.write_str("<"),
313            Token::Gt => f.write_str(">"),
314            Token::LtEq => f.write_str("<="),
315            Token::GtEq => f.write_str(">="),
316            Token::Plus => f.write_str("+"),
317            Token::Minus => f.write_str("-"),
318            Token::Mul => f.write_str("*"),
319            Token::Div => f.write_str("/"),
320            Token::DuckIntDiv => f.write_str("//"),
321            Token::StringConcat => f.write_str("||"),
322            Token::Mod => f.write_str("%"),
323            Token::LParen => f.write_str("("),
324            Token::RParen => f.write_str(")"),
325            Token::Period => f.write_str("."),
326            Token::Colon => f.write_str(":"),
327            Token::DoubleColon => f.write_str("::"),
328            Token::Assignment => f.write_str(":="),
329            Token::SemiColon => f.write_str(";"),
330            Token::Backslash => f.write_str("\\"),
331            Token::LBracket => f.write_str("["),
332            Token::RBracket => f.write_str("]"),
333            Token::Ampersand => f.write_str("&"),
334            Token::Caret => f.write_str("^"),
335            Token::Pipe => f.write_str("|"),
336            Token::LBrace => f.write_str("{"),
337            Token::RBrace => f.write_str("}"),
338            Token::RArrow => f.write_str("=>"),
339            Token::Sharp => f.write_str("#"),
340            Token::DoubleSharp => f.write_str("##"),
341            Token::ExclamationMark => f.write_str("!"),
342            Token::DoubleExclamationMark => f.write_str("!!"),
343            Token::Tilde => f.write_str("~"),
344            Token::TildeAsterisk => f.write_str("~*"),
345            Token::ExclamationMarkTilde => f.write_str("!~"),
346            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
347            Token::DoubleTilde => f.write_str("~~"),
348            Token::DoubleTildeAsterisk => f.write_str("~~*"),
349            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
350            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
351            Token::AtSign => f.write_str("@"),
352            Token::CaretAt => f.write_str("^@"),
353            Token::ShiftLeft => f.write_str("<<"),
354            Token::ShiftRight => f.write_str(">>"),
355            Token::Overlap => f.write_str("&&"),
356            Token::PGSquareRoot => f.write_str("|/"),
357            Token::PGCubeRoot => f.write_str("||/"),
358            Token::AtDashAt => f.write_str("@-@"),
359            Token::QuestionMarkDash => f.write_str("?-"),
360            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
361            Token::AmpersandRightAngleBracket => f.write_str("&>"),
362            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
363            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
364            Token::VerticalBarRightAngleBracket => f.write_str("|>"),
365            Token::TwoWayArrow => f.write_str("<->"),
366            Token::LeftAngleBracketCaret => f.write_str("<^"),
367            Token::RightAngleBracketCaret => f.write_str(">^"),
368            Token::QuestionMarkSharp => f.write_str("?#"),
369            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
370            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
371            Token::TildeEqual => f.write_str("~="),
372            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
373            Token::VerticalBarShiftRight => f.write_str("|>>"),
374            Token::Placeholder(ref s) => write!(f, "{s}"),
375            Token::Arrow => write!(f, "->"),
376            Token::LongArrow => write!(f, "->>"),
377            Token::HashArrow => write!(f, "#>"),
378            Token::HashLongArrow => write!(f, "#>>"),
379            Token::AtArrow => write!(f, "@>"),
380            Token::ArrowAt => write!(f, "<@"),
381            Token::HashMinus => write!(f, "#-"),
382            Token::AtQuestion => write!(f, "@?"),
383            Token::AtAt => write!(f, "@@"),
384            Token::Question => write!(f, "?"),
385            Token::QuestionAnd => write!(f, "?&"),
386            Token::QuestionPipe => write!(f, "?|"),
387            Token::CustomBinaryOperator(s) => f.write_str(s),
388        }
389    }
390}
391
392impl Token {
393    pub fn make_keyword(keyword: &str) -> Self {
394        Token::make_word(keyword, None)
395    }
396
397    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
398        let word_uppercase = word.to_uppercase();
399        Token::Word(Word {
400            value: word.to_string(),
401            quote_style,
402            keyword: if quote_style.is_none() {
403                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
404                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
405            } else {
406                Keyword::NoKeyword
407            },
408        })
409    }
410}
411
/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with the
    /// escape sequences (if any) processed (TODO: escapes are not handled)
    pub value: String,
    /// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this will have one of the values from dialect::keywords, otherwise
    /// [`Keyword::NoKeyword`]
    pub keyword: Keyword,
}
428
429impl fmt::Display for Word {
430    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
431        match self.quote_style {
432            Some(s) if s == '"' || s == '[' || s == '`' => {
433                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
434            }
435            None => f.write_str(&self.value),
436            _ => panic!("Unexpected quote_style!"),
437        }
438    }
439}
440
441impl Word {
442    fn matching_end_quote(ch: char) -> char {
443        match ch {
444            '"' => '"', // ANSI and most dialects
445            '[' => ']', // MS SQL
446            '`' => '`', // MySQL
447            _ => panic!("unexpected quoting style!"),
448        }
449    }
450}
451
/// A run of insignificant characters preserved by the tokenizer.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    /// A single space character.
    Space,
    /// A newline (`\n`).
    Newline,
    /// A tab (`\t`).
    Tab,
    /// A comment running to the end of the line; `prefix` is the comment
    /// introducer (e.g. `--`) and `comment` the remaining text.
    SingleLineComment { comment: String, prefix: String },
    /// A `/* ... */` comment; the value excludes the delimiters.
    MultiLineComment(String),
}
462
463impl fmt::Display for Whitespace {
464    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
465        match self {
466            Whitespace::Space => f.write_str(" "),
467            Whitespace::Newline => f.write_str("\n"),
468            Whitespace::Tab => f.write_str("\t"),
469            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
470            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
471        }
472    }
473}
474
/// Location in input string
///
/// Both coordinates are 1-based; `(0, 0)` is reserved for "unknown".
/// See [`Span`] for a start/end pair of locations.
///
/// # Create an "empty" (unknown) `Location`
/// ```
/// # use sqltk_parser::tokenizer::Location;
/// let location = Location::empty();
/// ```
///
/// # Create a `Location` from a line and column
/// ```
/// # use sqltk_parser::tokenizer::Location;
/// let location = Location::new(1, 1);
/// ```
///
/// # Create a `Location` from a pair
/// ```
/// # use sqltk_parser::tokenizer::Location;
/// let location = Location::from((1, 1));
/// ```
#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1.
    ///
    /// Note: Line 0 is used for empty spans
    pub line: u64,
    /// Line column, starting from 1.
    ///
    /// Note: Column 0 is used for empty spans
    pub column: u64,
}
507
508impl fmt::Display for Location {
509    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
510        if self.line == 0 {
511            return Ok(());
512        }
513        write!(f, " at Line: {}, Column: {}", self.line, self.column)
514    }
515}
516
517impl fmt::Debug for Location {
518    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
519        write!(f, "Location({},{})", self.line, self.column)
520    }
521}
522
523impl Location {
524    /// Return an "empty" / unknown location
525    pub fn empty() -> Self {
526        Self { line: 0, column: 0 }
527    }
528
529    /// Create a new `Location` for a given line and column
530    pub fn new(line: u64, column: u64) -> Self {
531        Self { line, column }
532    }
533
534    /// Create a new location for a given line and column
535    ///
536    /// Alias for [`Self::new`]
537    // TODO: remove / deprecate in favor of` `new` for consistency?
538    pub fn of(line: u64, column: u64) -> Self {
539        Self::new(line, column)
540    }
541
542    /// Combine self and `end` into a new `Span`
543    pub fn span_to(self, end: Self) -> Span {
544        Span { start: self, end }
545    }
546}
547
548impl From<(u64, u64)> for Location {
549    fn from((line, column): (u64, u64)) -> Self {
550        Self { line, column }
551    }
552}
553
/// A span represents a linear portion of the input string (start, end)
///
/// See [Spanned](crate::ast::Spanned) for more information.
#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    /// Location of the first character of the span.
    pub start: Location,
    /// Location just past the span's last character (see the
    /// [`TokenWithSpan`] examples).
    pub end: Location,
}
564
565impl fmt::Debug for Span {
566    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
567        write!(f, "Span({:?}..{:?})", self.start, self.end)
568    }
569}
570
impl Span {
    // An empty span (0, 0) -> (0, 0)
    // We need a const instance for pattern matching (see `union` below)
    const EMPTY: Span = Self::empty();

    /// Create a new span from a start and end [`Location`]
    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    /// Returns an empty span `(0, 0) -> (0, 0)`
    ///
    /// Empty spans represent no knowledge of source location
    /// See [Spanned](crate::ast::Spanned) for more information.
    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }

    /// Returns the smallest Span that contains both `self` and `other`
    /// If either span is [Span::empty], the other span is returned
    ///
    /// # Examples
    /// ```
    /// # use sqltk_parser::tokenizer::{Span, Location};
    /// // line 1, column1 -> line 2, column 5
    /// let span1 = Span::new(Location::new(1, 1), Location::new(2, 5));
    /// // line 2, column 3 -> line 3, column 7
    /// let span2 = Span::new(Location::new(2, 3), Location::new(3, 7));
    /// // Union of the two is the min/max of the two spans
    /// // line 1, column 1 -> line 3, column 7
    /// let union = span1.union(&span2);
    /// assert_eq!(union, Span::new(Location::new(1, 1), Location::new(3, 7)));
    /// ```
    pub fn union(&self, other: &Span) -> Span {
        // If either span is empty, return the other
        // this prevents propagating (0, 0) through the tree
        match (self, other) {
            (&Span::EMPTY, _) => *other,
            (_, &Span::EMPTY) => *self,
            _ => Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            },
        }
    }

    /// Same as [Span::union] for `Option<Span>`
    ///
    /// If `other` is `None`, `self` is returned
    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        match other {
            Some(other) => self.union(other),
            None => *self,
        }
    }

    /// Return the [Span::union] of all spans in the iterator
    ///
    /// If the iterator is empty, an empty span is returned
    ///
    /// # Example
    /// ```
    /// # use sqltk_parser::tokenizer::{Span, Location};
    /// let spans = vec![
    ///     Span::new(Location::new(1, 1), Location::new(2, 5)),
    ///     Span::new(Location::new(2, 3), Location::new(3, 7)),
    ///     Span::new(Location::new(3, 1), Location::new(4, 2)),
    /// ];
    /// // line 1, column 1 -> line 4, column 2
    /// assert_eq!(
    ///   Span::union_iter(spans),
    ///   Span::new(Location::new(1, 1), Location::new(4, 2))
    /// );
    /// ```
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        iter.into_iter()
            .reduce(|acc, item| acc.union(&item))
            .unwrap_or(Span::empty())
    }
}
653
/// Backwards compatibility type alias for [`TokenWithSpan`]
#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;
657
/// A [Token] with [Span] attached to it
///
/// This is used to track the location of a token in the input string
///
/// # Examples
/// ```
/// # use sqltk_parser::tokenizer::{Location, Span, Token, TokenWithSpan};
/// // commas @ line 1, column 10
/// let tok1 = TokenWithSpan::new(
///   Token::Comma,
///   Span::new(Location::new(1, 10), Location::new(1, 11)),
/// );
/// assert_eq!(tok1, Token::Comma); // can compare the token
///
/// // commas @ line 2, column 20
/// let tok2 = TokenWithSpan::new(
///   Token::Comma,
///   Span::new(Location::new(2, 20), Location::new(2, 21)),
/// );
/// // same token but different locations are not equal
/// assert_ne!(tok1, tok2);
/// ```
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    /// The token itself.
    pub token: Token,
    /// Where the token appeared in the input string.
    pub span: Span,
}
687
688impl TokenWithSpan {
689    /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`]
690    pub fn new(token: Token, span: Span) -> Self {
691        Self { token, span }
692    }
693
694    /// Wrap a token with an empty span
695    pub fn wrap(token: Token) -> Self {
696        Self::new(token, Span::empty())
697    }
698
699    /// Wrap a token with a location from `start` to `end`
700    pub fn at(token: Token, start: Location, end: Location) -> Self {
701        Self::new(token, Span::new(start, end))
702    }
703
704    /// Return an EOF token with no location
705    pub fn new_eof() -> Self {
706        Self::wrap(Token::EOF)
707    }
708}
709
710impl PartialEq<Token> for TokenWithSpan {
711    fn eq(&self, other: &Token) -> bool {
712        &self.token == other
713    }
714}
715
716impl PartialEq<TokenWithSpan> for Token {
717    fn eq(&self, other: &TokenWithSpan) -> bool {
718        self == &other.token
719    }
720}
721
722impl fmt::Display for TokenWithSpan {
723    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
724        self.token.fmt(f)
725    }
726}
727
/// Tokenizer error
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// Position in the input where the error was detected.
    pub location: Location,
}
734
735impl fmt::Display for TokenizerError {
736    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
737        write!(f, "{}{}", self.message, self.location,)
738    }
739}
740
// `std::error::Error` is gated on the `std` feature since this crate
// otherwise builds with `core`/`alloc` only (see the no_std imports above).
#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}
743
/// Peekable character stream over the query string that tracks the current
/// line and column for error reporting.
struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    /// Current line number (incremented on each consumed `\n`).
    pub line: u64,
    /// Current column number (reset to 1 after each consumed `\n`).
    pub col: u64,
}
749
750impl State<'_> {
751    /// return the next character and advance the stream
752    pub fn next(&mut self) -> Option<char> {
753        match self.peekable.next() {
754            None => None,
755            Some(s) => {
756                if s == '\n' {
757                    self.line += 1;
758                    self.col = 1;
759                } else {
760                    self.col += 1;
761                }
762                Some(s)
763            }
764        }
765    }
766
767    /// return the next character but do not advance the stream
768    pub fn peek(&mut self) -> Option<&char> {
769        self.peekable.peek()
770    }
771
772    pub fn location(&self) -> Location {
773        Location {
774            line: self.line,
775            column: self.col,
776        }
777    }
778}
779
/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// A single quote character on each side, e.g. `"abc"`, `'abc'`, `r'abc'`
    One,
    /// Multiple quote characters on each side, e.g. `"""abc"""`, `'''abc'''`,
    /// `r'''abc'''`. The `NonZeroU8` guarantees the count is never zero.
    Many(NonZeroU8),
}
788
/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
    /// The character used to quote the string.
    quote_style: char,
    /// Represents how many quote characters enclose the string literal.
    num_quote_chars: NumStringQuoteChars,
    /// The number of opening quotes left to consume, before parsing
    /// the remaining string literal.
    /// For example: given initial string `"""abc"""`. If the caller has
    /// already parsed the first quote for some reason, then this value
    /// is set to 1, flagging to look to consume only 2 leading quotes.
    num_opening_quotes_to_consume: u8,
    /// True if the string uses backslash escaping of special characters,
    /// e.g. `'abc\ndef\'ghi'`
    backslash_escape: bool,
}
805
/// SQL Tokenizer
pub struct Tokenizer<'a> {
    /// The SQL dialect that drives dialect-specific tokenization rules.
    dialect: &'a dyn Dialect,
    /// The SQL text being tokenized.
    query: &'a str,
    /// If true (the default), the tokenizer will un-escape literal
    /// SQL strings. See [`Tokenizer::with_unescape`] for more details.
    unescape: bool,
}
814
815impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement
    ///
    /// ```
    /// # use sqltk_parser::tokenizer::{Token, Whitespace, Tokenizer};
    /// # use sqltk_parser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let query = r#"SELECT 'foo'"#;
    ///
    /// // Parsing the query
    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
    ///
    /// assert_eq!(tokens, vec![
    ///   Token::make_word("SELECT", None),
    ///   Token::Whitespace(Whitespace::Space),
    ///   Token::SingleQuotedString("foo".to_string()),
    /// ]);
    /// ```
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }
839
840    /// Set unescape mode
841    ///
842    /// When true (default) the tokenizer unescapes literal values
843    /// (for example, `""` in SQL is unescaped to the literal `"`).
844    ///
845    /// When false, the tokenizer provides the raw strings as provided
846    /// in the query.  This can be helpful for programs that wish to
847    /// recover the *exact* original query text without normalizing
848    /// the escaping
849    ///
850    /// # Example
851    ///
852    /// ```
853    /// # use sqltk_parser::tokenizer::{Token, Tokenizer};
854    /// # use sqltk_parser::dialect::GenericDialect;
855    /// # let dialect = GenericDialect{};
856    /// let query = r#""Foo "" Bar""#;
857    /// let unescaped = Token::make_word(r#"Foo " Bar"#, Some('"'));
858    /// let original  = Token::make_word(r#"Foo "" Bar"#, Some('"'));
859    ///
860    /// // Parsing with unescaping (default)
861    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
862    /// assert_eq!(tokens, vec![unescaped]);
863    ///
864    /// // Parsing with unescape = false
865    /// let tokens = Tokenizer::new(&dialect, &query)
866    ///    .with_unescape(false)
867    ///    .tokenize().unwrap();
868    /// assert_eq!(tokens, vec![original]);
869    /// ```
870    pub fn with_unescape(mut self, unescape: bool) -> Self {
871        self.unescape = unescape;
872        self
873    }
874
875    /// Tokenize the statement and produce a vector of tokens
876    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
877        let twl = self.tokenize_with_location()?;
878        Ok(twl.into_iter().map(|t| t.token).collect())
879    }
880
881    /// Tokenize the statement and produce a vector of tokens with location information
882    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
883        let mut tokens: Vec<TokenWithSpan> = vec![];
884        self.tokenize_with_location_into_buf(&mut tokens)
885            .map(|_| tokens)
886    }
887
888    /// Tokenize the statement and append tokens with location information into the provided buffer.
889    /// If an error is thrown, the buffer will contain all tokens that were successfully parsed before the error.
890    pub fn tokenize_with_location_into_buf(
891        &mut self,
892        buf: &mut Vec<TokenWithSpan>,
893    ) -> Result<(), TokenizerError> {
894        let mut state = State {
895            peekable: self.query.chars().peekable(),
896            line: 1,
897            col: 1,
898        };
899
900        let mut location = state.location();
901        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
902            let span = location.span_to(state.location());
903
904            buf.push(TokenWithSpan { token, span });
905
906            location = state.location();
907        }
908        Ok(())
909    }
910
911    // Tokenize the identifier or keywords in `ch`
912    fn tokenize_identifier_or_keyword(
913        &self,
914        ch: impl IntoIterator<Item = char>,
915        chars: &mut State,
916    ) -> Result<Option<Token>, TokenizerError> {
917        chars.next(); // consume the first char
918        let ch: String = ch.into_iter().collect();
919        let word = self.tokenize_word(ch, chars);
920
921        // TODO: implement parsing of exponent here
922        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
923            let mut inner_state = State {
924                peekable: word.chars().peekable(),
925                line: 0,
926                col: 0,
927            };
928            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
929            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
930            s += s2.as_str();
931            return Ok(Some(Token::Number(s, false)));
932        }
933
934        Ok(Some(Token::make_word(&word, None)))
935    }
936
937    /// Get the next token or return None
938    fn next_token(
939        &self,
940        chars: &mut State,
941        prev_token: Option<&Token>,
942    ) -> Result<Option<Token>, TokenizerError> {
943        match chars.peek() {
944            Some(&ch) => match ch {
945                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
946                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
947                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
948                '\r' => {
949                    // Emit a single Whitespace::Newline token for \r and \r\n
950                    chars.next();
951                    if let Some('\n') = chars.peek() {
952                        chars.next();
953                    }
954                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
955                }
956                // BigQuery and MySQL use b or B for byte string literal, Postgres for bit strings
957                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
958                {
959                    chars.next(); // consume
960                    match chars.peek() {
961                        Some('\'') => {
962                            if self.dialect.supports_triple_quoted_string() {
963                                return self
964                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
965                                        chars,
966                                        '\'',
967                                        false,
968                                        Token::SingleQuotedByteStringLiteral,
969                                        Token::TripleSingleQuotedByteStringLiteral,
970                                    );
971                            }
972                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
973                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
974                        }
975                        Some('\"') => {
976                            if self.dialect.supports_triple_quoted_string() {
977                                return self
978                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
979                                        chars,
980                                        '"',
981                                        false,
982                                        Token::DoubleQuotedByteStringLiteral,
983                                        Token::TripleDoubleQuotedByteStringLiteral,
984                                    );
985                            }
986                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
987                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
988                        }
989                        _ => {
990                            // regular identifier starting with an "b" or "B"
991                            let s = self.tokenize_word(b, chars);
992                            Ok(Some(Token::make_word(&s, None)))
993                        }
994                    }
995                }
996                // BigQuery uses r or R for raw string literal
997                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
998                    chars.next(); // consume
999                    match chars.peek() {
1000                        Some('\'') => self
1001                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1002                                chars,
1003                                '\'',
1004                                false,
1005                                Token::SingleQuotedRawStringLiteral,
1006                                Token::TripleSingleQuotedRawStringLiteral,
1007                            ),
1008                        Some('\"') => self
1009                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1010                                chars,
1011                                '"',
1012                                false,
1013                                Token::DoubleQuotedRawStringLiteral,
1014                                Token::TripleDoubleQuotedRawStringLiteral,
1015                            ),
1016                        _ => {
1017                            // regular identifier starting with an "r" or "R"
1018                            let s = self.tokenize_word(b, chars);
1019                            Ok(Some(Token::make_word(&s, None)))
1020                        }
1021                    }
1022                }
1023                // Redshift uses lower case n for national string literal
1024                n @ 'N' | n @ 'n' => {
1025                    chars.next(); // consume, to check the next char
1026                    match chars.peek() {
1027                        Some('\'') => {
1028                            // N'...' - a <national character string literal>
1029                            let backslash_escape =
1030                                self.dialect.supports_string_literal_backslash_escape();
1031                            let s =
1032                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
1033                            Ok(Some(Token::NationalStringLiteral(s)))
1034                        }
1035                        _ => {
1036                            // regular identifier starting with an "N"
1037                            let s = self.tokenize_word(n, chars);
1038                            Ok(Some(Token::make_word(&s, None)))
1039                        }
1040                    }
1041                }
1042                // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
1043                x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
1044                    let starting_loc = chars.location();
1045                    chars.next(); // consume, to check the next char
1046                    match chars.peek() {
1047                        Some('\'') => {
1048                            let s =
1049                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
1050                            Ok(Some(Token::EscapedStringLiteral(s)))
1051                        }
1052                        _ => {
1053                            // regular identifier starting with an "E" or "e"
1054                            let s = self.tokenize_word(x, chars);
1055                            Ok(Some(Token::make_word(&s, None)))
1056                        }
1057                    }
1058                }
1059                // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
1060                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
1061                    chars.next(); // consume, to check the next char
1062                    if chars.peek() == Some(&'&') {
1063                        // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
1064                        let mut chars_clone = chars.peekable.clone();
1065                        chars_clone.next(); // consume the '&' in the clone
1066                        if chars_clone.peek() == Some(&'\'') {
1067                            chars.next(); // consume the '&' in the original iterator
1068                            let s = unescape_unicode_single_quoted_string(chars)?;
1069                            return Ok(Some(Token::UnicodeStringLiteral(s)));
1070                        }
1071                    }
1072                    // regular identifier starting with an "U" or "u"
1073                    let s = self.tokenize_word(x, chars);
1074                    Ok(Some(Token::make_word(&s, None)))
1075                }
1076                // The spec only allows an uppercase 'X' to introduce a hex
1077                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
1078                x @ 'x' | x @ 'X' => {
1079                    chars.next(); // consume, to check the next char
1080                    match chars.peek() {
1081                        Some('\'') => {
1082                            // X'...' - a <binary string literal>
1083                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
1084                            Ok(Some(Token::HexStringLiteral(s)))
1085                        }
1086                        _ => {
1087                            // regular identifier starting with an "X"
1088                            let s = self.tokenize_word(x, chars);
1089                            Ok(Some(Token::make_word(&s, None)))
1090                        }
1091                    }
1092                }
1093                // single quoted string
1094                '\'' => {
1095                    if self.dialect.supports_triple_quoted_string() {
1096                        return self
1097                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1098                                chars,
1099                                '\'',
1100                                self.dialect.supports_string_literal_backslash_escape(),
1101                                Token::SingleQuotedString,
1102                                Token::TripleSingleQuotedString,
1103                            );
1104                    }
1105                    let s = self.tokenize_single_quoted_string(
1106                        chars,
1107                        '\'',
1108                        self.dialect.supports_string_literal_backslash_escape(),
1109                    )?;
1110
1111                    Ok(Some(Token::SingleQuotedString(s)))
1112                }
1113                // double quoted string
1114                '\"' if !self.dialect.is_delimited_identifier_start(ch)
1115                    && !self.dialect.is_identifier_start(ch) =>
1116                {
1117                    if self.dialect.supports_triple_quoted_string() {
1118                        return self
1119                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1120                                chars,
1121                                '"',
1122                                self.dialect.supports_string_literal_backslash_escape(),
1123                                Token::DoubleQuotedString,
1124                                Token::TripleDoubleQuotedString,
1125                            );
1126                    }
1127                    let s = self.tokenize_single_quoted_string(
1128                        chars,
1129                        '"',
1130                        self.dialect.supports_string_literal_backslash_escape(),
1131                    )?;
1132
1133                    Ok(Some(Token::DoubleQuotedString(s)))
1134                }
1135                // delimited (quoted) identifier
1136                quote_start if self.dialect.is_delimited_identifier_start(ch) => {
1137                    let word = self.tokenize_quoted_identifier(quote_start, chars)?;
1138                    Ok(Some(Token::make_word(&word, Some(quote_start))))
1139                }
1140                // Potentially nested delimited (quoted) identifier
1141                quote_start
1142                    if self
1143                        .dialect
1144                        .is_nested_delimited_identifier_start(quote_start)
1145                        && self
1146                            .dialect
1147                            .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
1148                            .is_some() =>
1149                {
1150                    let Some((quote_start, nested_quote_start)) = self
1151                        .dialect
1152                        .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
1153                    else {
1154                        return self.tokenizer_error(
1155                            chars.location(),
1156                            format!("Expected nested delimiter '{quote_start}' before EOF."),
1157                        );
1158                    };
1159
1160                    let Some(nested_quote_start) = nested_quote_start else {
1161                        let word = self.tokenize_quoted_identifier(quote_start, chars)?;
1162                        return Ok(Some(Token::make_word(&word, Some(quote_start))));
1163                    };
1164
1165                    let mut word = vec![];
1166                    let quote_end = Word::matching_end_quote(quote_start);
1167                    let nested_quote_end = Word::matching_end_quote(nested_quote_start);
1168                    let error_loc = chars.location();
1169
1170                    chars.next(); // skip the first delimiter
1171                    peeking_take_while(chars, |ch| ch.is_whitespace());
1172                    if chars.peek() != Some(&nested_quote_start) {
1173                        return self.tokenizer_error(
1174                            error_loc,
1175                            format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
1176                        );
1177                    }
1178                    word.push(nested_quote_start.into());
1179                    word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
1180                    word.push(nested_quote_end.into());
1181                    peeking_take_while(chars, |ch| ch.is_whitespace());
1182                    if chars.peek() != Some(&quote_end) {
1183                        return self.tokenizer_error(
1184                            error_loc,
1185                            format!("Expected close delimiter '{quote_end}' before EOF."),
1186                        );
1187                    }
1188                    chars.next(); // skip close delimiter
1189
1190                    Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
1191                }
1192                // numbers and period
1193                '0'..='9' | '.' => {
1194                    // Some dialects support underscore as number separator
1195                    // There can only be one at a time and it must be followed by another digit
1196                    let is_number_separator = |ch: char, next_char: Option<char>| {
1197                        self.dialect.supports_numeric_literal_underscores()
1198                            && ch == '_'
1199                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
1200                    };
1201
1202                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
1203                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1204                    });
1205
1206                    // match binary literal that starts with 0x
1207                    if s == "0" && chars.peek() == Some(&'x') {
1208                        chars.next();
1209                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
1210                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
1211                        });
1212                        return Ok(Some(Token::HexStringLiteral(s2)));
1213                    }
1214
1215                    // match one period
1216                    if let Some('.') = chars.peek() {
1217                        s.push('.');
1218                        chars.next();
1219                    }
1220
1221                    // If the dialect supports identifiers that start with a numeric prefix
1222                    // and we have now consumed a dot, check if the previous token was a Word.
1223                    // If so, what follows is definitely not part of a decimal number and
1224                    // we should yield the dot as a dedicated token so compound identifiers
1225                    // starting with digits can be parsed correctly.
1226                    if s == "." && self.dialect.supports_numeric_prefix() {
1227                        if let Some(Token::Word(_)) = prev_token {
1228                            return Ok(Some(Token::Period));
1229                        }
1230                    }
1231
1232                    // Consume fractional digits.
1233                    s += &peeking_next_take_while(chars, |ch, next_ch| {
1234                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1235                    });
1236
1237                    // No fraction -> Token::Period
1238                    if s == "." {
1239                        return Ok(Some(Token::Period));
1240                    }
1241
1242                    // Parse exponent as number
1243                    let mut exponent_part = String::new();
1244                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
1245                        let mut char_clone = chars.peekable.clone();
1246                        exponent_part.push(char_clone.next().unwrap());
1247
1248                        // Optional sign
1249                        match char_clone.peek() {
1250                            Some(&c) if matches!(c, '+' | '-') => {
1251                                exponent_part.push(c);
1252                                char_clone.next();
1253                            }
1254                            _ => (),
1255                        }
1256
1257                        match char_clone.peek() {
1258                            // Definitely an exponent, get original iterator up to speed and use it
1259                            Some(&c) if c.is_ascii_digit() => {
1260                                for _ in 0..exponent_part.len() {
1261                                    chars.next();
1262                                }
1263                                exponent_part +=
1264                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
1265                                s += exponent_part.as_str();
1266                            }
1267                            // Not an exponent, discard the work done
1268                            _ => (),
1269                        }
1270                    }
1271
1272                    // If the dialect supports identifiers that start with a numeric prefix,
1273                    // we need to check if the value is in fact an identifier and must thus
1274                    // be tokenized as a word.
1275                    if self.dialect.supports_numeric_prefix() {
1276                        if exponent_part.is_empty() {
1277                            // If it is not a number with an exponent, it may be
1278                            // an identifier starting with digits.
1279                            let word =
1280                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
1281
1282                            if !word.is_empty() {
1283                                s += word.as_str();
1284                                return Ok(Some(Token::make_word(s.as_str(), None)));
1285                            }
1286                        } else if prev_token == Some(&Token::Period) {
1287                            // If the previous token was a period, thus not belonging to a number,
1288                            // the value we have is part of an identifier.
1289                            return Ok(Some(Token::make_word(s.as_str(), None)));
1290                        }
1291                    }
1292
1293                    let long = if chars.peek() == Some(&'L') {
1294                        chars.next();
1295                        true
1296                    } else {
1297                        false
1298                    };
1299                    Ok(Some(Token::Number(s, long)))
1300                }
1301                // punctuation
1302                '(' => self.consume_and_return(chars, Token::LParen),
1303                ')' => self.consume_and_return(chars, Token::RParen),
1304                ',' => self.consume_and_return(chars, Token::Comma),
1305                // operators
1306                '-' => {
1307                    chars.next(); // consume the '-'
1308
1309                    match chars.peek() {
1310                        Some('-') => {
1311                            let mut is_comment = true;
1312                            if self.dialect.requires_single_line_comment_whitespace() {
1313                                is_comment = Some(' ') == chars.peekable.clone().nth(1);
1314                            }
1315
1316                            if is_comment {
1317                                chars.next(); // consume second '-'
1318                                let comment = self.tokenize_single_line_comment(chars);
1319                                return Ok(Some(Token::Whitespace(
1320                                    Whitespace::SingleLineComment {
1321                                        prefix: "--".to_owned(),
1322                                        comment,
1323                                    },
1324                                )));
1325                            }
1326
1327                            self.start_binop(chars, "-", Token::Minus)
1328                        }
1329                        Some('>') => {
1330                            chars.next();
1331                            match chars.peek() {
1332                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
1333                                _ => self.start_binop(chars, "->", Token::Arrow),
1334                            }
1335                        }
1336                        // a regular '-' operator
1337                        _ => self.start_binop(chars, "-", Token::Minus),
1338                    }
1339                }
1340                '/' => {
1341                    chars.next(); // consume the '/'
1342                    match chars.peek() {
1343                        Some('*') => {
1344                            chars.next(); // consume the '*', starting a multi-line comment
1345                            self.tokenize_multiline_comment(chars)
1346                        }
1347                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
1348                            chars.next(); // consume the second '/', starting a snowflake single-line comment
1349                            let comment = self.tokenize_single_line_comment(chars);
1350                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1351                                prefix: "//".to_owned(),
1352                                comment,
1353                            })))
1354                        }
1355                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
1356                            self.consume_and_return(chars, Token::DuckIntDiv)
1357                        }
1358                        // a regular '/' operator
1359                        _ => Ok(Some(Token::Div)),
1360                    }
1361                }
1362                '+' => self.consume_and_return(chars, Token::Plus),
1363                '*' => self.consume_and_return(chars, Token::Mul),
1364                '%' => {
1365                    chars.next(); // advance past '%'
1366                    match chars.peek() {
1367                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
1368                        Some(sch) if self.dialect.is_identifier_start('%') => {
1369                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1370                        }
1371                        _ => self.start_binop(chars, "%", Token::Mod),
1372                    }
1373                }
1374                '|' => {
1375                    chars.next(); // consume the '|'
1376                    match chars.peek() {
1377                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
1378                        Some('|') => {
1379                            chars.next(); // consume the second '|'
1380                            match chars.peek() {
1381                                Some('/') => {
1382                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
1383                                }
1384                                _ => self.start_binop(chars, "||", Token::StringConcat),
1385                            }
1386                        }
1387                        Some('&') if self.dialect.supports_geometric_types() => {
1388                            chars.next(); // consume
1389                            match chars.peek() {
1390                                Some('>') => self.consume_for_binop(
1391                                    chars,
1392                                    "|&>",
1393                                    Token::VerticalBarAmpersandRightAngleBracket,
1394                                ),
1395                                _ => self.start_binop_opt(chars, "|&", None),
1396                            }
1397                        }
1398                        Some('>') if self.dialect.supports_geometric_types() => {
1399                            chars.next(); // consume
1400                            match chars.peek() {
1401                                Some('>') => self.consume_for_binop(
1402                                    chars,
1403                                    "|>>",
1404                                    Token::VerticalBarShiftRight,
1405                                ),
1406                                _ => self.start_binop_opt(chars, "|>", None),
1407                            }
1408                        }
1409                        Some('>') if self.dialect.supports_pipe_operator() => {
1410                            self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
1411                        }
1412                        // Bitshift '|' operator
1413                        _ => self.start_binop(chars, "|", Token::Pipe),
1414                    }
1415                }
1416                '=' => {
1417                    chars.next(); // consume
1418                    match chars.peek() {
1419                        Some('>') => self.consume_and_return(chars, Token::RArrow),
1420                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
1421                        _ => Ok(Some(Token::Eq)),
1422                    }
1423                }
1424                '!' => {
1425                    chars.next(); // consume
1426                    match chars.peek() {
1427                        Some('=') => self.consume_and_return(chars, Token::Neq),
1428                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
1429                        Some('~') => {
1430                            chars.next();
1431                            match chars.peek() {
1432                                Some('*') => self
1433                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
1434                                Some('~') => {
1435                                    chars.next();
1436                                    match chars.peek() {
1437                                        Some('*') => self.consume_and_return(
1438                                            chars,
1439                                            Token::ExclamationMarkDoubleTildeAsterisk,
1440                                        ),
1441                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
1442                                    }
1443                                }
1444                                _ => Ok(Some(Token::ExclamationMarkTilde)),
1445                            }
1446                        }
1447                        _ => Ok(Some(Token::ExclamationMark)),
1448                    }
1449                }
1450                '<' => {
1451                    chars.next(); // consume
1452                    match chars.peek() {
1453                        Some('=') => {
1454                            chars.next();
1455                            match chars.peek() {
1456                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
1457                                _ => self.start_binop(chars, "<=", Token::LtEq),
1458                            }
1459                        }
1460                        Some('|') if self.dialect.supports_geometric_types() => {
1461                            self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
1462                        }
1463                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
1464                        Some('<') if self.dialect.supports_geometric_types() => {
1465                            chars.next(); // consume
1466                            match chars.peek() {
1467                                Some('|') => self.consume_for_binop(
1468                                    chars,
1469                                    "<<|",
1470                                    Token::ShiftLeftVerticalBar,
1471                                ),
1472                                _ => self.start_binop(chars, "<<", Token::ShiftLeft),
1473                            }
1474                        }
1475                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
1476                        Some('-') if self.dialect.supports_geometric_types() => {
1477                            chars.next(); // consume
1478                            match chars.peek() {
1479                                Some('>') => {
1480                                    self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
1481                                }
1482                                _ => self.start_binop_opt(chars, "<-", None),
1483                            }
1484                        }
1485                        Some('^') if self.dialect.supports_geometric_types() => {
1486                            self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
1487                        }
1488                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
1489                        _ => self.start_binop(chars, "<", Token::Lt),
1490                    }
1491                }
1492                '>' => {
1493                    chars.next(); // consume
1494                    match chars.peek() {
1495                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
1496                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
1497                        Some('^') if self.dialect.supports_geometric_types() => {
1498                            self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
1499                        }
1500                        _ => self.start_binop(chars, ">", Token::Gt),
1501                    }
1502                }
1503                ':' => {
1504                    chars.next();
1505                    match chars.peek() {
1506                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
1507                        Some('=') => self.consume_and_return(chars, Token::Assignment),
1508                        _ => Ok(Some(Token::Colon)),
1509                    }
1510                }
1511                ';' => self.consume_and_return(chars, Token::SemiColon),
1512                '\\' => self.consume_and_return(chars, Token::Backslash),
1513                '[' => self.consume_and_return(chars, Token::LBracket),
1514                ']' => self.consume_and_return(chars, Token::RBracket),
1515                '&' => {
1516                    chars.next(); // consume the '&'
1517                    match chars.peek() {
1518                        Some('>') if self.dialect.supports_geometric_types() => {
1519                            chars.next();
1520                            self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
1521                        }
1522                        Some('<') if self.dialect.supports_geometric_types() => {
1523                            chars.next(); // consume
1524                            match chars.peek() {
1525                                Some('|') => self.consume_and_return(
1526                                    chars,
1527                                    Token::AmpersandLeftAngleBracketVerticalBar,
1528                                ),
1529                                _ => {
1530                                    self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
1531                                }
1532                            }
1533                        }
1534                        Some('&') => {
1535                            chars.next(); // consume the second '&'
1536                            self.start_binop(chars, "&&", Token::Overlap)
1537                        }
1538                        // Bitshift '&' operator
1539                        _ => self.start_binop(chars, "&", Token::Ampersand),
1540                    }
1541                }
1542                '^' => {
1543                    chars.next(); // consume the '^'
1544                    match chars.peek() {
1545                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
1546                        _ => Ok(Some(Token::Caret)),
1547                    }
1548                }
1549                '{' => self.consume_and_return(chars, Token::LBrace),
1550                '}' => self.consume_and_return(chars, Token::RBrace),
1551                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
1552                {
1553                    chars.next(); // consume the '#', starting a snowflake single-line comment
1554                    let comment = self.tokenize_single_line_comment(chars);
1555                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
1556                        prefix: "#".to_owned(),
1557                        comment,
1558                    })))
1559                }
1560                '~' => {
1561                    chars.next(); // consume
1562                    match chars.peek() {
1563                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
1564                        Some('=') if self.dialect.supports_geometric_types() => {
1565                            self.consume_for_binop(chars, "~=", Token::TildeEqual)
1566                        }
1567                        Some('~') => {
1568                            chars.next();
1569                            match chars.peek() {
1570                                Some('*') => {
1571                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
1572                                }
1573                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
1574                            }
1575                        }
1576                        _ => self.start_binop(chars, "~", Token::Tilde),
1577                    }
1578                }
1579                '#' => {
1580                    chars.next();
1581                    match chars.peek() {
1582                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
1583                        Some('>') => {
1584                            chars.next();
1585                            match chars.peek() {
1586                                Some('>') => {
1587                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
1588                                }
1589                                _ => self.start_binop(chars, "#>", Token::HashArrow),
1590                            }
1591                        }
1592                        Some(' ') => Ok(Some(Token::Sharp)),
1593                        Some('#') if self.dialect.supports_geometric_types() => {
1594                            self.consume_for_binop(chars, "##", Token::DoubleSharp)
1595                        }
1596                        Some(sch) if self.dialect.is_identifier_start('#') => {
1597                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1598                        }
1599                        _ => self.start_binop(chars, "#", Token::Sharp),
1600                    }
1601                }
1602                '@' => {
1603                    chars.next();
1604                    match chars.peek() {
1605                        Some('@') if self.dialect.supports_geometric_types() => {
1606                            self.consume_and_return(chars, Token::AtAt)
1607                        }
1608                        Some('-') if self.dialect.supports_geometric_types() => {
1609                            chars.next();
1610                            match chars.peek() {
1611                                Some('@') => self.consume_and_return(chars, Token::AtDashAt),
1612                                _ => self.start_binop_opt(chars, "@-", None),
1613                            }
1614                        }
1615                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
1616                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
1617                        Some('@') => {
1618                            chars.next();
1619                            match chars.peek() {
1620                                Some(' ') => Ok(Some(Token::AtAt)),
1621                                Some(tch) if self.dialect.is_identifier_start('@') => {
1622                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
1623                                }
1624                                _ => Ok(Some(Token::AtAt)),
1625                            }
1626                        }
1627                        Some(' ') => Ok(Some(Token::AtSign)),
1628                        // We break on quotes here, because no dialect allows identifiers starting
1629                        // with @ and containing quotation marks (e.g. `@'foo'`) unless they are
1630                        // quoted, which is tokenized as a quoted string, not here (e.g.
1631                        // `"@'foo'"`). Further, at least two dialects parse `@` followed by a
1632                        // quoted string as two separate tokens, which this allows. For example,
1633                        // Postgres parses `@'1'` as the absolute value of '1' which is implicitly
1634                        // cast to a numeric type. And when parsing MySQL-style grantees (e.g.
1635                        // `GRANT ALL ON *.* to 'root'@'localhost'`), we also want separate tokens
1636                        // for the user, the `@`, and the host.
1637                        Some('\'') => Ok(Some(Token::AtSign)),
1638                        Some('\"') => Ok(Some(Token::AtSign)),
1639                        Some('`') => Ok(Some(Token::AtSign)),
1640                        Some(sch) if self.dialect.is_identifier_start('@') => {
1641                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
1642                        }
1643                        _ => Ok(Some(Token::AtSign)),
1644                    }
1645                }
1646                // Postgres uses ? for jsonb operators, not prepared statements
1647                '?' if self.dialect.supports_geometric_types() => {
1648                    chars.next(); // consume
1649                    match chars.peek() {
1650                        Some('|') => {
1651                            chars.next();
1652                            match chars.peek() {
1653                                Some('|') => self.consume_and_return(
1654                                    chars,
1655                                    Token::QuestionMarkDoubleVerticalBar,
1656                                ),
1657                                _ => Ok(Some(Token::QuestionPipe)),
1658                            }
1659                        }
1660
1661                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
1662                        Some('-') => {
1663                            chars.next(); // consume
1664                            match chars.peek() {
1665                                Some('|') => self
1666                                    .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
1667                                _ => Ok(Some(Token::QuestionMarkDash)),
1668                            }
1669                        }
1670                        Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
1671                        _ => self.consume_and_return(chars, Token::Question),
1672                    }
1673                }
1674                '?' => {
1675                    chars.next();
1676                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
1677                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
1678                }
1679
1680                // identifier or keyword
1681                ch if self.dialect.is_identifier_start(ch) => {
1682                    self.tokenize_identifier_or_keyword([ch], chars)
1683                }
1684                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
1685
1686                // whitespace check (including unicode chars) should be last as it covers some of the chars above
1687                ch if ch.is_whitespace() => {
1688                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
1689                }
1690                other => self.consume_and_return(chars, Token::Char(other)),
1691            },
1692            None => Ok(None),
1693        }
1694    }
1695
1696    /// Consume the next character, then parse a custom binary operator. The next character should be included in the prefix
1697    fn consume_for_binop(
1698        &self,
1699        chars: &mut State,
1700        prefix: &str,
1701        default: Token,
1702    ) -> Result<Option<Token>, TokenizerError> {
1703        chars.next(); // consume the first char
1704        self.start_binop_opt(chars, prefix, Some(default))
1705    }
1706
1707    /// parse a custom binary operator
1708    fn start_binop(
1709        &self,
1710        chars: &mut State,
1711        prefix: &str,
1712        default: Token,
1713    ) -> Result<Option<Token>, TokenizerError> {
1714        self.start_binop_opt(chars, prefix, Some(default))
1715    }
1716
1717    /// parse a custom binary operator
1718    fn start_binop_opt(
1719        &self,
1720        chars: &mut State,
1721        prefix: &str,
1722        default: Option<Token>,
1723    ) -> Result<Option<Token>, TokenizerError> {
1724        let mut custom = None;
1725        while let Some(&ch) = chars.peek() {
1726            if !self.dialect.is_custom_operator_part(ch) {
1727                break;
1728            }
1729
1730            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
1731            chars.next();
1732        }
1733        match (custom, default) {
1734            (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
1735            (None, Some(tok)) => Ok(Some(tok)),
1736            (None, None) => self.tokenizer_error(
1737                chars.location(),
1738                format!("Expected a valid binary operator after '{}'", prefix),
1739            ),
1740        }
1741    }
1742
    /// Tokenize a dollar-preceded value: either a dollar-quoted string
    /// (`$$body$$` or `$tag$body$tag$`) or a placeholder (`$1`, `$name`,
    /// or — when the dialect supports it — `$$`).
    ///
    /// Called with `chars` positioned on the leading `$`, which is consumed
    /// immediately. `s` accumulates the quoted string body; `value`
    /// accumulates the tag (or the placeholder name).
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        chars.next();

        // `$$...`: an untagged dollar-quoted string — unless the dialect
        // treats `$$` as a placeholder, in which case fall through below.
        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
            chars.next();

            // Scan for the closing `$$`. `prev` tracks the previous char so a
            // `$` is only treated as a terminator when followed by another `$`.
            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        // Second `$` of the closing delimiter.
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        // Lone `$` inside the body: emit it (it was withheld
                        // on the previous iteration) plus the current char.
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            // EOF before `$$` was seen → unterminated literal.
            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Read the identifier-like run after `$`: either a quote tag
            // (`$tag$`) or a placeholder name/number.
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric()
                    || ch == '_'
                    // Allow $ as a placeholder character if the dialect supports it
                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
            }));

            // A following `$` starts a tagged dollar-quoted string body.
            // If the dialect does not support dollar-quoted strings, don't look for the end delimiter.
            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
                chars.next();

                // Consume until the full `$tag$` end delimiter appears.
                let mut temp = String::new();
                let end_delimiter = format!("${}$", value);

                loop {
                    match chars.next() {
                        Some(ch) => {
                            temp.push(ch);

                            if temp.ends_with(&end_delimiter) {
                                // Strip the delimiter off the accumulated body.
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }
                        }
                        None => {
                            // EOF: accept only if the delimiter was the very
                            // last thing consumed; otherwise it's an error.
                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }

                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted, expected $",
                            );
                        }
                    }
                }
            } else {
                // No opening tag delimiter: this is a plain placeholder.
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }
1835
1836    fn tokenizer_error<R>(
1837        &self,
1838        loc: Location,
1839        message: impl Into<String>,
1840    ) -> Result<R, TokenizerError> {
1841        Err(TokenizerError {
1842            message: message.into(),
1843            location: loc,
1844        })
1845    }
1846
1847    // Consume characters until newline
1848    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
1849        let mut comment = peeking_take_while(chars, |ch| match ch {
1850            '\n' => false,                                           // Always stop at \n
1851            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres
1852            _ => true, // Keep consuming for other characters
1853        });
1854
1855        if let Some(ch) = chars.next() {
1856            assert!(ch == '\n' || ch == '\r');
1857            comment.push(ch);
1858        }
1859
1860        comment
1861    }
1862
1863    /// Tokenize an identifier or keyword, after the first char is already consumed.
1864    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
1865        let mut s = first_chars.into();
1866        s.push_str(&peeking_take_while(chars, |ch| {
1867            self.dialect.is_identifier_part(ch)
1868        }));
1869        s
1870    }
1871
1872    /// Read a quoted identifier
1873    fn tokenize_quoted_identifier(
1874        &self,
1875        quote_start: char,
1876        chars: &mut State,
1877    ) -> Result<String, TokenizerError> {
1878        let error_loc = chars.location();
1879        chars.next(); // consume the opening quote
1880        let quote_end = Word::matching_end_quote(quote_start);
1881        let (s, last_char) = self.parse_quoted_ident(chars, quote_end);
1882
1883        if last_char == Some(quote_end) {
1884            Ok(s)
1885        } else {
1886            self.tokenizer_error(
1887                error_loc,
1888                format!("Expected close delimiter '{quote_end}' before EOF."),
1889            )
1890        }
1891    }
1892
1893    /// Read a single quoted string, starting with the opening quote.
1894    fn tokenize_escaped_single_quoted_string(
1895        &self,
1896        starting_loc: Location,
1897        chars: &mut State,
1898    ) -> Result<String, TokenizerError> {
1899        if let Some(s) = unescape_single_quoted_string(chars) {
1900            return Ok(s);
1901        }
1902
1903        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
1904    }
1905
1906    /// Reads a string literal quoted by a single or triple quote characters.
1907    /// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
1908    fn tokenize_single_or_triple_quoted_string<F>(
1909        &self,
1910        chars: &mut State,
1911        quote_style: char,
1912        backslash_escape: bool,
1913        single_quote_token: F,
1914        triple_quote_token: F,
1915    ) -> Result<Option<Token>, TokenizerError>
1916    where
1917        F: Fn(String) -> Token,
1918    {
1919        let error_loc = chars.location();
1920
1921        let mut num_opening_quotes = 0u8;
1922        for _ in 0..3 {
1923            if Some(&quote_style) == chars.peek() {
1924                chars.next(); // Consume quote.
1925                num_opening_quotes += 1;
1926            } else {
1927                break;
1928            }
1929        }
1930
1931        let (token_fn, num_quote_chars) = match num_opening_quotes {
1932            1 => (single_quote_token, NumStringQuoteChars::One),
1933            2 => {
1934                // If we matched double quotes, then this is an empty string.
1935                return Ok(Some(single_quote_token("".into())));
1936            }
1937            3 => {
1938                let Some(num_quote_chars) = NonZeroU8::new(3) else {
1939                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
1940                };
1941                (
1942                    triple_quote_token,
1943                    NumStringQuoteChars::Many(num_quote_chars),
1944                )
1945            }
1946            _ => {
1947                return self.tokenizer_error(error_loc, "invalid string literal opening");
1948            }
1949        };
1950
1951        let settings = TokenizeQuotedStringSettings {
1952            quote_style,
1953            num_quote_chars,
1954            num_opening_quotes_to_consume: 0,
1955            backslash_escape,
1956        };
1957
1958        self.tokenize_quoted_string(chars, settings)
1959            .map(token_fn)
1960            .map(Some)
1961    }
1962
1963    /// Reads a string literal quoted by a single quote character.
1964    fn tokenize_single_quoted_string(
1965        &self,
1966        chars: &mut State,
1967        quote_style: char,
1968        backslash_escape: bool,
1969    ) -> Result<String, TokenizerError> {
1970        self.tokenize_quoted_string(
1971            chars,
1972            TokenizeQuotedStringSettings {
1973                quote_style,
1974                num_quote_chars: NumStringQuoteChars::One,
1975                num_opening_quotes_to_consume: 1,
1976                backslash_escape,
1977            },
1978        )
1979    }
1980
    /// Read a quoted string according to `settings`: consumes any requested
    /// opening quotes, handles doubled-quote (`''`) and backslash escapes,
    /// and returns the body with the closing quote(s) consumed but excluded.
    /// Errors when EOF is reached before the literal terminates.
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        // Consume any opening quotes.
        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        // Tracks quote chars seen in a row, so that e.g. a `"""` literal only
        // terminates once three consecutive quotes appear.
        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            // Would consuming one more quote char complete the closing
            // delimiter? Always true for single-quoted literals.
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume

                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // For an initial string like `"""abc"""`, at this point we have
                        // `abc""` in the buffer and have now matched the final `"`.
                        // However, the string to return is simply `abc`, so we strip off
                        // the trailing quotes before returning.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        // Doubled quote (`''`): an escaped quote, not the end.
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        // Single closing quote: the literal is complete.
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    // consume backslash
                    chars.next();

                    // A backslash breaks any run of closing-quote candidates.
                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape
                            || (self.dialect.ignores_wildcard_escapes()
                                && (*next == '%' || *next == '_'))
                        {
                            // In no-escape mode, the given query has to be saved completely
                            // including backslashes. Similarly, with ignore_like_wildcard_escapes,
                            // the backslash is not stripped.
                            s.push(ch);
                            s.push(*next);
                            chars.next(); // consume next
                        } else {
                            // Translate the standard C-style escape sequences;
                            // any other escaped char stands for itself.
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next(); // consume next
                        }
                    }
                }
                ch => {
                    chars.next(); // consume ch

                    // Maintain the run-length of consecutive quote chars used
                    // by the multi-quote termination check above.
                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }
2087
2088    fn tokenize_multiline_comment(
2089        &self,
2090        chars: &mut State,
2091    ) -> Result<Option<Token>, TokenizerError> {
2092        let mut s = String::new();
2093        let mut nested = 1;
2094        let supports_nested_comments = self.dialect.supports_nested_comments();
2095
2096        loop {
2097            match chars.next() {
2098                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
2099                    chars.next(); // consume the '*'
2100                    s.push('/');
2101                    s.push('*');
2102                    nested += 1;
2103                }
2104                Some('*') if matches!(chars.peek(), Some('/')) => {
2105                    chars.next(); // consume the '/'
2106                    nested -= 1;
2107                    if nested == 0 {
2108                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
2109                    }
2110                    s.push('*');
2111                    s.push('/');
2112                }
2113                Some(ch) => {
2114                    s.push(ch);
2115                }
2116                None => {
2117                    break self.tokenizer_error(
2118                        chars.location(),
2119                        "Unexpected EOF while in a multi-line comment",
2120                    );
2121                }
2122            }
2123        }
2124    }
2125
2126    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
2127        let mut last_char = None;
2128        let mut s = String::new();
2129        while let Some(ch) = chars.next() {
2130            if ch == quote_end {
2131                if chars.peek() == Some(&quote_end) {
2132                    chars.next();
2133                    s.push(ch);
2134                    if !self.unescape {
2135                        // In no-escape mode, the given query has to be saved completely
2136                        s.push(ch);
2137                    }
2138                } else {
2139                    last_char = Some(quote_end);
2140                    break;
2141                }
2142            } else {
2143                s.push(ch);
2144            }
2145        }
2146        (s, last_char)
2147    }
2148
2149    #[allow(clippy::unnecessary_wraps)]
2150    fn consume_and_return(
2151        &self,
2152        chars: &mut State,
2153        t: Token,
2154    ) -> Result<Option<Token>, TokenizerError> {
2155        chars.next();
2156        Ok(Some(t))
2157    }
2158}
2159
2160/// Read from `chars` until `predicate` returns `false` or EOF is hit.
2161/// Return the characters read as String, and keep the first non-matching
2162/// char available as `chars.next()`.
2163fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
2164    let mut s = String::new();
2165    while let Some(&ch) = chars.peek() {
2166        if predicate(ch) {
2167            chars.next(); // consume
2168            s.push(ch);
2169        } else {
2170            break;
2171        }
2172    }
2173    s
2174}
2175
2176/// Same as peeking_take_while, but also passes the next character to the predicate.
2177fn peeking_next_take_while(
2178    chars: &mut State,
2179    mut predicate: impl FnMut(char, Option<char>) -> bool,
2180) -> String {
2181    let mut s = String::new();
2182    while let Some(&ch) = chars.peek() {
2183        let next_char = chars.peekable.clone().nth(1);
2184        if predicate(ch, next_char) {
2185            chars.next(); // consume
2186            s.push(ch);
2187        } else {
2188            break;
2189        }
2190    }
2191    s
2192}
2193
2194fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
2195    Unescape::new(chars).unescape()
2196}
2197
/// Helper that walks a tokenizer cursor and decodes the escape sequences of
/// a single-quoted string literal (see `unescape_single_quoted_string`).
struct Unescape<'a: 'b, 'b> {
    // Borrowed cursor; characters consumed here advance the caller's position.
    chars: &'b mut State<'a>,
}
2201
impl<'a: 'b, 'b> Unescape<'a, 'b> {
    fn new(chars: &'b mut State<'a>) -> Self {
        Self { chars }
    }
    /// Consume the opening quote and the string body, translating backslash
    /// escapes and doubled quotes, and stop at the closing quote.
    /// Returns `None` on EOF before the closing quote or on any escape that
    /// fails to decode (including one that decodes to NUL).
    fn unescape(mut self) -> Option<String> {
        let mut unescaped = String::new();

        // Skip the opening single quote.
        self.chars.next();

        while let Some(c) = self.chars.next() {
            if c == '\'' {
                // case: ''''
                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                    self.chars.next();
                    unescaped.push('\'');
                    continue;
                }
                // Closing quote: the literal is complete.
                return Some(unescaped);
            }

            if c != '\\' {
                unescaped.push(c);
                continue;
            }

            // Backslash escape: decode whatever follows it.
            let c = match self.chars.next()? {
                'b' => '\u{0008}',
                'f' => '\u{000C}',
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                'u' => self.unescape_unicode_16()?,
                'U' => self.unescape_unicode_32()?,
                'x' => self.unescape_hex()?,
                c if c.is_digit(8) => self.unescape_octal(c)?,
                // Any other escaped character stands for itself.
                c => c,
            };

            unescaped.push(Self::check_null(c)?);
        }

        // EOF before the closing quote.
        None
    }

    /// Reject NUL: a decoded string may not contain '\0'.
    #[inline]
    fn check_null(c: char) -> Option<char> {
        if c == '\0' {
            None
        } else {
            Some(c)
        }
    }

    /// Parse `s` as a byte value in RADIX and map it to a char.
    /// The value is masked to one byte, and only ASCII (<= 127) is accepted.
    #[inline]
    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
        // u32 is used here because Pg has an overflow operation rather than throwing an exception directly.
        match u32::from_str_radix(s, RADIX) {
            Err(_) => None,
            Ok(n) => {
                let n = n & 0xFF;
                if n <= 127 {
                    char::from_u32(n)
                } else {
                    None
                }
            }
        }
    }

    // Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
    fn unescape_hex(&mut self) -> Option<char> {
        let mut s = String::new();

        // At most two hex digits may follow `\x`.
        for _ in 0..2 {
            match self.next_hex_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        // A bare `\x` with no digits is a literal 'x'.
        if s.is_empty() {
            return Some('x');
        }

        Self::byte_to_char::<16>(&s)
    }

    /// Consume and return the next char only if it is a hex digit.
    #[inline]
    fn next_hex_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
            _ => None,
        }
    }

    // Octal byte value. \o, \oo, \ooo (o = 0–7)
    fn unescape_octal(&mut self, c: char) -> Option<char> {
        let mut s = String::new();

        // `c` is the first octal digit (already consumed by the caller);
        // up to two more may follow.
        s.push(c);
        for _ in 0..2 {
            match self.next_octal_digest() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        Self::byte_to_char::<8>(&s)
    }

    /// Consume and return the next char only if it is an octal digit.
    #[inline]
    fn next_octal_digest(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_digit(8) => self.chars.next(),
            _ => None,
        }
    }

    // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
    fn unescape_unicode_16(&mut self) -> Option<char> {
        self.unescape_unicode::<4>()
    }

    // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
    fn unescape_unicode_32(&mut self) -> Option<char> {
        self.unescape_unicode::<8>()
    }

    /// Read exactly NUM characters and decode them as a hex Unicode scalar
    /// value; fails on EOF, non-hex input, or an invalid code point.
    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
        let mut s = String::new();
        for _ in 0..NUM {
            s.push(self.chars.next()?);
        }
        match u32::from_str_radix(&s, 16) {
            Err(_) => None,
            Ok(n) => char::from_u32(n),
        }
    }
}
2341
2342fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
2343    let mut unescaped = String::new();
2344    chars.next(); // consume the opening quote
2345    while let Some(c) = chars.next() {
2346        match c {
2347            '\'' => {
2348                if chars.peek() == Some(&'\'') {
2349                    chars.next();
2350                    unescaped.push('\'');
2351                } else {
2352                    return Ok(unescaped);
2353                }
2354            }
2355            '\\' => match chars.peek() {
2356                Some('\\') => {
2357                    chars.next();
2358                    unescaped.push('\\');
2359                }
2360                Some('+') => {
2361                    chars.next();
2362                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
2363                }
2364                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
2365            },
2366            _ => {
2367                unescaped.push(c);
2368            }
2369        }
2370    }
2371    Err(TokenizerError {
2372        message: "Unterminated unicode encoded string literal".to_string(),
2373        location: chars.location(),
2374    })
2375}
2376
2377fn take_char_from_hex_digits(
2378    chars: &mut State<'_>,
2379    max_digits: usize,
2380) -> Result<char, TokenizerError> {
2381    let mut result = 0u32;
2382    for _ in 0..max_digits {
2383        let next_char = chars.next().ok_or_else(|| TokenizerError {
2384            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
2385                .to_string(),
2386            location: chars.location(),
2387        })?;
2388        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
2389            message: format!("Invalid hex digit in escaped unicode string: {}", next_char),
2390            location: chars.location(),
2391        })?;
2392        result = result * 16 + digit;
2393    }
2394    char::from_u32(result).ok_or_else(|| TokenizerError {
2395        message: format!("Invalid unicode character: {:x}", result),
2396        location: chars.location(),
2397    })
2398}
2399
2400#[cfg(test)]
2401mod tests {
2402    use super::*;
2403    use crate::dialect::{
2404        BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
2405    };
2406    use crate::test_utils::all_dialects_where;
2407    use core::fmt::Debug;
2408
2409    #[test]
2410    fn tokenizer_error_impl() {
2411        let err = TokenizerError {
2412            message: "test".into(),
2413            location: Location { line: 1, column: 1 },
2414        };
2415        #[cfg(feature = "std")]
2416        {
2417            use std::error::Error;
2418            assert!(err.source().is_none());
2419        }
2420        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
2421    }
2422
2423    #[test]
2424    fn tokenize_select_1() {
2425        let sql = String::from("SELECT 1");
2426        let dialect = GenericDialect {};
2427        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2428
2429        let expected = vec![
2430            Token::make_keyword("SELECT"),
2431            Token::Whitespace(Whitespace::Space),
2432            Token::Number(String::from("1"), false),
2433        ];
2434
2435        compare(expected, tokens);
2436    }
2437
2438    #[test]
2439    fn tokenize_select_float() {
2440        let sql = String::from("SELECT .1");
2441        let dialect = GenericDialect {};
2442        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2443
2444        let expected = vec![
2445            Token::make_keyword("SELECT"),
2446            Token::Whitespace(Whitespace::Space),
2447            Token::Number(String::from(".1"), false),
2448        ];
2449
2450        compare(expected, tokens);
2451    }
2452
2453    #[test]
2454    fn tokenize_clickhouse_double_equal() {
2455        let sql = String::from("SELECT foo=='1'");
2456        let dialect = ClickHouseDialect {};
2457        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2458        let tokens = tokenizer.tokenize().unwrap();
2459
2460        let expected = vec![
2461            Token::make_keyword("SELECT"),
2462            Token::Whitespace(Whitespace::Space),
2463            Token::Word(Word {
2464                value: "foo".to_string(),
2465                quote_style: None,
2466                keyword: Keyword::NoKeyword,
2467            }),
2468            Token::DoubleEq,
2469            Token::SingleQuotedString("1".to_string()),
2470        ];
2471
2472        compare(expected, tokens);
2473    }
2474
    #[test]
    // Without dialect support, `10_000` splits into the number `10` plus the
    // word `_000`; with support, well-formed underscored literals stay a
    // single Number token (edge cases noted inline below).
    fn tokenize_numeric_literal_underscore() {
        let dialect = GenericDialect {};
        let sql = String::from("SELECT 10_000");
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("10".to_string(), false),
            Token::make_word("_000", None),
        ];
        compare(expected, tokens);

        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
            "SELECT 10_000, _10_000, 10_00_, 10___0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_000".to_string(), false),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::make_word("_10_000", None), // leading underscore tokenizes as a word (parsed as column identifier)
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_00".to_string(), false),
                Token::make_word("_", None), // trailing underscores tokenizes as a word (syntax error in some dialects)
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10".to_string(), false),
                Token::make_word("___0", None), // multiple underscores tokenizes as a word (syntax error in some dialects)
            ],
        );
    }

    #[test]
    // Exponent parsing: `1e10`, `1e-10`, `1e+10` are single numbers, while a
    // malformed exponent (`1ea`) or a trailing word splits into number + word.
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }
2543
2544    #[test]
2545    fn tokenize_scalar_function() {
2546        let sql = String::from("SELECT sqrt(1)");
2547        let dialect = GenericDialect {};
2548        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2549
2550        let expected = vec![
2551            Token::make_keyword("SELECT"),
2552            Token::Whitespace(Whitespace::Space),
2553            Token::make_word("sqrt", None),
2554            Token::LParen,
2555            Token::Number(String::from("1"), false),
2556            Token::RParen,
2557        ];
2558
2559        compare(expected, tokens);
2560    }
2561
2562    #[test]
2563    fn tokenize_string_string_concat() {
2564        let sql = String::from("SELECT 'a' || 'b'");
2565        let dialect = GenericDialect {};
2566        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2567
2568        let expected = vec![
2569            Token::make_keyword("SELECT"),
2570            Token::Whitespace(Whitespace::Space),
2571            Token::SingleQuotedString(String::from("a")),
2572            Token::Whitespace(Whitespace::Space),
2573            Token::StringConcat,
2574            Token::Whitespace(Whitespace::Space),
2575            Token::SingleQuotedString(String::from("b")),
2576        ];
2577
2578        compare(expected, tokens);
2579    }
2580    #[test]
2581    fn tokenize_bitwise_op() {
2582        let sql = String::from("SELECT one | two ^ three");
2583        let dialect = GenericDialect {};
2584        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2585
2586        let expected = vec![
2587            Token::make_keyword("SELECT"),
2588            Token::Whitespace(Whitespace::Space),
2589            Token::make_word("one", None),
2590            Token::Whitespace(Whitespace::Space),
2591            Token::Pipe,
2592            Token::Whitespace(Whitespace::Space),
2593            Token::make_word("two", None),
2594            Token::Whitespace(Whitespace::Space),
2595            Token::Caret,
2596            Token::Whitespace(Whitespace::Space),
2597            Token::make_word("three", None),
2598        ];
2599        compare(expected, tokens);
2600    }
2601
    #[test]
    // XOR tokenizes as a keyword in all four true/false combinations.
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }
2641
    #[test]
    // Full SELECT with WHERE and LIMIT tokenizes keyword-by-keyword.
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    // EXPLAIN prefix tokenizes as a leading keyword before the SELECT.
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    // EXPLAIN ANALYZE tokenizes as two separate leading keywords.
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    // `!=` tokenizes as Neq and the quoted literal as SingleQuotedString.
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }
2759
2760    #[test]
2761    fn tokenize_invalid_string() {
2762        let sql = String::from("\n💝مصطفىh");
2763
2764        let dialect = GenericDialect {};
2765        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2766        // println!("tokens: {:#?}", tokens);
2767        let expected = vec![
2768            Token::Whitespace(Whitespace::Newline),
2769            Token::Char('💝'),
2770            Token::make_word("مصطفىh", None),
2771        ];
2772        compare(expected, tokens);
2773    }
2774
2775    #[test]
2776    fn tokenize_newline_in_string_literal() {
2777        let sql = String::from("'foo\r\nbar\nbaz'");
2778
2779        let dialect = GenericDialect {};
2780        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2781        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
2782        compare(expected, tokens);
2783    }
2784
2785    #[test]
2786    fn tokenize_unterminated_string_literal() {
2787        let sql = String::from("select 'foo");
2788
2789        let dialect = GenericDialect {};
2790        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2791        assert_eq!(
2792            tokenizer.tokenize(),
2793            Err(TokenizerError {
2794                message: "Unterminated string literal".to_string(),
2795                location: Location { line: 1, column: 8 },
2796            })
2797        );
2798    }
2799
2800    #[test]
2801    fn tokenize_unterminated_string_literal_utf8() {
2802        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");
2803
2804        let dialect = GenericDialect {};
2805        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2806        assert_eq!(
2807            tokenizer.tokenize(),
2808            Err(TokenizerError {
2809                message: "Unterminated string literal".to_string(),
2810                location: Location {
2811                    line: 1,
2812                    column: 35
2813                }
2814            })
2815        );
2816    }
2817
    #[test]
    // Unrecognized characters late in a multi-line query still tokenize as
    // Token::Char after the preceding keywords and whitespace.
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        // println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }
2841
    #[test]
    // Table-driven cases for tagged dollar quoting ($tag$...$tag$):
    // embedded `$` and `$$`, differently tagged inner spans, empty bodies,
    // and a dollar-quote adjacent to other tokens.
    fn tokenize_dollar_quoted_string_tagged() {
        let test_cases = vec![
            (
                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
                        tag: Some("tag".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$x$ab$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "x$ab".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("0$abc$$abc$1"),
                vec![
                    Token::Number("0".into(), false),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    }),
                    Token::Number("1".into(), false),
                ]
            ),
            (
                String::from("$function$abc$q$data$q$$function$"),
                vec![
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "abc$q$data$q$".into(),
                        tag: Some("function".into()),
                    }),
                ]
            ),
        ];

        let dialect = GenericDialect {};
        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }
2906
2907    #[test]
2908    fn tokenize_dollar_quoted_string_tagged_unterminated() {
2909        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
2910        let dialect = GenericDialect {};
2911        assert_eq!(
2912            Tokenizer::new(&dialect, &sql).tokenize(),
2913            Err(TokenizerError {
2914                message: "Unterminated dollar-quoted, expected $".into(),
2915                location: Location {
2916                    line: 1,
2917                    column: 91
2918                }
2919            })
2920        );
2921    }
2922
2923    #[test]
2924    fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
2925        let sql = String::from("SELECT $abc$abc$");
2926        let dialect = GenericDialect {};
2927        assert_eq!(
2928            Tokenizer::new(&dialect, &sql).tokenize(),
2929            Err(TokenizerError {
2930                message: "Unterminated dollar-quoted, expected $".into(),
2931                location: Location {
2932                    line: 1,
2933                    column: 17
2934                }
2935            })
2936        );
2937    }
2938
2939    #[test]
2940    fn tokenize_dollar_placeholder() {
2941        let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
2942        let dialect = SQLiteDialect {};
2943        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2944        assert_eq!(
2945            tokens,
2946            vec![
2947                Token::make_keyword("SELECT"),
2948                Token::Whitespace(Whitespace::Space),
2949                Token::Placeholder("$$".into()),
2950                Token::Comma,
2951                Token::Whitespace(Whitespace::Space),
2952                Token::Placeholder("$$ABC$$".into()),
2953                Token::Comma,
2954                Token::Whitespace(Whitespace::Space),
2955                Token::Placeholder("$ABC$".into()),
2956                Token::Comma,
2957                Token::Whitespace(Whitespace::Space),
2958                Token::Placeholder("$ABC".into()),
2959            ]
2960        );
2961    }
2962
2963    #[test]
2964    fn tokenize_nested_dollar_quoted_strings() {
2965        let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
2966        let dialect = GenericDialect {};
2967        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2968        let expected = vec![
2969            Token::make_keyword("SELECT"),
2970            Token::Whitespace(Whitespace::Space),
2971            Token::DollarQuotedString(DollarQuotedString {
2972                value: "dollar $nested$ string".into(),
2973                tag: Some("tag".into()),
2974            }),
2975        ];
2976        compare(expected, tokens);
2977    }
2978
2979    #[test]
2980    fn tokenize_dollar_quoted_string_untagged_empty() {
2981        let sql = String::from("SELECT $$$$");
2982        let dialect = GenericDialect {};
2983        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2984        let expected = vec![
2985            Token::make_keyword("SELECT"),
2986            Token::Whitespace(Whitespace::Space),
2987            Token::DollarQuotedString(DollarQuotedString {
2988                value: "".into(),
2989                tag: None,
2990            }),
2991        ];
2992        compare(expected, tokens);
2993    }
2994
2995    #[test]
2996    fn tokenize_dollar_quoted_string_untagged() {
2997        let sql =
2998            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
2999        let dialect = GenericDialect {};
3000        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3001        let expected = vec![
3002            Token::make_keyword("SELECT"),
3003            Token::Whitespace(Whitespace::Space),
3004            Token::DollarQuotedString(DollarQuotedString {
3005                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
3006                tag: None,
3007            }),
3008        ];
3009        compare(expected, tokens);
3010    }
3011
3012    #[test]
3013    fn tokenize_dollar_quoted_string_untagged_unterminated() {
3014        let sql = String::from(
3015            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
3016        );
3017        let dialect = GenericDialect {};
3018        assert_eq!(
3019            Tokenizer::new(&dialect, &sql).tokenize(),
3020            Err(TokenizerError {
3021                message: "Unterminated dollar-quoted string".into(),
3022                location: Location {
3023                    line: 1,
3024                    column: 86
3025                }
3026            })
3027        );
3028    }
3029
3030    #[test]
3031    fn tokenize_right_arrow() {
3032        let sql = String::from("FUNCTION(key=>value)");
3033        let dialect = GenericDialect {};
3034        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3035        let expected = vec![
3036            Token::make_word("FUNCTION", None),
3037            Token::LParen,
3038            Token::make_word("key", None),
3039            Token::RArrow,
3040            Token::make_word("value", None),
3041            Token::RParen,
3042        ];
3043        compare(expected, tokens);
3044    }
3045
3046    #[test]
3047    fn tokenize_is_null() {
3048        let sql = String::from("a IS NULL");
3049        let dialect = GenericDialect {};
3050        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3051
3052        let expected = vec![
3053            Token::make_word("a", None),
3054            Token::Whitespace(Whitespace::Space),
3055            Token::make_keyword("IS"),
3056            Token::Whitespace(Whitespace::Space),
3057            Token::make_keyword("NULL"),
3058        ];
3059
3060        compare(expected, tokens);
3061    }
3062
    /// `--` single-line comments in the generic dialect: `\n` and `\r\n`
    /// terminate the comment (and remain part of its text), while a lone
    /// `\r` does not terminate it.
    #[test]
    fn tokenize_comment() {
        let test_cases = vec![
            // `\n` ends the comment; the trailing `1` is a separate token.
            (
                String::from("0--this is a comment\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
            // A bare `\r` does NOT end the comment in the generic dialect, so
            // the `1` is swallowed into the comment text and no Number("1")
            // token follows.
            (
                String::from("0--this is a comment\r1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r1".to_string(),
                    }),
                ],
            ),
            // `\r\n` ends the comment just like `\n` does.
            (
                String::from("0--this is a comment\r\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
        ];

        let dialect = GenericDialect {};

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }
3107
3108    #[test]
3109    fn tokenize_comment_postgres() {
3110        let sql = String::from("1--\r0");
3111
3112        let dialect = PostgreSqlDialect {};
3113        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3114        let expected = vec![
3115            Token::Number("1".to_string(), false),
3116            Token::Whitespace(Whitespace::SingleLineComment {
3117                prefix: "--".to_string(),
3118                comment: "\r".to_string(),
3119            }),
3120            Token::Number("0".to_string(), false),
3121        ];
3122        compare(expected, tokens);
3123    }
3124
3125    #[test]
3126    fn tokenize_comment_at_eof() {
3127        let sql = String::from("--this is a comment");
3128
3129        let dialect = GenericDialect {};
3130        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3131        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
3132            prefix: "--".to_string(),
3133            comment: "this is a comment".to_string(),
3134        })];
3135        compare(expected, tokens);
3136    }
3137
3138    #[test]
3139    fn tokenize_multiline_comment() {
3140        let sql = String::from("0/*multi-line\n* /comment*/1");
3141
3142        let dialect = GenericDialect {};
3143        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3144        let expected = vec![
3145            Token::Number("0".to_string(), false),
3146            Token::Whitespace(Whitespace::MultiLineComment(
3147                "multi-line\n* /comment".to_string(),
3148            )),
3149            Token::Number("1".to_string(), false),
3150        ];
3151        compare(expected, tokens);
3152    }
3153
    /// Nested `/* ... */` comments in the generic dialect: each inner `/*`
    /// must be balanced by its own `*/`, and only the outermost `*/` ends
    /// the comment token. The comment token's text excludes the outer
    /// delimiters but keeps all inner ones.
    #[test]
    fn tokenize_nested_multiline_comment() {
        let dialect = GenericDialect {};
        let test_cases = vec![
            // The comment closes at the third `*/` (balancing the three
            // `/*`s); the remaining ` /comment*/` is lexed as ordinary
            // tokens: Div, the word `comment`, Mul, Div.
            (
                "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::MultiLineComment(
                        "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
                    )),
                    Token::Whitespace(Whitespace::Space),
                    Token::Div,
                    Token::Word(Word {
                        value: "comment".to_string(),
                        quote_style: None,
                        keyword: Keyword::COMMENT,
                    }),
                    Token::Mul,
                    Token::Div,
                    Token::Number("1".to_string(), false),
                ],
            ),
            // Here four `/*`s are balanced by four `*/`s, so the entire
            // input between `0` and `1` is one comment token.
            (
                "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::MultiLineComment(
                        "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
                    )),
                    Token::Number("1".to_string(), false),
                ],
            ),
            // Simple one-level nesting: the inner `/* b */` stays inside the
            // comment body.
            (
                "SELECT 1/* a /* b */ c */0",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Number("1".to_string(), false),
                    Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
                    Token::Number("0".to_string(), false),
                ],
            ),
        ];

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }
3204
3205    #[test]
3206    fn tokenize_nested_multiline_comment_empty() {
3207        let sql = "select 1/*/**/*/0";
3208
3209        let dialect = GenericDialect {};
3210        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3211        let expected = vec![
3212            Token::make_keyword("select"),
3213            Token::Whitespace(Whitespace::Space),
3214            Token::Number("1".to_string(), false),
3215            Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
3216            Token::Number("0".to_string(), false),
3217        ];
3218
3219        compare(expected, tokens);
3220    }
3221
3222    #[test]
3223    fn tokenize_nested_comments_if_not_supported() {
3224        let dialect = SQLiteDialect {};
3225        let sql = "SELECT 1/*/* nested comment */*/0";
3226        let tokens = Tokenizer::new(&dialect, sql).tokenize();
3227        let expected = vec![
3228            Token::make_keyword("SELECT"),
3229            Token::Whitespace(Whitespace::Space),
3230            Token::Number("1".to_string(), false),
3231            Token::Whitespace(Whitespace::MultiLineComment(
3232                "/* nested comment ".to_string(),
3233            )),
3234            Token::Mul,
3235            Token::Div,
3236            Token::Number("0".to_string(), false),
3237        ];
3238
3239        compare(expected, tokens.unwrap());
3240    }
3241
3242    #[test]
3243    fn tokenize_multiline_comment_with_even_asterisks() {
3244        let sql = String::from("\n/** Comment **/\n");
3245
3246        let dialect = GenericDialect {};
3247        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3248        let expected = vec![
3249            Token::Whitespace(Whitespace::Newline),
3250            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
3251            Token::Whitespace(Whitespace::Newline),
3252        ];
3253        compare(expected, tokens);
3254    }
3255
3256    #[test]
3257    fn tokenize_unicode_whitespace() {
3258        let sql = String::from(" \u{2003}\n");
3259
3260        let dialect = GenericDialect {};
3261        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3262        let expected = vec![
3263            Token::Whitespace(Whitespace::Space),
3264            Token::Whitespace(Whitespace::Space),
3265            Token::Whitespace(Whitespace::Newline),
3266        ];
3267        compare(expected, tokens);
3268    }
3269
3270    #[test]
3271    fn tokenize_mismatched_quotes() {
3272        let sql = String::from("\"foo");
3273
3274        let dialect = GenericDialect {};
3275        let mut tokenizer = Tokenizer::new(&dialect, &sql);
3276        assert_eq!(
3277            tokenizer.tokenize(),
3278            Err(TokenizerError {
3279                message: "Expected close delimiter '\"' before EOF.".to_string(),
3280                location: Location { line: 1, column: 1 },
3281            })
3282        );
3283    }
3284
3285    #[test]
3286    fn tokenize_newlines() {
3287        let sql = String::from("line1\nline2\rline3\r\nline4\r");
3288
3289        let dialect = GenericDialect {};
3290        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
3291        let expected = vec![
3292            Token::make_word("line1", None),
3293            Token::Whitespace(Whitespace::Newline),
3294            Token::make_word("line2", None),
3295            Token::Whitespace(Whitespace::Newline),
3296            Token::make_word("line3", None),
3297            Token::Whitespace(Whitespace::Newline),
3298            Token::make_word("line4", None),
3299            Token::Whitespace(Whitespace::Newline),
3300        ];
3301        compare(expected, tokens);
3302    }
3303
3304    #[test]
3305    fn tokenize_mssql_top() {
3306        let sql = "SELECT TOP 5 [bar] FROM foo";
3307        let dialect = MsSqlDialect {};
3308        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3309        let expected = vec![
3310            Token::make_keyword("SELECT"),
3311            Token::Whitespace(Whitespace::Space),
3312            Token::make_keyword("TOP"),
3313            Token::Whitespace(Whitespace::Space),
3314            Token::Number(String::from("5"), false),
3315            Token::Whitespace(Whitespace::Space),
3316            Token::make_word("bar", Some('[')),
3317            Token::Whitespace(Whitespace::Space),
3318            Token::make_keyword("FROM"),
3319            Token::Whitespace(Whitespace::Space),
3320            Token::make_word("foo", None),
3321        ];
3322        compare(expected, tokens);
3323    }
3324
    /// PostgreSQL regex-match operators: `~` (match), `~*` (case-insensitive
    /// match), `!~` (no match), and `!~*` (case-insensitive no match) each
    /// lex as a single dedicated token.
    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            // col ~ '^a'
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            // col ~* '^a'
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            // col !~ '^a'
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            // col !~* '^a'
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }
3362
    /// PostgreSQL LIKE-style operators: `~~` (LIKE), `~~*` (ILIKE), `!~~`
    /// (NOT LIKE), and `!~~*` (NOT ILIKE) each lex as a single dedicated
    /// token.
    #[test]
    fn tokenize_pg_like_match() {
        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            // col ~~ '_a%'
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            // col ~~* '_a%'
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            // col !~~ '_a%'
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            // col !~~* '_a%'
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
        ];
        compare(expected, tokens);
    }
3400
3401    #[test]
3402    fn tokenize_quoted_identifier() {
3403        let sql = r#" "a "" b" "a """ "c """"" "#;
3404        let dialect = GenericDialect {};
3405        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3406        let expected = vec![
3407            Token::Whitespace(Whitespace::Space),
3408            Token::make_word(r#"a " b"#, Some('"')),
3409            Token::Whitespace(Whitespace::Space),
3410            Token::make_word(r#"a ""#, Some('"')),
3411            Token::Whitespace(Whitespace::Space),
3412            Token::make_word(r#"c """#, Some('"')),
3413            Token::Whitespace(Whitespace::Space),
3414        ];
3415        compare(expected, tokens);
3416    }
3417
3418    #[test]
3419    fn tokenize_snowflake_div() {
3420        let sql = r#"field/1000"#;
3421        let dialect = SnowflakeDialect {};
3422        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3423        let expected = vec![
3424            Token::make_word(r#"field"#, None),
3425            Token::Div,
3426            Token::Number("1000".to_string(), false),
3427        ];
3428        compare(expected, tokens);
3429    }
3430
3431    #[test]
3432    fn tokenize_quoted_identifier_with_no_escape() {
3433        let sql = r#" "a "" b" "a """ "c """"" "#;
3434        let dialect = GenericDialect {};
3435        let tokens = Tokenizer::new(&dialect, sql)
3436            .with_unescape(false)
3437            .tokenize()
3438            .unwrap();
3439        let expected = vec![
3440            Token::Whitespace(Whitespace::Space),
3441            Token::make_word(r#"a "" b"#, Some('"')),
3442            Token::Whitespace(Whitespace::Space),
3443            Token::make_word(r#"a """#, Some('"')),
3444            Token::Whitespace(Whitespace::Space),
3445            Token::make_word(r#"c """""#, Some('"')),
3446            Token::Whitespace(Whitespace::Space),
3447        ];
3448        compare(expected, tokens);
3449    }
3450
    /// Spans attached by `tokenize_with_location` are 1-indexed `(line,
    /// column)` pairs; each end location points one past the token's last
    /// character, and a newline advances the line while resetting the column
    /// to 1.
    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            // "SELECT" occupies columns 1-6; its span ends at column 7.
            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (1, 7).into(),
                (1, 8).into(),
            ),
            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
            // The newline's span crosses onto line 2, column 1.
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Newline),
                (1, 10).into(),
                (2, 1).into(),
            ),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (2, 1).into(),
                (2, 2).into(),
            ),
            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
        ];
        compare(expected, tokens);
    }
3481
3482    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
3483        //println!("------------------------------");
3484        //println!("tokens   = {:?}", actual);
3485        //println!("expected = {:?}", expected);
3486        //println!("------------------------------");
3487        assert_eq!(expected, actual);
3488    }
3489
3490    fn check_unescape(s: &str, expected: Option<&str>) {
3491        let s = format!("'{}'", s);
3492        let mut state = State {
3493            peekable: s.chars().peekable(),
3494            line: 0,
3495            col: 0,
3496        };
3497
3498        assert_eq!(
3499            unescape_single_quoted_string(&mut state),
3500            expected.map(|s| s.to_string())
3501        );
3502    }
3503
    /// Escape-sequence handling of `unescape_single_quoted_string`
    /// (exercised via `check_unescape`, which adds the surrounding single
    /// quotes). `None` means the whole string is rejected.
    #[test]
    fn test_unescape() {
        // Single-character C-style escapes; `\/` and a bare `/` both yield `/`.
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

        // 16 and 32-bit hexadecimal Unicode character value
        // `\u` takes exactly 4 hex digits, `\U` exactly 8; extra digits are
        // literal text, and too few digits reject the string.
        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None);
        // NOTE(review): \u0000 is rejected — presumably NUL output is
        // disallowed; confirm in unescape_single_quoted_string.
        check_unescape(r"\u0000", None);
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
        // Values above U+10FFFF are not valid Unicode scalar values.
        check_unescape(r"\U00110000", None);
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

        // hexadecimal byte value
        // `\x` takes 1-2 hex digits; `\x` with no hex digit is literal `x`.
        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}"));
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x"));
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        // NOTE(review): \xCAD and \xA9 are rejected while \x4B is accepted —
        // presumably byte values >= 0x80 (non-ASCII) are disallowed; confirm.
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

        // octal byte value
        // Up to 3 octal digits; a following non-octal digit is literal text.
        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}"));
        check_unescape(r"\1232", Some("\u{0053}2"));
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        // NOTE(review): \450 (0o450 = 296) yields 0x28 while \603 is
        // rejected — looks like the value wraps mod 0x100 and results
        // >= 0x80 are then disallowed; confirm against the implementation.
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None);
        check_unescape(r"\080", None);

        // others
        // `\9` is not an octal digit sequence, so it passes through as `9`;
        // `''` collapses to a single quote.
        check_unescape(r"\9", Some("9"));
        check_unescape(r"''", Some("'"));
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        // One rejected escape anywhere rejects the whole string.
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }
3559
3560    #[test]
3561    fn tokenize_numeric_prefix_trait() {
3562        #[derive(Debug)]
3563        struct NumericPrefixDialect;
3564
3565        impl Dialect for NumericPrefixDialect {
3566            fn is_identifier_start(&self, ch: char) -> bool {
3567                ch.is_ascii_lowercase()
3568                    || ch.is_ascii_uppercase()
3569                    || ch.is_ascii_digit()
3570                    || ch == '$'
3571            }
3572
3573            fn is_identifier_part(&self, ch: char) -> bool {
3574                ch.is_ascii_lowercase()
3575                    || ch.is_ascii_uppercase()
3576                    || ch.is_ascii_digit()
3577                    || ch == '_'
3578                    || ch == '$'
3579                    || ch == '{'
3580                    || ch == '}'
3581            }
3582
3583            fn supports_numeric_prefix(&self) -> bool {
3584                true
3585            }
3586        }
3587
3588        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
3589        tokenize_numeric_prefix_inner(&HiveDialect {});
3590        tokenize_numeric_prefix_inner(&MySqlDialect {});
3591    }
3592
3593    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
3594        let sql = r#"SELECT * FROM 1"#;
3595        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
3596        let expected = vec![
3597            Token::make_keyword("SELECT"),
3598            Token::Whitespace(Whitespace::Space),
3599            Token::Mul,
3600            Token::Whitespace(Whitespace::Space),
3601            Token::make_keyword("FROM"),
3602            Token::Whitespace(Whitespace::Space),
3603            Token::Number(String::from("1"), false),
3604        ];
3605        compare(expected, tokens);
3606    }
3607
    /// Backslash escapes inside single-quoted strings, in four stages:
    /// Snowflake with unescaping off (raw text preserved) and on (escapes
    /// resolved), unterminated-string errors when a trailing `\'` consumes
    /// the would-be closing quote, a non-escape dialect (backslash is
    /// literal), and MySQL's LIKE special case keeping `\%`/`\_` verbatim.
    #[test]
    fn tokenize_quoted_string_escape() {
        let dialect = SnowflakeDialect {};
        // (input SQL, expected raw value, expected unescaped value)
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
            (r#"'\\'"#, r#"\\"#, r#"\"#),
            // MySQL-style control escapes: \0 NUL, \a BEL, \b BS, \f FF,
            // \n LF, \r CR, \t TAB, \Z SUB.
            (
                r#"'\0\a\b\f\n\r\t\Z'"#,
                r#"\0\a\b\f\n\r\t\Z"#,
                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
            ),
            (r#"'\"'"#, r#"\""#, "\""),
            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
            // Unknown escape: the backslash is dropped.
            (r#"'\q'"#, r#"\q"#, r#"q"#),
            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
        ] {
            // Unescaping off: the string body is preserved byte-for-byte.
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected.to_string())];
            compare(expected, tokens);

            // Unescaping on: escapes are resolved.
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

        // A trailing `\'` escapes the closing quote, leaving the string
        // unterminated.
        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }

        // Non-escape dialect: backslash is an ordinary character, so the
        // same inputs tokenize successfully.
        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
            let dialect = GenericDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }

        // MySQL special case for LIKE escapes: `\%` and `\_` are kept
        // verbatim so LIKE patterns round-trip.
        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
            let dialect = MySqlDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }
    }
3671
    #[test]
    fn tokenize_triple_quoted_string() {
        // Covers BigQuery triple-quoted strings: raw vs. unescaped content,
        // unterminated-literal errors, and how adjacent empty quoted strings
        // are kept distinct from a triple-quote opener.
        fn check<F>(
            q: char, // The quote character to test
            r: char, // An alternate quote character.
            quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            // (input SQL, expected raw content, expected unescaped content)
            for (sql, expected, expected_unescaped) in [
                // Empty string
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                // Should not count escaped quote as end of string.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                // Simple string
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                // Mix single-double quotes unescaped.
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                // Escaped quote.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                // backslash-escaped quote characters.
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                // backslash-escaped characters
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                // With unescaping disabled, escape sequences are kept verbatim.
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                // With unescaping enabled, backslash escapes are resolved.
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            // Inputs that open a triple-quoted string but never close it.
            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        // An empty `""` followed by `''` must tokenize as two empty strings,
        // not be mistaken for the start of a triple-quoted literal.
        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Same check with the quote kinds in the opposite order.
        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Non-triple quoted string dialect
        let dialect = SnowflakeDialect {};
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }
3791
3792    #[test]
3793    fn test_mysql_users_grantees() {
3794        let dialect = MySqlDialect {};
3795
3796        let sql = "CREATE USER `root`@`%`";
3797        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3798        let expected = vec![
3799            Token::make_keyword("CREATE"),
3800            Token::Whitespace(Whitespace::Space),
3801            Token::make_keyword("USER"),
3802            Token::Whitespace(Whitespace::Space),
3803            Token::make_word("root", Some('`')),
3804            Token::AtSign,
3805            Token::make_word("%", Some('`')),
3806        ];
3807        compare(expected, tokens);
3808    }
3809
3810    #[test]
3811    fn test_postgres_abs_without_space_and_string_literal() {
3812        let dialect = MySqlDialect {};
3813
3814        let sql = "SELECT @'1'";
3815        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3816        let expected = vec![
3817            Token::make_keyword("SELECT"),
3818            Token::Whitespace(Whitespace::Space),
3819            Token::AtSign,
3820            Token::SingleQuotedString("1".to_string()),
3821        ];
3822        compare(expected, tokens);
3823    }
3824
3825    #[test]
3826    fn test_postgres_abs_without_space_and_quoted_column() {
3827        let dialect = MySqlDialect {};
3828
3829        let sql = r#"SELECT @"bar" FROM foo"#;
3830        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3831        let expected = vec![
3832            Token::make_keyword("SELECT"),
3833            Token::Whitespace(Whitespace::Space),
3834            Token::AtSign,
3835            Token::DoubleQuotedString("bar".to_string()),
3836            Token::Whitespace(Whitespace::Space),
3837            Token::make_keyword("FROM"),
3838            Token::Whitespace(Whitespace::Space),
3839            Token::make_word("foo", None),
3840        ];
3841        compare(expected, tokens);
3842    }
3843
3844    #[test]
3845    fn test_national_strings_backslash_escape_not_supported() {
3846        all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
3847            .tokenizes_to(
3848                "select n'''''\\'",
3849                vec![
3850                    Token::make_keyword("select"),
3851                    Token::Whitespace(Whitespace::Space),
3852                    Token::NationalStringLiteral("''\\".to_string()),
3853                ],
3854            );
3855    }
3856
3857    #[test]
3858    fn test_national_strings_backslash_escape_supported() {
3859        all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
3860            .tokenizes_to(
3861                "select n'''''\\''",
3862                vec![
3863                    Token::make_keyword("select"),
3864                    Token::Whitespace(Whitespace::Space),
3865                    Token::NationalStringLiteral("'''".to_string()),
3866                ],
3867            );
3868    }
3869
3870    #[test]
3871    fn test_string_escape_constant_not_supported() {
3872        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
3873            "select e'...'",
3874            vec![
3875                Token::make_keyword("select"),
3876                Token::Whitespace(Whitespace::Space),
3877                Token::make_word("e", None),
3878                Token::SingleQuotedString("...".to_string()),
3879            ],
3880        );
3881
3882        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
3883            "select E'...'",
3884            vec![
3885                Token::make_keyword("select"),
3886                Token::Whitespace(Whitespace::Space),
3887                Token::make_word("E", None),
3888                Token::SingleQuotedString("...".to_string()),
3889            ],
3890        );
3891    }
3892
3893    #[test]
3894    fn test_string_escape_constant_supported() {
3895        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
3896            "select e'\\''",
3897            vec![
3898                Token::make_keyword("select"),
3899                Token::Whitespace(Whitespace::Space),
3900                Token::EscapedStringLiteral("'".to_string()),
3901            ],
3902        );
3903
3904        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
3905            "select E'\\''",
3906            vec![
3907                Token::make_keyword("select"),
3908                Token::Whitespace(Whitespace::Space),
3909                Token::EscapedStringLiteral("'".to_string()),
3910            ],
3911        );
3912    }
3913
3914    #[test]
3915    fn test_whitespace_required_after_single_line_comment() {
3916        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
3917            .tokenizes_to(
3918                "SELECT --'abc'",
3919                vec![
3920                    Token::make_keyword("SELECT"),
3921                    Token::Whitespace(Whitespace::Space),
3922                    Token::Minus,
3923                    Token::Minus,
3924                    Token::SingleQuotedString("abc".to_string()),
3925                ],
3926            );
3927
3928        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
3929            .tokenizes_to(
3930                "SELECT -- 'abc'",
3931                vec![
3932                    Token::make_keyword("SELECT"),
3933                    Token::Whitespace(Whitespace::Space),
3934                    Token::Whitespace(Whitespace::SingleLineComment {
3935                        prefix: "--".to_string(),
3936                        comment: " 'abc'".to_string(),
3937                    }),
3938                ],
3939            );
3940
3941        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
3942            .tokenizes_to(
3943                "SELECT --",
3944                vec![
3945                    Token::make_keyword("SELECT"),
3946                    Token::Whitespace(Whitespace::Space),
3947                    Token::Minus,
3948                    Token::Minus,
3949                ],
3950            );
3951    }
3952
3953    #[test]
3954    fn test_whitespace_not_required_after_single_line_comment() {
3955        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
3956            .tokenizes_to(
3957                "SELECT --'abc'",
3958                vec![
3959                    Token::make_keyword("SELECT"),
3960                    Token::Whitespace(Whitespace::Space),
3961                    Token::Whitespace(Whitespace::SingleLineComment {
3962                        prefix: "--".to_string(),
3963                        comment: "'abc'".to_string(),
3964                    }),
3965                ],
3966            );
3967
3968        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
3969            .tokenizes_to(
3970                "SELECT -- 'abc'",
3971                vec![
3972                    Token::make_keyword("SELECT"),
3973                    Token::Whitespace(Whitespace::Space),
3974                    Token::Whitespace(Whitespace::SingleLineComment {
3975                        prefix: "--".to_string(),
3976                        comment: " 'abc'".to_string(),
3977                    }),
3978                ],
3979            );
3980
3981        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
3982            .tokenizes_to(
3983                "SELECT --",
3984                vec![
3985                    Token::make_keyword("SELECT"),
3986                    Token::Whitespace(Whitespace::Space),
3987                    Token::Whitespace(Whitespace::SingleLineComment {
3988                        prefix: "--".to_string(),
3989                        comment: "".to_string(),
3990                    }),
3991                ],
3992            );
3993    }
3994
3995    #[test]
3996    fn test_tokenize_identifiers_numeric_prefix() {
3997        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
3998            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);
3999
4000        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
4001            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);
4002
4003        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
4004            "t.12e34",
4005            vec![
4006                Token::make_word("t", None),
4007                Token::Period,
4008                Token::make_word("12e34", None),
4009            ],
4010        );
4011
4012        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
4013            "t.1two3",
4014            vec![
4015                Token::make_word("t", None),
4016                Token::Period,
4017                Token::make_word("1two3", None),
4018            ],
4019        );
4020    }
4021}