// sqltk_parser/tokenizer.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! SQL Tokenizer
//!
//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
//!
//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
24#[cfg(not(feature = "std"))]
25use alloc::{
26    borrow::ToOwned,
27    format,
28    string::{String, ToString},
29    vec,
30    vec::Vec,
31};
32use core::iter::Peekable;
33use core::num::NonZeroU8;
34use core::str::Chars;
35use core::{cmp, fmt};
36
37#[cfg(feature = "serde")]
38use serde::{Deserialize, Serialize};
39
40#[cfg(feature = "visitor")]
41use sqltk_parser_derive::{Visit, VisitMut};
42
43use crate::ast::DollarQuotedString;
44use crate::dialect::Dialect;
45use crate::dialect::{
46    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
47    SnowflakeDialect,
48};
49use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
50
/// SQL Token enumeration
///
/// Note: variant declaration order is significant — the derived
/// `PartialOrd`/`Ord` implementations follow it.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the `bool` flags a trailing `L`
    /// ("long") suffix, which `Display` reproduces
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted strings: Example '''abc'''
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedString(String),
    /// Triple double quoted strings: Example """abc"""
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string' (note that some backends, such as
    /// PostgreSQL, may treat this syntax as a bit string literal instead, i.e: b'10010101')
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedByteStringLiteral(String),
    /// Single quoted literal with raw string prefix. Example `R'abc'`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    SingleQuotedRawStringLiteral(String),
    /// Double quoted literal with raw string prefix. Example `R"abc"`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedRawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
    EscapedStringLiteral(String),
    /// Unicode string literal: i.e: U&'first \000A second'
    UnicodeStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Spaceship operator <=>
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Integer division operator `//` in DuckDB
    DuckIntDiv,
    /// Modulo Operator `%`
    Mod,
    /// String concatenation `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Assignment `:=` (used for keyword argument in DuckDB macros and some functions, and for variable declarations in DuckDB and Snowflake)
    Assignment,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
    /// Sharp `#` used for PostgreSQL Bitwise XOR operator
    Sharp,
    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
    Tilde,
    /// `~*` , a case insensitive match regular expression operator in PostgreSQL
    TildeAsterisk,
    /// `!~` , a case sensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*` , a case insensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `~~`, a case sensitive match pattern operator in PostgreSQL
    DoubleTilde,
    /// `~~*`, a case insensitive match pattern operator in PostgreSQL
    DoubleTildeAsterisk,
    /// `!~~`, a case sensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTilde,
    /// `!~~*`, a case insensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTildeAsterisk,
    /// `<<`, a bitwise shift left operator in PostgreSQL
    ShiftLeft,
    /// `>>`, a bitwise shift right operator in PostgreSQL
    ShiftRight,
    /// `&&`, an overlap operator in PostgreSQL
    Overlap,
    /// Exclamation Mark `!` used for PostgreSQL factorial operator
    ExclamationMark,
    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// AtSign `@` used for PostgreSQL abs operator
    AtSign,
    /// `^@`, a "starts with" string operator in PostgreSQL
    CaretAt,
    /// `|/`, a square root math operator in PostgreSQL
    PGSquareRoot,
    /// `||/`, a cube root math operator in PostgreSQL
    PGCubeRoot,
    /// `?` or `$` , a prepared statement arg placeholder
    Placeholder(String),
    /// `->`, used as a operator to extract json field in PostgreSQL
    Arrow,
    /// `->>`, used as a operator to extract json field as text in PostgreSQL
    LongArrow,
    /// `#>`, extracts JSON sub-object at the specified path
    HashArrow,
    /// `#>>`, extracts JSON sub-object at the specified path as text
    HashLongArrow,
    /// jsonb @> jsonb -> boolean: Test whether left json contains the right json
    AtArrow,
    /// jsonb <@ jsonb -> boolean: Test whether right json contains the left json
    ArrowAt,
    /// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified
    /// path, where path elements can be either field keys or array indexes.
    HashMinus,
    /// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified
    /// JSON value?
    AtQuestion,
    /// jsonb @@ jsonpath → boolean: Returns the result of a JSON path predicate check
    /// for the specified JSON value. Only the first item of the result is taken into
    /// account. If the result is not Boolean, then NULL is returned.
    AtAt,
    /// jsonb ? text -> boolean: Checks whether the string exists as a top-level key within the
    /// jsonb object
    Question,
    /// jsonb ?& text[] -> boolean: Check whether all members of the text array exist as top-level
    /// keys within the jsonb object
    QuestionAnd,
    /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level
    /// keys within the jsonb object
    QuestionPipe,
    /// Custom binary operator
    /// This is used to represent any custom binary operator that is not part of the SQL standard.
    /// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
    CustomBinaryOperator(String),
}
247
248impl fmt::Display for Token {
249    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
250        match self {
251            Token::EOF => f.write_str("EOF"),
252            Token::Word(ref w) => write!(f, "{w}"),
253            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
254            Token::Char(ref c) => write!(f, "{c}"),
255            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
256            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
257            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
258            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
259            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
260            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
261            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
262            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
263            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
264            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
265            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
266            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
267            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
268            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
269            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
270            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
271            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
272            Token::Comma => f.write_str(","),
273            Token::Whitespace(ws) => write!(f, "{ws}"),
274            Token::DoubleEq => f.write_str("=="),
275            Token::Spaceship => f.write_str("<=>"),
276            Token::Eq => f.write_str("="),
277            Token::Neq => f.write_str("<>"),
278            Token::Lt => f.write_str("<"),
279            Token::Gt => f.write_str(">"),
280            Token::LtEq => f.write_str("<="),
281            Token::GtEq => f.write_str(">="),
282            Token::Plus => f.write_str("+"),
283            Token::Minus => f.write_str("-"),
284            Token::Mul => f.write_str("*"),
285            Token::Div => f.write_str("/"),
286            Token::DuckIntDiv => f.write_str("//"),
287            Token::StringConcat => f.write_str("||"),
288            Token::Mod => f.write_str("%"),
289            Token::LParen => f.write_str("("),
290            Token::RParen => f.write_str(")"),
291            Token::Period => f.write_str("."),
292            Token::Colon => f.write_str(":"),
293            Token::DoubleColon => f.write_str("::"),
294            Token::Assignment => f.write_str(":="),
295            Token::SemiColon => f.write_str(";"),
296            Token::Backslash => f.write_str("\\"),
297            Token::LBracket => f.write_str("["),
298            Token::RBracket => f.write_str("]"),
299            Token::Ampersand => f.write_str("&"),
300            Token::Caret => f.write_str("^"),
301            Token::Pipe => f.write_str("|"),
302            Token::LBrace => f.write_str("{"),
303            Token::RBrace => f.write_str("}"),
304            Token::RArrow => f.write_str("=>"),
305            Token::Sharp => f.write_str("#"),
306            Token::ExclamationMark => f.write_str("!"),
307            Token::DoubleExclamationMark => f.write_str("!!"),
308            Token::Tilde => f.write_str("~"),
309            Token::TildeAsterisk => f.write_str("~*"),
310            Token::ExclamationMarkTilde => f.write_str("!~"),
311            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
312            Token::DoubleTilde => f.write_str("~~"),
313            Token::DoubleTildeAsterisk => f.write_str("~~*"),
314            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
315            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
316            Token::AtSign => f.write_str("@"),
317            Token::CaretAt => f.write_str("^@"),
318            Token::ShiftLeft => f.write_str("<<"),
319            Token::ShiftRight => f.write_str(">>"),
320            Token::Overlap => f.write_str("&&"),
321            Token::PGSquareRoot => f.write_str("|/"),
322            Token::PGCubeRoot => f.write_str("||/"),
323            Token::Placeholder(ref s) => write!(f, "{s}"),
324            Token::Arrow => write!(f, "->"),
325            Token::LongArrow => write!(f, "->>"),
326            Token::HashArrow => write!(f, "#>"),
327            Token::HashLongArrow => write!(f, "#>>"),
328            Token::AtArrow => write!(f, "@>"),
329            Token::ArrowAt => write!(f, "<@"),
330            Token::HashMinus => write!(f, "#-"),
331            Token::AtQuestion => write!(f, "@?"),
332            Token::AtAt => write!(f, "@@"),
333            Token::Question => write!(f, "?"),
334            Token::QuestionAnd => write!(f, "?&"),
335            Token::QuestionPipe => write!(f, "?|"),
336            Token::CustomBinaryOperator(s) => f.write_str(s),
337        }
338    }
339}
340
341impl Token {
342    pub fn make_keyword(keyword: &str) -> Self {
343        Token::make_word(keyword, None)
344    }
345
346    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
347        let word_uppercase = word.to_uppercase();
348        Token::Word(Word {
349            value: word.to_string(),
350            quote_style,
351            keyword: if quote_style.is_none() {
352                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
353                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
354            } else {
355                Keyword::NoKeyword
356            },
357        })
358    }
359}
360
/// A keyword (like SELECT) or an optionally quoted SQL identifier
///
/// Note: field order is significant — the derived `PartialOrd`/`Ord`
/// compare fields in declaration order.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with the
    /// escape sequences (if any) processed (TODO: escapes are not handled)
    pub value: String,
    /// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this will have one of the values from dialect::keywords, otherwise
    /// `Keyword::NoKeyword`
    pub keyword: Keyword,
}
377
378impl fmt::Display for Word {
379    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
380        match self.quote_style {
381            Some(s) if s == '"' || s == '[' || s == '`' => {
382                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
383            }
384            None => f.write_str(&self.value),
385            _ => panic!("Unexpected quote_style!"),
386        }
387    }
388}
389
390impl Word {
391    fn matching_end_quote(ch: char) -> char {
392        match ch {
393            '"' => '"', // ANSI and most dialects
394            '[' => ']', // MS SQL
395            '`' => '`', // MySQL
396            _ => panic!("unexpected quoting style!"),
397        }
398    }
399}
400
/// Non-semantic content interleaved with tokens: spaces, newlines, tabs
/// and SQL comments.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    /// A single space character
    Space,
    /// A newline character
    Newline,
    /// A tab character
    Tab,
    /// A comment running to the end of the line; `prefix` is the
    /// delimiter that introduced it
    SingleLineComment { comment: String, prefix: String },
    /// The body of a `/* ... */` comment, without the delimiters
    MultiLineComment(String),
}
411
412impl fmt::Display for Whitespace {
413    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
414        match self {
415            Whitespace::Space => f.write_str(" "),
416            Whitespace::Newline => f.write_str("\n"),
417            Whitespace::Tab => f.write_str("\t"),
418            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
419            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
420        }
421    }
422}
423
/// Location in input string
///
/// Both coordinates are 1-based; `(0, 0)` is reserved for "unknown"
/// (empty) locations.
///
/// # Create an "empty" (unknown) `Location`
/// ```
/// # use sqltk_parser::tokenizer::Location;
/// let location = Location::empty();
/// ```
///
/// # Create a `Location` from a line and column
/// ```
/// # use sqltk_parser::tokenizer::Location;
/// let location = Location::new(1, 1);
/// ```
///
/// # Create a `Location` from a pair
/// ```
/// # use sqltk_parser::tokenizer::Location;
/// let location = Location::from((1, 1));
/// ```
#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1.
    ///
    /// Note: Line 0 is used for empty spans
    pub line: u64,
    /// Line column, starting from 1.
    ///
    /// Note: Column 0 is used for empty spans
    pub column: u64,
}
456
457impl fmt::Display for Location {
458    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
459        if self.line == 0 {
460            return Ok(());
461        }
462        write!(f, " at Line: {}, Column: {}", self.line, self.column)
463    }
464}
465
466impl fmt::Debug for Location {
467    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
468        write!(f, "Location({},{})", self.line, self.column)
469    }
470}
471
472impl Location {
473    /// Return an "empty" / unknown location
474    pub fn empty() -> Self {
475        Self { line: 0, column: 0 }
476    }
477
478    /// Create a new `Location` for a given line and column
479    pub fn new(line: u64, column: u64) -> Self {
480        Self { line, column }
481    }
482
483    /// Create a new location for a given line and column
484    ///
485    /// Alias for [`Self::new`]
486    // TODO: remove / deprecate in favor of` `new` for consistency?
487    pub fn of(line: u64, column: u64) -> Self {
488        Self::new(line, column)
489    }
490
491    /// Combine self and `end` into a new `Span`
492    pub fn span_to(self, end: Self) -> Span {
493        Span { start: self, end }
494    }
495}
496
497impl From<(u64, u64)> for Location {
498    fn from((line, column): (u64, u64)) -> Self {
499        Self { line, column }
500    }
501}
502
/// A span represents a linear portion of the input string (start, end)
///
/// See [Spanned](crate::ast::Spanned) for more information.
#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    /// Start location of the span
    pub start: Location,
    /// End location of the span
    pub end: Location,
}
513
514impl fmt::Debug for Span {
515    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
516        write!(f, "Span({:?}..{:?})", self.start, self.end)
517    }
518}
519
impl Span {
    // An empty span (0, 0) -> (0, 0)
    // We need a const instance for pattern matching
    const EMPTY: Span = Self::empty();

    /// Create a new span from a start and end [`Location`]
    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    /// Returns an empty span `(0, 0) -> (0, 0)`
    ///
    /// Empty spans represent no knowledge of source location
    /// See [Spanned](crate::ast::Spanned) for more information.
    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }

    /// Returns the smallest Span that contains both `self` and `other`
    /// If either span is [Span::empty], the other span is returned
    ///
    /// # Examples
    /// ```
    /// # use sqltk_parser::tokenizer::{Span, Location};
    /// // line 1, column1 -> line 2, column 5
    /// let span1 = Span::new(Location::new(1, 1), Location::new(2, 5));
    /// // line 2, column 3 -> line 3, column 7
    /// let span2 = Span::new(Location::new(2, 3), Location::new(3, 7));
    /// // Union of the two is the min/max of the two spans
    /// // line 1, column 1 -> line 3, column 7
    /// let union = span1.union(&span2);
    /// assert_eq!(union, Span::new(Location::new(1, 1), Location::new(3, 7)));
    /// ```
    pub fn union(&self, other: &Span) -> Span {
        // If either span is empty, return the other
        // this prevents propagating (0, 0) through the tree
        match (self, other) {
            (&Span::EMPTY, _) => *other,
            (_, &Span::EMPTY) => *self,
            _ => Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            },
        }
    }

    /// Same as [Span::union] for `Option<Span>`
    ///
    /// If `other` is `None`, `self` is returned
    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        match other {
            Some(other) => self.union(other),
            None => *self,
        }
    }

    /// Return the [Span::union] of all spans in the iterator
    ///
    /// If the iterator is empty, an empty span is returned
    ///
    /// # Example
    /// ```
    /// # use sqltk_parser::tokenizer::{Span, Location};
    /// let spans = vec![
    ///     Span::new(Location::new(1, 1), Location::new(2, 5)),
    ///     Span::new(Location::new(2, 3), Location::new(3, 7)),
    ///     Span::new(Location::new(3, 1), Location::new(4, 2)),
    /// ];
    /// // line 1, column 1 -> line 4, column 2
    /// assert_eq!(
    ///   Span::union_iter(spans),
    ///   Span::new(Location::new(1, 1), Location::new(4, 2))
    /// );
    /// ```
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        iter.into_iter()
            .reduce(|acc, item| acc.union(&item))
            .unwrap_or(Span::empty())
    }
}
602
/// Backwards compatibility alias for [`TokenWithSpan`]
#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;
606
/// A [Token] with [Span] attached to it
///
/// This is used to track the location of a token in the input string
///
/// Note: equality compares both the token and its span (see the example
/// below), while comparison against a bare [Token] ignores the span.
///
/// # Examples
/// ```
/// # use sqltk_parser::tokenizer::{Location, Span, Token, TokenWithSpan};
/// // commas @ line 1, column 10
/// let tok1 = TokenWithSpan::new(
///   Token::Comma,
///   Span::new(Location::new(1, 10), Location::new(1, 11)),
/// );
/// assert_eq!(tok1, Token::Comma); // can compare the token
///
/// // commas @ line 2, column 20
/// let tok2 = TokenWithSpan::new(
///   Token::Comma,
///   Span::new(Location::new(2, 20), Location::new(2, 21)),
/// );
/// // same token but different locations are not equal
/// assert_ne!(tok1, tok2);
/// ```
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    /// The token itself
    pub token: Token,
    /// Where the token occurred in the input
    pub span: Span,
}
636
637impl TokenWithSpan {
638    /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`]
639    pub fn new(token: Token, span: Span) -> Self {
640        Self { token, span }
641    }
642
643    /// Wrap a token with an empty span
644    pub fn wrap(token: Token) -> Self {
645        Self::new(token, Span::empty())
646    }
647
648    /// Wrap a token with a location from `start` to `end`
649    pub fn at(token: Token, start: Location, end: Location) -> Self {
650        Self::new(token, Span::new(start, end))
651    }
652
653    /// Return an EOF token with no location
654    pub fn new_eof() -> Self {
655        Self::wrap(Token::EOF)
656    }
657}
658
659impl PartialEq<Token> for TokenWithSpan {
660    fn eq(&self, other: &Token) -> bool {
661        &self.token == other
662    }
663}
664
665impl PartialEq<TokenWithSpan> for Token {
666    fn eq(&self, other: &TokenWithSpan) -> bool {
667        self == &other.token
668    }
669}
670
671impl fmt::Display for TokenWithSpan {
672    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
673        self.token.fmt(f)
674    }
675}
676
/// Tokenizer error
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    /// Human-readable description of the problem
    pub message: String,
    /// Position in the input where tokenizing failed
    pub location: Location,
}
683
684impl fmt::Display for TokenizerError {
685    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
686        write!(f, "{}{}", self.message, self.location,)
687    }
688}
689
// `std::error::Error` is only available with the `std` feature; the
// default trait methods suffice given the `Display`/`Debug` impls above.
#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}
692
/// Mutable cursor over the query's characters, tracking line/column as
/// input is consumed.
struct State<'a> {
    // Peekable character stream over the original query text
    peekable: Peekable<Chars<'a>>,
    // Current line number, starting from 1
    pub line: u64,
    // Current column number, starting from 1
    pub col: u64,
}
698
699impl State<'_> {
700    /// return the next character and advance the stream
701    pub fn next(&mut self) -> Option<char> {
702        match self.peekable.next() {
703            None => None,
704            Some(s) => {
705                if s == '\n' {
706                    self.line += 1;
707                    self.col = 1;
708                } else {
709                    self.col += 1;
710                }
711                Some(s)
712            }
713        }
714    }
715
716    /// return the next character but do not advance the stream
717    pub fn peek(&mut self) -> Option<&char> {
718        self.peekable.peek()
719    }
720
721    pub fn location(&self) -> Location {
722        Location {
723            line: self.line,
724            column: self.col,
725        }
726    }
727}
728
/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// e.g. `"abc"`, `'abc'`, `r'abc'`
    One,
    /// e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''` — the count is the
    /// number of quote characters on each side
    Many(NonZeroU8),
}
737
/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
    /// The character used to quote the string.
    quote_style: char,
    /// Represents how many quotes characters enclose the string literal.
    num_quote_chars: NumStringQuoteChars,
    /// The number of opening quotes left to consume, before parsing
    /// the remaining string literal.
    /// For example: given initial string `"""abc"""`. If the caller has
    /// already parsed the first quote for some reason, then this value
    /// is set to 1, flagging to look to consume only 2 leading quotes.
    num_opening_quotes_to_consume: u8,
    /// True if the string uses backslash escaping of special characters
    /// e.g. `'abc\ndef\'ghi'`
    backslash_escape: bool,
}
754
/// SQL Tokenizer
pub struct Tokenizer<'a> {
    /// Dialect controlling dialect-specific tokenization rules
    dialect: &'a dyn Dialect,
    /// The SQL text being tokenized
    query: &'a str,
    /// If true (the default), the tokenizer will un-escape literal
    /// SQL strings See [`Tokenizer::with_unescape`] for more details.
    unescape: bool,
}
763
764impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement
    ///
    /// ```
    /// # use sqltk_parser::tokenizer::{Token, Whitespace, Tokenizer};
    /// # use sqltk_parser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let query = r#"SELECT 'foo'"#;
    ///
    /// // Parsing the query
    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
    ///
    /// assert_eq!(tokens, vec![
    ///   Token::make_word("SELECT", None),
    ///   Token::Whitespace(Whitespace::Space),
    ///   Token::SingleQuotedString("foo".to_string()),
    /// ]);
    /// ```
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }
788
789    /// Set unescape mode
790    ///
791    /// When true (default) the tokenizer unescapes literal values
792    /// (for example, `""` in SQL is unescaped to the literal `"`).
793    ///
794    /// When false, the tokenizer provides the raw strings as provided
795    /// in the query.  This can be helpful for programs that wish to
796    /// recover the *exact* original query text without normalizing
797    /// the escaping
798    ///
799    /// # Example
800    ///
801    /// ```
802    /// # use sqltk_parser::tokenizer::{Token, Tokenizer};
803    /// # use sqltk_parser::dialect::GenericDialect;
804    /// # let dialect = GenericDialect{};
805    /// let query = r#""Foo "" Bar""#;
806    /// let unescaped = Token::make_word(r#"Foo " Bar"#, Some('"'));
807    /// let original  = Token::make_word(r#"Foo "" Bar"#, Some('"'));
808    ///
809    /// // Parsing with unescaping (default)
810    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
811    /// assert_eq!(tokens, vec![unescaped]);
812    ///
813    /// // Parsing with unescape = false
814    /// let tokens = Tokenizer::new(&dialect, &query)
815    ///    .with_unescape(false)
816    ///    .tokenize().unwrap();
817    /// assert_eq!(tokens, vec![original]);
818    /// ```
819    pub fn with_unescape(mut self, unescape: bool) -> Self {
820        self.unescape = unescape;
821        self
822    }
823
824    /// Tokenize the statement and produce a vector of tokens
825    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
826        let twl = self.tokenize_with_location()?;
827        Ok(twl.into_iter().map(|t| t.token).collect())
828    }
829
830    /// Tokenize the statement and produce a vector of tokens with location information
831    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
832        let mut tokens: Vec<TokenWithSpan> = vec![];
833        self.tokenize_with_location_into_buf(&mut tokens)
834            .map(|_| tokens)
835    }
836
837    /// Tokenize the statement and append tokens with location information into the provided buffer.
838    /// If an error is thrown, the buffer will contain all tokens that were successfully parsed before the error.
839    pub fn tokenize_with_location_into_buf(
840        &mut self,
841        buf: &mut Vec<TokenWithSpan>,
842    ) -> Result<(), TokenizerError> {
843        let mut state = State {
844            peekable: self.query.chars().peekable(),
845            line: 1,
846            col: 1,
847        };
848
849        let mut location = state.location();
850        while let Some(token) = self.next_token(&mut state)? {
851            let span = location.span_to(state.location());
852
853            buf.push(TokenWithSpan { token, span });
854
855            location = state.location();
856        }
857        Ok(())
858    }
859
    // Tokenize the identifier or keywords in `ch`
    //
    // `ch` holds the character(s) already peeked from the input that begin the
    // token; the first of them is still pending in `chars` and is consumed here.
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // TODO: implement parsing of exponent here
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            // The collected "word" consists only of digits and periods, so it
            // is really a number: re-lex it through a throwaway State (line/col
            // are irrelevant here) and append any further digits/periods still
            // remaining in the real input.
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }
885
    /// Get the next token or return None
    ///
    /// Dispatches on the next peeked character: whitespace, dialect-specific
    /// string-literal prefixes (`B`, `R`, `N`, `E`, `U&`, `X`), quoted strings
    /// and delimited identifiers, numbers, operators and punctuation.
    /// Returns `Ok(None)` at end of input.
    fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // BigQuery and MySQL use b or B for byte string literal, Postgres for bit strings
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
                {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '\'',
                                        false,
                                        Token::SingleQuotedByteStringLiteral,
                                        Token::TripleSingleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '"',
                                        false,
                                        Token::DoubleQuotedByteStringLiteral,
                                        Token::TripleDoubleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "b" or "B"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // BigQuery uses r or R for raw string literal
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                false,
                                Token::SingleQuotedRawStringLiteral,
                                Token::TripleSingleQuotedRawStringLiteral,
                            ),
                        Some('\"') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                false,
                                Token::DoubleQuotedRawStringLiteral,
                                Token::TripleDoubleQuotedRawStringLiteral,
                            ),
                        _ => {
                            // regular identifier starting with an "r" or "R"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Redshift uses lower case n for national string literal
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
                x @ 'e' | x @ 'E' => {
                    let starting_loc = chars.location();
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "E" or "e"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
                    chars.next(); // consume, to check the next char
                    if chars.peek() == Some(&'&') {
                        // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
                        let mut chars_clone = chars.peekable.clone();
                        chars_clone.next(); // consume the '&' in the clone
                        if chars_clone.peek() == Some(&'\'') {
                            chars.next(); // consume the '&' in the original iterator
                            let s = unescape_unicode_single_quoted_string(chars)?;
                            return Ok(Some(Token::UnicodeStringLiteral(s)));
                        }
                    }
                    // regular identifier starting with an "U" or "u"
                    let s = self.tokenize_word(x, chars);
                    Ok(Some(Token::make_word(&s, None)))
                }
                // The spec only allows an uppercase 'X' to introduce a hex
                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a <binary string literal>
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "X"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // single quoted string
                '\'' => {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::SingleQuotedString,
                                Token::TripleSingleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '\'',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // double quoted string
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::DoubleQuotedString,
                                Token::TripleDoubleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '"',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start
                    if self.dialect.is_delimited_identifier_start(ch)
                        && self
                            .dialect
                            .is_proper_identifier_inside_quotes(chars.peekable.clone()) =>
                {
                    let error_loc = chars.location();
                    chars.next(); // consume the opening quote
                    let quote_end = Word::matching_end_quote(quote_start);
                    let (s, last_char) = self.parse_quoted_ident(chars, quote_end);

                    if last_char == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        )
                    }
                }
                // numbers and period
                '0'..='9' | '.' => {
                    let mut s = peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // match binary literal that starts with 0x
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_take_while(chars, |ch| ch.is_ascii_hexdigit());
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }
                    s += &peeking_take_while(chars, |ch| ch.is_ascii_digit());

                    // No number -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    let mut exponent_part = String::new();
                    // Parse exponent as number
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        // Work on a clone first: "1e" followed by a non-digit is
                        // NOT an exponent, and in that case the 'e' (and any
                        // sign) must be left unconsumed in the real iterator.
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        // Optional sign
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // Definitely an exponent, get original iterator up to speed and use it
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            // Not an exponent, discard the work done
                            _ => (),
                        }
                    }

                    // mysql dialect supports identifiers that start with a numeric prefix,
                    // as long as they aren't an exponent number.
                    if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
                        let word =
                            peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                        if !word.is_empty() {
                            s += word.as_str();
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }

                    // A trailing 'L' marks a "long" number literal; the flag is
                    // carried in the second field of Token::Number.
                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'
                    match chars.peek() {
                        Some('-') => {
                            chars.next(); // consume the second '-', starting a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "--".to_owned(),
                                comment,
                            })))
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
                                _ => self.start_binop(chars, "->", Token::Arrow),
                            }
                        }
                        // a regular '-' operator
                        _ => self.start_binop(chars, "-", Token::Minus),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a snowflake single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => {
                    chars.next(); // advance past '%'
                    match chars.peek() {
                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
                        // some dialects allow '%' to start an identifier
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "%", Token::Mod),
                    }
                }
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => {
                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
                                }
                                _ => self.start_binop(chars, "||", Token::StringConcat),
                            }
                        }
                        // Bitwise '|' operator
                        _ => self.start_binop(chars, "|", Token::Pipe),
                    }
                }
                '=' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                Some('~') => {
                                    chars.next();
                                    match chars.peek() {
                                        Some('*') => self.consume_and_return(
                                            chars,
                                            Token::ExclamationMarkDoubleTildeAsterisk,
                                        ),
                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
                                    }
                                }
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
                                _ => self.start_binop(chars, "<=", Token::LtEq),
                            }
                        }
                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
                        _ => self.start_binop(chars, "<", Token::Lt),
                    }
                }
                '>' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
                        _ => self.start_binop(chars, ">", Token::Gt),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::Assignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => {
                    chars.next(); // consume the '&'
                    match chars.peek() {
                        Some('&') => {
                            chars.next(); // consume the second '&'
                            self.start_binop(chars, "&&", Token::Overlap)
                        }
                        // Bitwise '&' operator
                        _ => self.start_binop(chars, "&", Token::Ampersand),
                    }
                }
                '^' => {
                    chars.next(); // consume the '^'
                    match chars.peek() {
                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
                        _ => Ok(Some(Token::Caret)),
                    }
                }
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect) => {
                    chars.next(); // consume the '#', starting a snowflake single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => {
                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
                                }
                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
                            }
                        }
                        _ => self.start_binop(chars, "~", Token::Tilde),
                    }
                }
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
                                }
                                _ => self.start_binop(chars, "#>", Token::HashArrow),
                            }
                        }
                        Some(' ') => Ok(Some(Token::Sharp)),
                        // some dialects allow '#' to start an identifier
                        Some(sch) if self.dialect.is_identifier_start('#') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "#", Token::Sharp),
                    }
                }
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => {
                            chars.next();
                            match chars.peek() {
                                Some(' ') => Ok(Some(Token::AtAt)),
                                Some(tch) if self.dialect.is_identifier_start('@') => {
                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
                                }
                                _ => Ok(Some(Token::AtAt)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::AtSign)),
                        Some(sch) if self.dialect.is_identifier_start('@') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                // Postgres uses ? for jsonb operators, not prepared statements
                '?' if dialect_of!(self is PostgreSqlDialect) => {
                    chars.next();
                    match chars.peek() {
                        Some('|') => self.consume_and_return(chars, Token::QuestionPipe),
                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
                        _ => self.consume_and_return(chars, Token::Question),
                    }
                }
                // '?' or '?N' placeholder
                '?' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
                }

                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    self.tokenize_identifier_or_keyword([ch], chars)
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                //whitespace check (including unicode chars) should be last as it covers some of the chars above
                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }
1435
1436    /// Consume the next character, then parse a custom binary operator. The next character should be included in the prefix
1437    fn consume_for_binop(
1438        &self,
1439        chars: &mut State,
1440        prefix: &str,
1441        default: Token,
1442    ) -> Result<Option<Token>, TokenizerError> {
1443        chars.next(); // consume the first char
1444        self.start_binop(chars, prefix, default)
1445    }
1446
1447    /// parse a custom binary operator
1448    fn start_binop(
1449        &self,
1450        chars: &mut State,
1451        prefix: &str,
1452        default: Token,
1453    ) -> Result<Option<Token>, TokenizerError> {
1454        let mut custom = None;
1455        while let Some(&ch) = chars.peek() {
1456            if !self.dialect.is_custom_operator_part(ch) {
1457                break;
1458            }
1459
1460            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
1461            chars.next();
1462        }
1463
1464        Ok(Some(
1465            custom.map(Token::CustomBinaryOperator).unwrap_or(default),
1466        ))
1467    }
1468
    /// Tokenize dollar preceded value (i.e: a string/placeholder)
    ///
    /// Handles, after the leading `$`:
    /// - `$$body$$`: an untagged dollar-quoted string (tag: `None`)
    /// - `$tag$body$tag$`: a tagged dollar-quoted string
    /// - `$name` / `$1`: a placeholder, when no second `$` follows the word
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        // `s` accumulates the quoted string body; `value` holds the tag (or
        // the placeholder name) read between the first two dollar signs.
        let mut s = String::new();
        let mut value = String::new();

        // Consume the leading `$`.
        chars.next();

        if let Some('$') = chars.peek() {
            // `$$`: untagged dollar-quoted string; scan for the closing `$$`.
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            // A `$` is never pushed immediately: it is withheld via `prev`
            // until the next character shows whether it begins the closing `$$`.
            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        // Second `$` of the closing delimiter.
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        // The withheld `$` was not a delimiter; emit it now.
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Read the tag (or placeholder name) following the `$`.
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric() || ch == '_'
            }));

            if let Some('$') = chars.peek() {
                // `$tag$`: tagged dollar-quoted string; scan for `$tag$`.
                chars.next();

                'searching_for_end: loop {
                    s.push_str(&peeking_take_while(chars, |ch| ch != '$'));
                    match chars.peek() {
                        Some('$') => {
                            chars.next();
                            // Tentatively match the closing delimiter; record
                            // everything consumed in `maybe_s` so it can be
                            // replayed into `s` if the match fails.
                            let mut maybe_s = String::from("$");
                            for c in value.chars() {
                                if let Some(next_char) = chars.next() {
                                    maybe_s.push(next_char);
                                    if next_char != c {
                                        // This doesn't match the dollar quote delimiter so this
                                        // is not the end of the string.
                                        s.push_str(&maybe_s);
                                        continue 'searching_for_end;
                                    }
                                } else {
                                    return self.tokenizer_error(
                                        chars.location(),
                                        "Unterminated dollar-quoted, expected $",
                                    );
                                }
                            }
                            if chars.peek() == Some(&'$') {
                                chars.next();
                                maybe_s.push('$');
                                // maybe_s matches the end delimiter
                                break 'searching_for_end;
                            } else {
                                // This also doesn't match the dollar quote delimiter as there are
                                // more characters before the second dollar so this is not the end
                                // of the string.
                                s.push_str(&maybe_s);
                                continue 'searching_for_end;
                            }
                        }
                        _ => {
                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted, expected $",
                            )
                        }
                    }
                }
            } else {
                // No second `$`: a plain placeholder such as `$1` or `$name`.
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }
1569
1570    fn tokenizer_error<R>(
1571        &self,
1572        loc: Location,
1573        message: impl Into<String>,
1574    ) -> Result<R, TokenizerError> {
1575        Err(TokenizerError {
1576            message: message.into(),
1577            location: loc,
1578        })
1579    }
1580
1581    // Consume characters until newline
1582    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
1583        let mut comment = peeking_take_while(chars, |ch| ch != '\n');
1584        if let Some(ch) = chars.next() {
1585            assert_eq!(ch, '\n');
1586            comment.push(ch);
1587        }
1588        comment
1589    }
1590
1591    /// Tokenize an identifier or keyword, after the first char is already consumed.
1592    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
1593        let mut s = first_chars.into();
1594        s.push_str(&peeking_take_while(chars, |ch| {
1595            self.dialect.is_identifier_part(ch)
1596        }));
1597        s
1598    }
1599
1600    /// Read a single quoted string, starting with the opening quote.
1601    fn tokenize_escaped_single_quoted_string(
1602        &self,
1603        starting_loc: Location,
1604        chars: &mut State,
1605    ) -> Result<String, TokenizerError> {
1606        if let Some(s) = unescape_single_quoted_string(chars) {
1607            return Ok(s);
1608        }
1609
1610        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
1611    }
1612
1613    /// Reads a string literal quoted by a single or triple quote characters.
1614    /// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
1615    fn tokenize_single_or_triple_quoted_string<F>(
1616        &self,
1617        chars: &mut State,
1618        quote_style: char,
1619        backslash_escape: bool,
1620        single_quote_token: F,
1621        triple_quote_token: F,
1622    ) -> Result<Option<Token>, TokenizerError>
1623    where
1624        F: Fn(String) -> Token,
1625    {
1626        let error_loc = chars.location();
1627
1628        let mut num_opening_quotes = 0u8;
1629        for _ in 0..3 {
1630            if Some(&quote_style) == chars.peek() {
1631                chars.next(); // Consume quote.
1632                num_opening_quotes += 1;
1633            } else {
1634                break;
1635            }
1636        }
1637
1638        let (token_fn, num_quote_chars) = match num_opening_quotes {
1639            1 => (single_quote_token, NumStringQuoteChars::One),
1640            2 => {
1641                // If we matched double quotes, then this is an empty string.
1642                return Ok(Some(single_quote_token("".into())));
1643            }
1644            3 => {
1645                let Some(num_quote_chars) = NonZeroU8::new(3) else {
1646                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
1647                };
1648                (
1649                    triple_quote_token,
1650                    NumStringQuoteChars::Many(num_quote_chars),
1651                )
1652            }
1653            _ => {
1654                return self.tokenizer_error(error_loc, "invalid string literal opening");
1655            }
1656        };
1657
1658        let settings = TokenizeQuotedStringSettings {
1659            quote_style,
1660            num_quote_chars,
1661            num_opening_quotes_to_consume: 0,
1662            backslash_escape,
1663        };
1664
1665        self.tokenize_quoted_string(chars, settings)
1666            .map(token_fn)
1667            .map(Some)
1668    }
1669
1670    /// Reads a string literal quoted by a single quote character.
1671    fn tokenize_single_quoted_string(
1672        &self,
1673        chars: &mut State,
1674        quote_style: char,
1675        backslash_escape: bool,
1676    ) -> Result<String, TokenizerError> {
1677        self.tokenize_quoted_string(
1678            chars,
1679            TokenizeQuotedStringSettings {
1680                quote_style,
1681                num_quote_chars: NumStringQuoteChars::One,
1682                num_opening_quotes_to_consume: 1,
1683                backslash_escape,
1684            },
1685        )
1686    }
1687
    /// Read a quoted string.
    ///
    /// Expects `chars` to be positioned on the first of
    /// `settings.num_opening_quotes_to_consume` opening quotes (possibly zero
    /// when the caller consumed them already). Returns the string body, with
    /// the closing quote(s) consumed but not included.
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        // Consume any opening quotes.
        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        // Number of quote chars seen in a row so far; used to recognize the
        // multi-character closing delimiter of triple-quoted strings.
        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            // `Some(..)` when one more quote char would complete the closing
            // delimiter (always the case for single-quoted strings).
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume

                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // For an initial string like `"""abc"""`, at this point we have
                        // `abc""` in the buffer and have now matched the final `"`.
                        // However, the string to return is simply `abc`, so we strip off
                        // the trailing quotes before returning.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        // Doubled quote (`''`): an escaped literal quote, not
                        // the terminator.
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        // A lone closing quote: end of the string.
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    // consume backslash
                    chars.next();

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely including backslashes.
                            s.push(ch);
                            s.push(*next);
                            chars.next(); // consume next
                        } else {
                            // Translate recognized escape sequences; any other
                            // escaped character stands for itself.
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next(); // consume next
                        }
                    }
                }
                ch => {
                    chars.next(); // consume ch

                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }
1789
1790    fn tokenize_multiline_comment(
1791        &self,
1792        chars: &mut State,
1793    ) -> Result<Option<Token>, TokenizerError> {
1794        let mut s = String::new();
1795        let mut nested = 1;
1796        let mut last_ch = ' ';
1797
1798        loop {
1799            match chars.next() {
1800                Some(ch) => {
1801                    if last_ch == '/' && ch == '*' {
1802                        nested += 1;
1803                    } else if last_ch == '*' && ch == '/' {
1804                        nested -= 1;
1805                        if nested == 0 {
1806                            s.pop();
1807                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
1808                        }
1809                    }
1810                    s.push(ch);
1811                    last_ch = ch;
1812                }
1813                None => {
1814                    break self.tokenizer_error(
1815                        chars.location(),
1816                        "Unexpected EOF while in a multi-line comment",
1817                    )
1818                }
1819            }
1820        }
1821    }
1822
1823    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
1824        let mut last_char = None;
1825        let mut s = String::new();
1826        while let Some(ch) = chars.next() {
1827            if ch == quote_end {
1828                if chars.peek() == Some(&quote_end) {
1829                    chars.next();
1830                    s.push(ch);
1831                    if !self.unescape {
1832                        // In no-escape mode, the given query has to be saved completely
1833                        s.push(ch);
1834                    }
1835                } else {
1836                    last_char = Some(quote_end);
1837                    break;
1838                }
1839            } else {
1840                s.push(ch);
1841            }
1842        }
1843        (s, last_char)
1844    }
1845
1846    #[allow(clippy::unnecessary_wraps)]
1847    fn consume_and_return(
1848        &self,
1849        chars: &mut State,
1850        t: Token,
1851    ) -> Result<Option<Token>, TokenizerError> {
1852        chars.next();
1853        Ok(Some(t))
1854    }
1855}
1856
1857/// Read from `chars` until `predicate` returns `false` or EOF is hit.
1858/// Return the characters read as String, and keep the first non-matching
1859/// char available as `chars.next()`.
1860fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
1861    let mut s = String::new();
1862    while let Some(&ch) = chars.peek() {
1863        if predicate(ch) {
1864            chars.next(); // consume
1865            s.push(ch);
1866        } else {
1867            break;
1868        }
1869    }
1870    s
1871}
1872
1873fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
1874    Unescape::new(chars).unescape()
1875}
1876
/// Helper that decodes escape sequences inside a single-quoted string
/// literal, consuming characters from the borrowed tokenizer state.
struct Unescape<'a: 'b, 'b> {
    chars: &'b mut State<'a>,
}
1880
1881impl<'a: 'b, 'b> Unescape<'a, 'b> {
1882    fn new(chars: &'b mut State<'a>) -> Self {
1883        Self { chars }
1884    }
1885    fn unescape(mut self) -> Option<String> {
1886        let mut unescaped = String::new();
1887
1888        self.chars.next();
1889
1890        while let Some(c) = self.chars.next() {
1891            if c == '\'' {
1892                // case: ''''
1893                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
1894                    self.chars.next();
1895                    unescaped.push('\'');
1896                    continue;
1897                }
1898                return Some(unescaped);
1899            }
1900
1901            if c != '\\' {
1902                unescaped.push(c);
1903                continue;
1904            }
1905
1906            let c = match self.chars.next()? {
1907                'b' => '\u{0008}',
1908                'f' => '\u{000C}',
1909                'n' => '\n',
1910                'r' => '\r',
1911                't' => '\t',
1912                'u' => self.unescape_unicode_16()?,
1913                'U' => self.unescape_unicode_32()?,
1914                'x' => self.unescape_hex()?,
1915                c if c.is_digit(8) => self.unescape_octal(c)?,
1916                c => c,
1917            };
1918
1919            unescaped.push(Self::check_null(c)?);
1920        }
1921
1922        None
1923    }
1924
1925    #[inline]
1926    fn check_null(c: char) -> Option<char> {
1927        if c == '\0' {
1928            None
1929        } else {
1930            Some(c)
1931        }
1932    }
1933
1934    #[inline]
1935    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
1936        // u32 is used here because Pg has an overflow operation rather than throwing an exception directly.
1937        match u32::from_str_radix(s, RADIX) {
1938            Err(_) => None,
1939            Ok(n) => {
1940                let n = n & 0xFF;
1941                if n <= 127 {
1942                    char::from_u32(n)
1943                } else {
1944                    None
1945                }
1946            }
1947        }
1948    }
1949
1950    // Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
1951    fn unescape_hex(&mut self) -> Option<char> {
1952        let mut s = String::new();
1953
1954        for _ in 0..2 {
1955            match self.next_hex_digit() {
1956                Some(c) => s.push(c),
1957                None => break,
1958            }
1959        }
1960
1961        if s.is_empty() {
1962            return Some('x');
1963        }
1964
1965        Self::byte_to_char::<16>(&s)
1966    }
1967
1968    #[inline]
1969    fn next_hex_digit(&mut self) -> Option<char> {
1970        match self.chars.peek() {
1971            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
1972            _ => None,
1973        }
1974    }
1975
1976    // Octal byte value. \o, \oo, \ooo (o = 0–7)
1977    fn unescape_octal(&mut self, c: char) -> Option<char> {
1978        let mut s = String::new();
1979
1980        s.push(c);
1981        for _ in 0..2 {
1982            match self.next_octal_digest() {
1983                Some(c) => s.push(c),
1984                None => break,
1985            }
1986        }
1987
1988        Self::byte_to_char::<8>(&s)
1989    }
1990
1991    #[inline]
1992    fn next_octal_digest(&mut self) -> Option<char> {
1993        match self.chars.peek() {
1994            Some(c) if c.is_digit(8) => self.chars.next(),
1995            _ => None,
1996        }
1997    }
1998
1999    // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
2000    fn unescape_unicode_16(&mut self) -> Option<char> {
2001        self.unescape_unicode::<4>()
2002    }
2003
2004    // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
2005    fn unescape_unicode_32(&mut self) -> Option<char> {
2006        self.unescape_unicode::<8>()
2007    }
2008
2009    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
2010        let mut s = String::new();
2011        for _ in 0..NUM {
2012            s.push(self.chars.next()?);
2013        }
2014        match u32::from_str_radix(&s, 16) {
2015            Err(_) => None,
2016            Ok(n) => char::from_u32(n),
2017        }
2018    }
2019}
2020
2021fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
2022    let mut unescaped = String::new();
2023    chars.next(); // consume the opening quote
2024    while let Some(c) = chars.next() {
2025        match c {
2026            '\'' => {
2027                if chars.peek() == Some(&'\'') {
2028                    chars.next();
2029                    unescaped.push('\'');
2030                } else {
2031                    return Ok(unescaped);
2032                }
2033            }
2034            '\\' => match chars.peek() {
2035                Some('\\') => {
2036                    chars.next();
2037                    unescaped.push('\\');
2038                }
2039                Some('+') => {
2040                    chars.next();
2041                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
2042                }
2043                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
2044            },
2045            _ => {
2046                unescaped.push(c);
2047            }
2048        }
2049    }
2050    Err(TokenizerError {
2051        message: "Unterminated unicode encoded string literal".to_string(),
2052        location: chars.location(),
2053    })
2054}
2055
2056fn take_char_from_hex_digits(
2057    chars: &mut State<'_>,
2058    max_digits: usize,
2059) -> Result<char, TokenizerError> {
2060    let mut result = 0u32;
2061    for _ in 0..max_digits {
2062        let next_char = chars.next().ok_or_else(|| TokenizerError {
2063            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
2064                .to_string(),
2065            location: chars.location(),
2066        })?;
2067        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
2068            message: format!("Invalid hex digit in escaped unicode string: {}", next_char),
2069            location: chars.location(),
2070        })?;
2071        result = result * 16 + digit;
2072    }
2073    char::from_u32(result).ok_or_else(|| TokenizerError {
2074        message: format!("Invalid unicode character: {:x}", result),
2075        location: chars.location(),
2076    })
2077}
2078
2079#[cfg(test)]
2080mod tests {
2081    use super::*;
2082    use crate::dialect::{
2083        BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect,
2084    };
2085    use core::fmt::Debug;
2086
2087    #[test]
2088    fn tokenizer_error_impl() {
2089        let err = TokenizerError {
2090            message: "test".into(),
2091            location: Location { line: 1, column: 1 },
2092        };
2093        #[cfg(feature = "std")]
2094        {
2095            use std::error::Error;
2096            assert!(err.source().is_none());
2097        }
2098        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
2099    }
2100
2101    #[test]
2102    fn tokenize_select_1() {
2103        let sql = String::from("SELECT 1");
2104        let dialect = GenericDialect {};
2105        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2106
2107        let expected = vec![
2108            Token::make_keyword("SELECT"),
2109            Token::Whitespace(Whitespace::Space),
2110            Token::Number(String::from("1"), false),
2111        ];
2112
2113        compare(expected, tokens);
2114    }
2115
2116    #[test]
2117    fn tokenize_select_float() {
2118        let sql = String::from("SELECT .1");
2119        let dialect = GenericDialect {};
2120        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2121
2122        let expected = vec![
2123            Token::make_keyword("SELECT"),
2124            Token::Whitespace(Whitespace::Space),
2125            Token::Number(String::from(".1"), false),
2126        ];
2127
2128        compare(expected, tokens);
2129    }
2130
2131    #[test]
2132    fn tokenize_clickhouse_double_equal() {
2133        let sql = String::from("SELECT foo=='1'");
2134        let dialect = ClickHouseDialect {};
2135        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2136        let tokens = tokenizer.tokenize().unwrap();
2137
2138        let expected = vec![
2139            Token::make_keyword("SELECT"),
2140            Token::Whitespace(Whitespace::Space),
2141            Token::Word(Word {
2142                value: "foo".to_string(),
2143                quote_style: None,
2144                keyword: Keyword::NoKeyword,
2145            }),
2146            Token::DoubleEq,
2147            Token::SingleQuotedString("1".to_string()),
2148        ];
2149
2150        compare(expected, tokens);
2151    }
2152
2153    #[test]
2154    fn tokenize_select_exponent() {
2155        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
2156        let dialect = GenericDialect {};
2157        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2158
2159        let expected = vec![
2160            Token::make_keyword("SELECT"),
2161            Token::Whitespace(Whitespace::Space),
2162            Token::Number(String::from("1e10"), false),
2163            Token::Comma,
2164            Token::Whitespace(Whitespace::Space),
2165            Token::Number(String::from("1e-10"), false),
2166            Token::Comma,
2167            Token::Whitespace(Whitespace::Space),
2168            Token::Number(String::from("1e+10"), false),
2169            Token::Comma,
2170            Token::Whitespace(Whitespace::Space),
2171            Token::Number(String::from("1"), false),
2172            Token::make_word("ea", None),
2173            Token::Comma,
2174            Token::Whitespace(Whitespace::Space),
2175            Token::Number(String::from("1e-10"), false),
2176            Token::make_word("a", None),
2177            Token::Comma,
2178            Token::Whitespace(Whitespace::Space),
2179            Token::Number(String::from("1e-10"), false),
2180            Token::Minus,
2181            Token::Number(String::from("10"), false),
2182        ];
2183
2184        compare(expected, tokens);
2185    }
2186
2187    #[test]
2188    fn tokenize_scalar_function() {
2189        let sql = String::from("SELECT sqrt(1)");
2190        let dialect = GenericDialect {};
2191        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2192
2193        let expected = vec![
2194            Token::make_keyword("SELECT"),
2195            Token::Whitespace(Whitespace::Space),
2196            Token::make_word("sqrt", None),
2197            Token::LParen,
2198            Token::Number(String::from("1"), false),
2199            Token::RParen,
2200        ];
2201
2202        compare(expected, tokens);
2203    }
2204
2205    #[test]
2206    fn tokenize_string_string_concat() {
2207        let sql = String::from("SELECT 'a' || 'b'");
2208        let dialect = GenericDialect {};
2209        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2210
2211        let expected = vec![
2212            Token::make_keyword("SELECT"),
2213            Token::Whitespace(Whitespace::Space),
2214            Token::SingleQuotedString(String::from("a")),
2215            Token::Whitespace(Whitespace::Space),
2216            Token::StringConcat,
2217            Token::Whitespace(Whitespace::Space),
2218            Token::SingleQuotedString(String::from("b")),
2219        ];
2220
2221        compare(expected, tokens);
2222    }
2223    #[test]
2224    fn tokenize_bitwise_op() {
2225        let sql = String::from("SELECT one | two ^ three");
2226        let dialect = GenericDialect {};
2227        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2228
2229        let expected = vec![
2230            Token::make_keyword("SELECT"),
2231            Token::Whitespace(Whitespace::Space),
2232            Token::make_word("one", None),
2233            Token::Whitespace(Whitespace::Space),
2234            Token::Pipe,
2235            Token::Whitespace(Whitespace::Space),
2236            Token::make_word("two", None),
2237            Token::Whitespace(Whitespace::Space),
2238            Token::Caret,
2239            Token::Whitespace(Whitespace::Space),
2240            Token::make_word("three", None),
2241        ];
2242        compare(expected, tokens);
2243    }
2244
2245    #[test]
2246    fn tokenize_logical_xor() {
2247        let sql =
2248            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
2249        let dialect = GenericDialect {};
2250        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2251
2252        let expected = vec![
2253            Token::make_keyword("SELECT"),
2254            Token::Whitespace(Whitespace::Space),
2255            Token::make_keyword("true"),
2256            Token::Whitespace(Whitespace::Space),
2257            Token::make_keyword("XOR"),
2258            Token::Whitespace(Whitespace::Space),
2259            Token::make_keyword("true"),
2260            Token::Comma,
2261            Token::Whitespace(Whitespace::Space),
2262            Token::make_keyword("false"),
2263            Token::Whitespace(Whitespace::Space),
2264            Token::make_keyword("XOR"),
2265            Token::Whitespace(Whitespace::Space),
2266            Token::make_keyword("false"),
2267            Token::Comma,
2268            Token::Whitespace(Whitespace::Space),
2269            Token::make_keyword("true"),
2270            Token::Whitespace(Whitespace::Space),
2271            Token::make_keyword("XOR"),
2272            Token::Whitespace(Whitespace::Space),
2273            Token::make_keyword("false"),
2274            Token::Comma,
2275            Token::Whitespace(Whitespace::Space),
2276            Token::make_keyword("false"),
2277            Token::Whitespace(Whitespace::Space),
2278            Token::make_keyword("XOR"),
2279            Token::Whitespace(Whitespace::Space),
2280            Token::make_keyword("true"),
2281        ];
2282        compare(expected, tokens);
2283    }
2284
2285    #[test]
2286    fn tokenize_simple_select() {
2287        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
2288        let dialect = GenericDialect {};
2289        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2290
2291        let expected = vec![
2292            Token::make_keyword("SELECT"),
2293            Token::Whitespace(Whitespace::Space),
2294            Token::Mul,
2295            Token::Whitespace(Whitespace::Space),
2296            Token::make_keyword("FROM"),
2297            Token::Whitespace(Whitespace::Space),
2298            Token::make_word("customer", None),
2299            Token::Whitespace(Whitespace::Space),
2300            Token::make_keyword("WHERE"),
2301            Token::Whitespace(Whitespace::Space),
2302            Token::make_word("id", None),
2303            Token::Whitespace(Whitespace::Space),
2304            Token::Eq,
2305            Token::Whitespace(Whitespace::Space),
2306            Token::Number(String::from("1"), false),
2307            Token::Whitespace(Whitespace::Space),
2308            Token::make_keyword("LIMIT"),
2309            Token::Whitespace(Whitespace::Space),
2310            Token::Number(String::from("5"), false),
2311        ];
2312
2313        compare(expected, tokens);
2314    }
2315
2316    #[test]
2317    fn tokenize_explain_select() {
2318        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
2319        let dialect = GenericDialect {};
2320        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2321
2322        let expected = vec![
2323            Token::make_keyword("EXPLAIN"),
2324            Token::Whitespace(Whitespace::Space),
2325            Token::make_keyword("SELECT"),
2326            Token::Whitespace(Whitespace::Space),
2327            Token::Mul,
2328            Token::Whitespace(Whitespace::Space),
2329            Token::make_keyword("FROM"),
2330            Token::Whitespace(Whitespace::Space),
2331            Token::make_word("customer", None),
2332            Token::Whitespace(Whitespace::Space),
2333            Token::make_keyword("WHERE"),
2334            Token::Whitespace(Whitespace::Space),
2335            Token::make_word("id", None),
2336            Token::Whitespace(Whitespace::Space),
2337            Token::Eq,
2338            Token::Whitespace(Whitespace::Space),
2339            Token::Number(String::from("1"), false),
2340        ];
2341
2342        compare(expected, tokens);
2343    }
2344
2345    #[test]
2346    fn tokenize_explain_analyze_select() {
2347        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
2348        let dialect = GenericDialect {};
2349        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2350
2351        let expected = vec![
2352            Token::make_keyword("EXPLAIN"),
2353            Token::Whitespace(Whitespace::Space),
2354            Token::make_keyword("ANALYZE"),
2355            Token::Whitespace(Whitespace::Space),
2356            Token::make_keyword("SELECT"),
2357            Token::Whitespace(Whitespace::Space),
2358            Token::Mul,
2359            Token::Whitespace(Whitespace::Space),
2360            Token::make_keyword("FROM"),
2361            Token::Whitespace(Whitespace::Space),
2362            Token::make_word("customer", None),
2363            Token::Whitespace(Whitespace::Space),
2364            Token::make_keyword("WHERE"),
2365            Token::Whitespace(Whitespace::Space),
2366            Token::make_word("id", None),
2367            Token::Whitespace(Whitespace::Space),
2368            Token::Eq,
2369            Token::Whitespace(Whitespace::Space),
2370            Token::Number(String::from("1"), false),
2371        ];
2372
2373        compare(expected, tokens);
2374    }
2375
2376    #[test]
2377    fn tokenize_string_predicate() {
2378        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
2379        let dialect = GenericDialect {};
2380        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2381
2382        let expected = vec![
2383            Token::make_keyword("SELECT"),
2384            Token::Whitespace(Whitespace::Space),
2385            Token::Mul,
2386            Token::Whitespace(Whitespace::Space),
2387            Token::make_keyword("FROM"),
2388            Token::Whitespace(Whitespace::Space),
2389            Token::make_word("customer", None),
2390            Token::Whitespace(Whitespace::Space),
2391            Token::make_keyword("WHERE"),
2392            Token::Whitespace(Whitespace::Space),
2393            Token::make_word("salary", None),
2394            Token::Whitespace(Whitespace::Space),
2395            Token::Neq,
2396            Token::Whitespace(Whitespace::Space),
2397            Token::SingleQuotedString(String::from("Not Provided")),
2398        ];
2399
2400        compare(expected, tokens);
2401    }
2402
2403    #[test]
2404    fn tokenize_invalid_string() {
2405        let sql = String::from("\n💝مصطفىh");
2406
2407        let dialect = GenericDialect {};
2408        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2409        // println!("tokens: {:#?}", tokens);
2410        let expected = vec![
2411            Token::Whitespace(Whitespace::Newline),
2412            Token::Char('💝'),
2413            Token::make_word("مصطفىh", None),
2414        ];
2415        compare(expected, tokens);
2416    }
2417
2418    #[test]
2419    fn tokenize_newline_in_string_literal() {
2420        let sql = String::from("'foo\r\nbar\nbaz'");
2421
2422        let dialect = GenericDialect {};
2423        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2424        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
2425        compare(expected, tokens);
2426    }
2427
2428    #[test]
2429    fn tokenize_unterminated_string_literal() {
2430        let sql = String::from("select 'foo");
2431
2432        let dialect = GenericDialect {};
2433        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2434        assert_eq!(
2435            tokenizer.tokenize(),
2436            Err(TokenizerError {
2437                message: "Unterminated string literal".to_string(),
2438                location: Location { line: 1, column: 8 },
2439            })
2440        );
2441    }
2442
2443    #[test]
2444    fn tokenize_unterminated_string_literal_utf8() {
2445        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");
2446
2447        let dialect = GenericDialect {};
2448        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2449        assert_eq!(
2450            tokenizer.tokenize(),
2451            Err(TokenizerError {
2452                message: "Unterminated string literal".to_string(),
2453                location: Location {
2454                    line: 1,
2455                    column: 35
2456                }
2457            })
2458        );
2459    }
2460
2461    #[test]
2462    fn tokenize_invalid_string_cols() {
2463        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");
2464
2465        let dialect = GenericDialect {};
2466        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2467        // println!("tokens: {:#?}", tokens);
2468        let expected = vec![
2469            Token::Whitespace(Whitespace::Newline),
2470            Token::Whitespace(Whitespace::Newline),
2471            Token::make_keyword("SELECT"),
2472            Token::Whitespace(Whitespace::Space),
2473            Token::Mul,
2474            Token::Whitespace(Whitespace::Space),
2475            Token::make_keyword("FROM"),
2476            Token::Whitespace(Whitespace::Space),
2477            Token::make_keyword("table"),
2478            Token::Whitespace(Whitespace::Tab),
2479            Token::Char('💝'),
2480            Token::make_word("مصطفىh", None),
2481        ];
2482        compare(expected, tokens);
2483    }
2484
2485    #[test]
2486    fn tokenize_dollar_quoted_string_tagged() {
2487        let sql = String::from(
2488            "SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$",
2489        );
2490        let dialect = GenericDialect {};
2491        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2492        let expected = vec![
2493            Token::make_keyword("SELECT"),
2494            Token::Whitespace(Whitespace::Space),
2495            Token::DollarQuotedString(DollarQuotedString {
2496                value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
2497                tag: Some("tag".into()),
2498            }),
2499        ];
2500        compare(expected, tokens);
2501    }
2502
2503    #[test]
2504    fn tokenize_dollar_quoted_string_tagged_unterminated() {
2505        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
2506        let dialect = GenericDialect {};
2507        assert_eq!(
2508            Tokenizer::new(&dialect, &sql).tokenize(),
2509            Err(TokenizerError {
2510                message: "Unterminated dollar-quoted, expected $".into(),
2511                location: Location {
2512                    line: 1,
2513                    column: 91
2514                }
2515            })
2516        );
2517    }
2518
2519    #[test]
2520    fn tokenize_dollar_quoted_string_untagged() {
2521        let sql =
2522            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
2523        let dialect = GenericDialect {};
2524        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2525        let expected = vec![
2526            Token::make_keyword("SELECT"),
2527            Token::Whitespace(Whitespace::Space),
2528            Token::DollarQuotedString(DollarQuotedString {
2529                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
2530                tag: None,
2531            }),
2532        ];
2533        compare(expected, tokens);
2534    }
2535
2536    #[test]
2537    fn tokenize_dollar_quoted_string_untagged_unterminated() {
2538        let sql = String::from(
2539            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
2540        );
2541        let dialect = GenericDialect {};
2542        assert_eq!(
2543            Tokenizer::new(&dialect, &sql).tokenize(),
2544            Err(TokenizerError {
2545                message: "Unterminated dollar-quoted string".into(),
2546                location: Location {
2547                    line: 1,
2548                    column: 86
2549                }
2550            })
2551        );
2552    }
2553
2554    #[test]
2555    fn tokenize_right_arrow() {
2556        let sql = String::from("FUNCTION(key=>value)");
2557        let dialect = GenericDialect {};
2558        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2559        let expected = vec![
2560            Token::make_word("FUNCTION", None),
2561            Token::LParen,
2562            Token::make_word("key", None),
2563            Token::RArrow,
2564            Token::make_word("value", None),
2565            Token::RParen,
2566        ];
2567        compare(expected, tokens);
2568    }
2569
2570    #[test]
2571    fn tokenize_is_null() {
2572        let sql = String::from("a IS NULL");
2573        let dialect = GenericDialect {};
2574        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2575
2576        let expected = vec![
2577            Token::make_word("a", None),
2578            Token::Whitespace(Whitespace::Space),
2579            Token::make_keyword("IS"),
2580            Token::Whitespace(Whitespace::Space),
2581            Token::make_keyword("NULL"),
2582        ];
2583
2584        compare(expected, tokens);
2585    }
2586
2587    #[test]
2588    fn tokenize_comment() {
2589        let sql = String::from("0--this is a comment\n1");
2590
2591        let dialect = GenericDialect {};
2592        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2593        let expected = vec![
2594            Token::Number("0".to_string(), false),
2595            Token::Whitespace(Whitespace::SingleLineComment {
2596                prefix: "--".to_string(),
2597                comment: "this is a comment\n".to_string(),
2598            }),
2599            Token::Number("1".to_string(), false),
2600        ];
2601        compare(expected, tokens);
2602    }
2603
2604    #[test]
2605    fn tokenize_comment_at_eof() {
2606        let sql = String::from("--this is a comment");
2607
2608        let dialect = GenericDialect {};
2609        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2610        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
2611            prefix: "--".to_string(),
2612            comment: "this is a comment".to_string(),
2613        })];
2614        compare(expected, tokens);
2615    }
2616
2617    #[test]
2618    fn tokenize_multiline_comment() {
2619        let sql = String::from("0/*multi-line\n* /comment*/1");
2620
2621        let dialect = GenericDialect {};
2622        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2623        let expected = vec![
2624            Token::Number("0".to_string(), false),
2625            Token::Whitespace(Whitespace::MultiLineComment(
2626                "multi-line\n* /comment".to_string(),
2627            )),
2628            Token::Number("1".to_string(), false),
2629        ];
2630        compare(expected, tokens);
2631    }
2632
2633    #[test]
2634    fn tokenize_nested_multiline_comment() {
2635        let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
2636
2637        let dialect = GenericDialect {};
2638        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2639        let expected = vec![
2640            Token::Number("0".to_string(), false),
2641            Token::Whitespace(Whitespace::MultiLineComment(
2642                "multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
2643            )),
2644            Token::Number("1".to_string(), false),
2645        ];
2646        compare(expected, tokens);
2647    }
2648
2649    #[test]
2650    fn tokenize_multiline_comment_with_even_asterisks() {
2651        let sql = String::from("\n/** Comment **/\n");
2652
2653        let dialect = GenericDialect {};
2654        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2655        let expected = vec![
2656            Token::Whitespace(Whitespace::Newline),
2657            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
2658            Token::Whitespace(Whitespace::Newline),
2659        ];
2660        compare(expected, tokens);
2661    }
2662
2663    #[test]
2664    fn tokenize_unicode_whitespace() {
2665        let sql = String::from(" \u{2003}\n");
2666
2667        let dialect = GenericDialect {};
2668        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2669        let expected = vec![
2670            Token::Whitespace(Whitespace::Space),
2671            Token::Whitespace(Whitespace::Space),
2672            Token::Whitespace(Whitespace::Newline),
2673        ];
2674        compare(expected, tokens);
2675    }
2676
2677    #[test]
2678    fn tokenize_mismatched_quotes() {
2679        let sql = String::from("\"foo");
2680
2681        let dialect = GenericDialect {};
2682        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2683        assert_eq!(
2684            tokenizer.tokenize(),
2685            Err(TokenizerError {
2686                message: "Expected close delimiter '\"' before EOF.".to_string(),
2687                location: Location { line: 1, column: 1 },
2688            })
2689        );
2690    }
2691
2692    #[test]
2693    fn tokenize_newlines() {
2694        let sql = String::from("line1\nline2\rline3\r\nline4\r");
2695
2696        let dialect = GenericDialect {};
2697        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2698        let expected = vec![
2699            Token::make_word("line1", None),
2700            Token::Whitespace(Whitespace::Newline),
2701            Token::make_word("line2", None),
2702            Token::Whitespace(Whitespace::Newline),
2703            Token::make_word("line3", None),
2704            Token::Whitespace(Whitespace::Newline),
2705            Token::make_word("line4", None),
2706            Token::Whitespace(Whitespace::Newline),
2707        ];
2708        compare(expected, tokens);
2709    }
2710
2711    #[test]
2712    fn tokenize_mssql_top() {
2713        let sql = "SELECT TOP 5 [bar] FROM foo";
2714        let dialect = MsSqlDialect {};
2715        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2716        let expected = vec![
2717            Token::make_keyword("SELECT"),
2718            Token::Whitespace(Whitespace::Space),
2719            Token::make_keyword("TOP"),
2720            Token::Whitespace(Whitespace::Space),
2721            Token::Number(String::from("5"), false),
2722            Token::Whitespace(Whitespace::Space),
2723            Token::make_word("bar", Some('[')),
2724            Token::Whitespace(Whitespace::Space),
2725            Token::make_keyword("FROM"),
2726            Token::Whitespace(Whitespace::Space),
2727            Token::make_word("foo", None),
2728        ];
2729        compare(expected, tokens);
2730    }
2731
2732    #[test]
2733    fn tokenize_pg_regex_match() {
2734        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
2735        let dialect = GenericDialect {};
2736        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2737        let expected = vec![
2738            Token::make_keyword("SELECT"),
2739            Token::Whitespace(Whitespace::Space),
2740            Token::make_word("col", None),
2741            Token::Whitespace(Whitespace::Space),
2742            Token::Tilde,
2743            Token::Whitespace(Whitespace::Space),
2744            Token::SingleQuotedString("^a".into()),
2745            Token::Comma,
2746            Token::Whitespace(Whitespace::Space),
2747            Token::make_word("col", None),
2748            Token::Whitespace(Whitespace::Space),
2749            Token::TildeAsterisk,
2750            Token::Whitespace(Whitespace::Space),
2751            Token::SingleQuotedString("^a".into()),
2752            Token::Comma,
2753            Token::Whitespace(Whitespace::Space),
2754            Token::make_word("col", None),
2755            Token::Whitespace(Whitespace::Space),
2756            Token::ExclamationMarkTilde,
2757            Token::Whitespace(Whitespace::Space),
2758            Token::SingleQuotedString("^a".into()),
2759            Token::Comma,
2760            Token::Whitespace(Whitespace::Space),
2761            Token::make_word("col", None),
2762            Token::Whitespace(Whitespace::Space),
2763            Token::ExclamationMarkTildeAsterisk,
2764            Token::Whitespace(Whitespace::Space),
2765            Token::SingleQuotedString("^a".into()),
2766        ];
2767        compare(expected, tokens);
2768    }
2769
2770    #[test]
2771    fn tokenize_pg_like_match() {
2772        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
2773        let dialect = GenericDialect {};
2774        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2775        let expected = vec![
2776            Token::make_keyword("SELECT"),
2777            Token::Whitespace(Whitespace::Space),
2778            Token::make_word("col", None),
2779            Token::Whitespace(Whitespace::Space),
2780            Token::DoubleTilde,
2781            Token::Whitespace(Whitespace::Space),
2782            Token::SingleQuotedString("_a%".into()),
2783            Token::Comma,
2784            Token::Whitespace(Whitespace::Space),
2785            Token::make_word("col", None),
2786            Token::Whitespace(Whitespace::Space),
2787            Token::DoubleTildeAsterisk,
2788            Token::Whitespace(Whitespace::Space),
2789            Token::SingleQuotedString("_a%".into()),
2790            Token::Comma,
2791            Token::Whitespace(Whitespace::Space),
2792            Token::make_word("col", None),
2793            Token::Whitespace(Whitespace::Space),
2794            Token::ExclamationMarkDoubleTilde,
2795            Token::Whitespace(Whitespace::Space),
2796            Token::SingleQuotedString("_a%".into()),
2797            Token::Comma,
2798            Token::Whitespace(Whitespace::Space),
2799            Token::make_word("col", None),
2800            Token::Whitespace(Whitespace::Space),
2801            Token::ExclamationMarkDoubleTildeAsterisk,
2802            Token::Whitespace(Whitespace::Space),
2803            Token::SingleQuotedString("_a%".into()),
2804        ];
2805        compare(expected, tokens);
2806    }
2807
2808    #[test]
2809    fn tokenize_quoted_identifier() {
2810        let sql = r#" "a "" b" "a """ "c """"" "#;
2811        let dialect = GenericDialect {};
2812        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2813        let expected = vec![
2814            Token::Whitespace(Whitespace::Space),
2815            Token::make_word(r#"a " b"#, Some('"')),
2816            Token::Whitespace(Whitespace::Space),
2817            Token::make_word(r#"a ""#, Some('"')),
2818            Token::Whitespace(Whitespace::Space),
2819            Token::make_word(r#"c """#, Some('"')),
2820            Token::Whitespace(Whitespace::Space),
2821        ];
2822        compare(expected, tokens);
2823    }
2824
2825    #[test]
2826    fn tokenize_snowflake_div() {
2827        let sql = r#"field/1000"#;
2828        let dialect = SnowflakeDialect {};
2829        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
2830        let expected = vec![
2831            Token::make_word(r#"field"#, None),
2832            Token::Div,
2833            Token::Number("1000".to_string(), false),
2834        ];
2835        compare(expected, tokens);
2836    }
2837
2838    #[test]
2839    fn tokenize_quoted_identifier_with_no_escape() {
2840        let sql = r#" "a "" b" "a """ "c """"" "#;
2841        let dialect = GenericDialect {};
2842        let tokens = Tokenizer::new(&dialect, sql)
2843            .with_unescape(false)
2844            .tokenize()
2845            .unwrap();
2846        let expected = vec![
2847            Token::Whitespace(Whitespace::Space),
2848            Token::make_word(r#"a "" b"#, Some('"')),
2849            Token::Whitespace(Whitespace::Space),
2850            Token::make_word(r#"a """#, Some('"')),
2851            Token::Whitespace(Whitespace::Space),
2852            Token::make_word(r#"c """""#, Some('"')),
2853            Token::Whitespace(Whitespace::Space),
2854        ];
2855        compare(expected, tokens);
2856    }
2857
    #[test]
    fn tokenize_with_location() {
        // `tokenize_with_location` attaches a source span to every token.
        // Spans here are 1-based (line, column) pairs, with the end position
        // one past the token's last character (e.g. "SELECT" spans columns
        // 1..7 on line 1).
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (1, 7).into(),
                (1, 8).into(),
            ),
            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
            // The newline token's span ends at the start of line 2.
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Newline),
                (1, 10).into(),
                (2, 1).into(),
            ),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (2, 1).into(),
                (2, 2).into(),
            ),
            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
        ];
        compare(expected, tokens);
    }
2888
2889    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
2890        //println!("------------------------------");
2891        //println!("tokens   = {:?}", actual);
2892        //println!("expected = {:?}", expected);
2893        //println!("------------------------------");
2894        assert_eq!(expected, actual);
2895    }
2896
2897    fn check_unescape(s: &str, expected: Option<&str>) {
2898        let s = format!("'{}'", s);
2899        let mut state = State {
2900            peekable: s.chars().peekable(),
2901            line: 0,
2902            col: 0,
2903        };
2904
2905        assert_eq!(
2906            unescape_single_quoted_string(&mut state),
2907            expected.map(|s| s.to_string())
2908        );
2909    }
2910
2911    #[test]
2912    fn test_unescape() {
2913        check_unescape(r"\b", Some("\u{0008}"));
2914        check_unescape(r"\f", Some("\u{000C}"));
2915        check_unescape(r"\t", Some("\t"));
2916        check_unescape(r"\r\n", Some("\r\n"));
2917        check_unescape(r"\/", Some("/"));
2918        check_unescape(r"/", Some("/"));
2919        check_unescape(r"\\", Some("\\"));
2920
2921        // 16 and 32-bit hexadecimal Unicode character value
2922        check_unescape(r"\u0001", Some("\u{0001}"));
2923        check_unescape(r"\u4c91", Some("\u{4c91}"));
2924        check_unescape(r"\u4c916", Some("\u{4c91}6"));
2925        check_unescape(r"\u4c", None);
2926        check_unescape(r"\u0000", None);
2927        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
2928        check_unescape(r"\U00110000", None);
2929        check_unescape(r"\U00000000", None);
2930        check_unescape(r"\u", None);
2931        check_unescape(r"\U", None);
2932        check_unescape(r"\U1010FFFF", None);
2933
2934        // hexadecimal byte value
2935        check_unescape(r"\x4B", Some("\u{004b}"));
2936        check_unescape(r"\x4", Some("\u{0004}"));
2937        check_unescape(r"\x4L", Some("\u{0004}L"));
2938        check_unescape(r"\x", Some("x"));
2939        check_unescape(r"\xP", Some("xP"));
2940        check_unescape(r"\x0", None);
2941        check_unescape(r"\xCAD", None);
2942        check_unescape(r"\xA9", None);
2943
2944        // octal byte value
2945        check_unescape(r"\1", Some("\u{0001}"));
2946        check_unescape(r"\12", Some("\u{000a}"));
2947        check_unescape(r"\123", Some("\u{0053}"));
2948        check_unescape(r"\1232", Some("\u{0053}2"));
2949        check_unescape(r"\4", Some("\u{0004}"));
2950        check_unescape(r"\45", Some("\u{0025}"));
2951        check_unescape(r"\450", Some("\u{0028}"));
2952        check_unescape(r"\603", None);
2953        check_unescape(r"\0", None);
2954        check_unescape(r"\080", None);
2955
2956        // others
2957        check_unescape(r"\9", Some("9"));
2958        check_unescape(r"''", Some("'"));
2959        check_unescape(
2960            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
2961            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
2962        );
2963        check_unescape(r"Hello\0", None);
2964        check_unescape(r"Hello\xCADRust", None);
2965    }
2966
2967    #[test]
2968    fn tokenize_numeric_prefix_trait() {
2969        #[derive(Debug)]
2970        struct NumericPrefixDialect;
2971
2972        impl Dialect for NumericPrefixDialect {
2973            fn is_identifier_start(&self, ch: char) -> bool {
2974                ch.is_ascii_lowercase()
2975                    || ch.is_ascii_uppercase()
2976                    || ch.is_ascii_digit()
2977                    || ch == '$'
2978            }
2979
2980            fn is_identifier_part(&self, ch: char) -> bool {
2981                ch.is_ascii_lowercase()
2982                    || ch.is_ascii_uppercase()
2983                    || ch.is_ascii_digit()
2984                    || ch == '_'
2985                    || ch == '$'
2986                    || ch == '{'
2987                    || ch == '}'
2988            }
2989
2990            fn supports_numeric_prefix(&self) -> bool {
2991                true
2992            }
2993        }
2994
2995        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
2996        tokenize_numeric_prefix_inner(&HiveDialect {});
2997        tokenize_numeric_prefix_inner(&MySqlDialect {});
2998    }
2999
3000    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
3001        let sql = r#"SELECT * FROM 1"#;
3002        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
3003        let expected = vec![
3004            Token::make_keyword("SELECT"),
3005            Token::Whitespace(Whitespace::Space),
3006            Token::Mul,
3007            Token::Whitespace(Whitespace::Space),
3008            Token::make_keyword("FROM"),
3009            Token::Whitespace(Whitespace::Space),
3010            Token::Number(String::from("1"), false),
3011        ];
3012        compare(expected, tokens);
3013    }
3014
3015    #[test]
3016    fn tokenize_quoted_string_escape() {
3017        let dialect = SnowflakeDialect {};
3018        for (sql, expected, expected_unescaped) in [
3019            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
3020            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
3021            (r#"'\\'"#, r#"\\"#, r#"\"#),
3022            (
3023                r#"'\0\a\b\f\n\r\t\Z'"#,
3024                r#"\0\a\b\f\n\r\t\Z"#,
3025                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
3026            ),
3027            (r#"'\"'"#, r#"\""#, "\""),
3028            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
3029            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
3030            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
3031        ] {
3032            let tokens = Tokenizer::new(&dialect, sql)
3033                .with_unescape(false)
3034                .tokenize()
3035                .unwrap();
3036            let expected = vec![Token::SingleQuotedString(expected.to_string())];
3037            compare(expected, tokens);
3038
3039            let tokens = Tokenizer::new(&dialect, sql)
3040                .with_unescape(true)
3041                .tokenize()
3042                .unwrap();
3043            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
3044            compare(expected, tokens);
3045        }
3046
3047        for sql in [r#"'\'"#, r#"'ab\'"#] {
3048            let mut tokenizer = Tokenizer::new(&dialect, sql);
3049            assert_eq!(
3050                "Unterminated string literal",
3051                tokenizer.tokenize().unwrap_err().message.as_str(),
3052            );
3053        }
3054
3055        // Non-escape dialect
3056        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
3057            let dialect = GenericDialect {};
3058            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
3059
3060            let expected = vec![Token::SingleQuotedString(expected.to_string())];
3061
3062            compare(expected, tokens);
3063        }
3064    }
3065
    #[test]
    fn tokenize_triple_quoted_string() {
        // Exercises BigQuery-style triple-quoted strings for both quote characters.
        // `check` runs the full matrix once per quote kind; `q` is the quote under
        // test, `r` the opposite quote (which must pass through unescaped), and
        // `quote_token` builds the expected token variant for `q`.
        fn check<F>(
            q: char, // The quote character to test
            r: char, // An alternate quote character.
            quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            // Each case: (input SQL, raw token text with escapes kept,
            // unescaped token text). The raw strings below contain literal
            // backslashes; `\{q}` is a backslash followed by the quote char.
            for (sql, expected, expected_unescaped) in [
                // Empty string
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                // Should not count escaped quote as end of string.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                // Simple string
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                // Mix single-double quotes unescaped.
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                // Escaped quote.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                // backslash-escaped quote characters.
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                // backslash-escaped characters
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                // With unescaping disabled the escape sequences stay verbatim.
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                // With unescaping enabled the sequences are resolved.
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            // Inputs missing a full closing triple quote (including ones ending
            // in an escaped quote) must report an unterminated string literal.
            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        // Two adjacent empty strings of different quote kinds must not be
        // mistaken for the start of a triple-quoted string.
        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Same check with the quote kinds in the opposite order.
        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Non-triple quoted string dialect
        // (Snowflake reads `''''''` as one string containing two escaped quotes.)
        let dialect = SnowflakeDialect {};
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }
3185}