Skip to main content

sqlglot_rust/tokens/
tokenizer.rs

1use crate::errors::{Result, SqlglotError};
2use crate::tokens::{Token, TokenType};
3
/// Returns `true` when `c` may begin an unquoted identifier.
///
/// The accepted set is the ASCII underscore plus every character Rust
/// classifies as alphabetic (`char::is_alphabetic`), so Unicode letters are
/// allowed in leading position. Digits and `$` are not: they are only valid
/// later in an identifier.
#[inline]
fn is_identifier_start(c: char) -> bool {
    match c {
        '_' => true,
        other => other.is_alphabetic(),
    }
}
11
/// Returns `true` when `c` may appear after the first character of an
/// unquoted identifier.
///
/// The decision is made in order:
/// 1. `_`, `$`, and anything `char::is_alphanumeric` accepts are allowed.
/// 2. Every remaining ASCII character, and any whitespace or control
///    character (ASCII or not), is rejected.
/// 3. Of the remaining non-ASCII characters, only typographic quotes and
///    guillemets (`«` `»` `‘` `’` `“` `”`) are rejected; everything else
///    (for example `°`, `±`, `€`) is folded into the identifier tail so
///    names such as `n°` or `tag€` lex as a single identifier.
#[inline]
fn is_identifier_continue(c: char) -> bool {
    match c {
        '_' | '$' => true,
        ch if ch.is_alphanumeric() => true,
        // Remaining ASCII is punctuation/operators; whitespace and control
        // characters never belong to an identifier.
        ch if ch.is_ascii() || ch.is_whitespace() || ch.is_control() => false,
        // Non-ASCII quote-like characters that act as delimiters.
        '\u{00AB}' | '\u{00BB}' // « »
        | '\u{2018}' | '\u{2019}' // ‘ ’
        | '\u{201C}' | '\u{201D}' // “ ”
        => false,
        // Any other non-ASCII printable character is accepted.
        _ => true,
    }
}
36
37/// SQL tokenizer that converts a SQL string into a stream of tokens.
38///
39/// Tracks line and column numbers for error reporting. Supports:
40/// - Single-line comments (`--`)
41/// - Block comments (`/* ... */`)
42/// - Quoted identifiers (`"..."` and backtick)
43/// - String literals with escape handling
44/// - Multi-character operators (`<=`, `>=`, `<>`, `!=`, `||`, `::`, `->`, `->>`)
45pub struct Tokenizer {
46    input: Vec<char>,
47    pos: usize,
48    line: usize,
49    col: usize,
50    /// Whether to preserve comments as tokens.
51    pub preserve_comments: bool,
52    /// Last non-whitespace / non-comment token type emitted. Used by the
53    /// `[` handler to disambiguate bracket-quoted identifiers from array
54    /// subscripts.
55    prev_token_type: Option<TokenType>,
56}
57
58impl Tokenizer {
59    /// Create a new tokenizer for the given SQL input.
60    #[must_use]
61    pub fn new(input: &str) -> Self {
62        Self {
63            input: input.chars().collect(),
64            pos: 0,
65            line: 1,
66            col: 1,
67            preserve_comments: false,
68            prev_token_type: None,
69        }
70    }
71
72    /// Create a tokenizer that preserves comment tokens.
73    #[must_use]
74    pub fn with_comments(input: &str) -> Self {
75        Self {
76            input: input.chars().collect(),
77            pos: 0,
78            line: 1,
79            col: 1,
80            preserve_comments: true,
81            prev_token_type: None,
82        }
83    }
84
85    /// Tokenize the entire input and return a vector of tokens.
86    ///
87    /// Whitespace tokens are skipped. Comments are optionally preserved.
88    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
89        let mut tokens = Vec::new();
90        loop {
91            let token = self.next_token()?;
92            match token.token_type {
93                TokenType::Eof => {
94                    tokens.push(token);
95                    break;
96                }
97                TokenType::Whitespace => continue,
98                TokenType::LineComment | TokenType::BlockComment => {
99                    if self.preserve_comments {
100                        tokens.push(token);
101                    }
102                }
103                _ => {
104                    self.prev_token_type = Some(token.token_type.clone());
105                    tokens.push(token);
106                }
107            }
108        }
109        Ok(tokens)
110    }
111
112    fn peek(&self) -> Option<char> {
113        self.input.get(self.pos).copied()
114    }
115
116    fn peek_at(&self, offset: usize) -> Option<char> {
117        self.input.get(self.pos + offset).copied()
118    }
119
120    fn advance(&mut self) -> Option<char> {
121        let ch = self.input.get(self.pos).copied();
122        if let Some(c) = ch {
123            self.pos += 1;
124            if c == '\n' {
125                self.line += 1;
126                self.col = 1;
127            } else {
128                self.col += 1;
129            }
130        }
131        ch
132    }
133
134    fn make_token(
135        &self,
136        token_type: TokenType,
137        value: impl Into<String>,
138        start: usize,
139        start_line: usize,
140        start_col: usize,
141    ) -> Token {
142        Token::with_location(token_type, value, start, start_line, start_col)
143    }
144
145    fn next_token(&mut self) -> Result<Token> {
146        // Skip whitespace
147        while self.peek().is_some_and(|c| c.is_whitespace()) {
148            self.advance();
149        }
150
151        let start = self.pos;
152        let start_line = self.line;
153        let start_col = self.col;
154
155        let Some(ch) = self.advance() else {
156            return Ok(self.make_token(TokenType::Eof, "", start, start_line, start_col));
157        };
158
159        match ch {
160            // ── Punctuation ─────────────────────────────────────────
161            '(' => Ok(self.make_token(TokenType::LParen, "(", start, start_line, start_col)),
162            ')' => Ok(self.make_token(TokenType::RParen, ")", start, start_line, start_col)),
163            '[' => {
164                // Decide between two readings of `[`:
165                //   1. Bracket-quoted identifier (T-SQL / SQLite style): `[name]`,
166                //      `[#]`, `[1]`, `[User Link]`. Inner content may be anything
167                //      except `]` or newline.
168                //   2. Array subscript / element selector: `arr[1]`, `arr[1:5]`.
169                //
170                // Disambiguate on the previously emitted token: array subscript
171                // requires a subscriptable value on its left (closing paren /
172                // closing bracket / identifier / string / number). After
173                // statement-start, `AS`, `(`, `,`, operators, `BY`, etc. the
174                // bracket can only be a quoted identifier.
175                let prev_is_subscriptable = matches!(
176                    self.prev_token_type,
177                    Some(
178                        TokenType::Identifier
179                            | TokenType::RParen
180                            | TokenType::RBracket
181                            | TokenType::String
182                            | TokenType::Number
183                            // Type keywords commonly preceding array modifier `TYPE[N]`
184                            | TokenType::Int
185                            | TokenType::Integer
186                            | TokenType::BigInt
187                            | TokenType::SmallInt
188                            | TokenType::TinyInt
189                            | TokenType::Float
190                            | TokenType::Double
191                            | TokenType::Decimal
192                            | TokenType::Numeric
193                            | TokenType::Real
194                            | TokenType::Varchar
195                            | TokenType::Char
196                            | TokenType::Text
197                            | TokenType::Boolean
198                            | TokenType::Bool
199                            | TokenType::Date
200                            | TokenType::Timestamp
201                            | TokenType::TimestampTz
202                            | TokenType::Time
203                            | TokenType::Interval
204                            | TokenType::Blob
205                            | TokenType::Bytea
206                            | TokenType::Json
207                            | TokenType::Jsonb
208                            | TokenType::Uuid
209                            | TokenType::Array
210                            | TokenType::Map
211                            | TokenType::Struct
212                    )
213                );
214
215                let mut looks_like_ident = false;
216                // Always try bracketed-ident interpretation when there is a
217                // space inside before `]` (e.g. `id [User Link]` — implicit
218                // alias). Real array subscripts never contain a literal space.
219                let mut has_space_inside = false;
220                let mut has_operator_inside = false;
221                if prev_is_subscriptable {
222                    let mut scan = self.pos;
223                    while scan < self.input.len() {
224                        let c = self.input[scan];
225                        if c == ']' {
226                            break;
227                        }
228                        if c == '\n' || c == '[' || c == ',' {
229                            break;
230                        }
231                        if c == ' ' || c == '\t' {
232                            has_space_inside = true;
233                        }
234                        if matches!(c, '+' | '-' | '*' | '/' | '%' | '=' | '<' | '>' | '!' | '&' | '|' | '^') {
235                            has_operator_inside = true;
236                        }
237                        scan += 1;
238                    }
239                }
240                if !prev_is_subscriptable || (has_space_inside && !has_operator_inside) {
241                    let mut scan = self.pos;
242                    let mut saw_quote = false;
243                    while scan < self.input.len() {
244                        let c = self.input[scan];
245                        if c == ']' {
246                            // For ARRAY/typed subscripts, a `'` inside means
247                            // it's a string literal cast (`array['lit'::T]`),
248                            // not a bracket identifier. For non-subscriptable
249                            // contexts (TSQL `[user's name]`), accept quotes.
250                            looks_like_ident = scan > self.pos
251                                && (!prev_is_subscriptable || !saw_quote);
252                            break;
253                        }
254                        // `,` rules out `ARRAY[1,2,3]` style literals.
255                        if c == '\n' || c == '[' || c == ',' {
256                            break;
257                        }
258                        if c == '\'' {
259                            saw_quote = true;
260                        }
261                        scan += 1;
262                    }
263                }
264                if looks_like_ident {
265                    self.read_quoted_identifier(start, start_line, start_col, '[')
266                } else {
267                    Ok(self.make_token(TokenType::LBracket, "[", start, start_line, start_col))
268                }
269            }
270            ']' => Ok(self.make_token(TokenType::RBracket, "]", start, start_line, start_col)),
271            '{' => {
272                // ClickHouse parameter / typed placeholder `{name:Type}`.
273                // The name is identifier-like; the type may itself contain
274                // parens (e.g. `{ids:Array(UInt64)}`). Scan until the
275                // matching `}` and emit a single Parameter token; fall back
276                // to a plain `LBrace` otherwise.
277                if self
278                    .peek()
279                    .is_some_and(is_identifier_start)
280                {
281                    let mut i = 1usize;
282                    while self
283                        .peek_at(i)
284                        .is_some_and(|c| is_identifier_continue(c))
285                    {
286                        i += 1;
287                    }
288                    if self.peek_at(i) == Some(':') {
289                        let mut value = String::from('{');
290                        let mut depth = 0usize;
291                        loop {
292                            match self.peek() {
293                                None => break,
294                                Some('{') => {
295                                    depth += 1;
296                                    value.push('{');
297                                    self.advance();
298                                }
299                                Some('}') => {
300                                    if depth == 0 {
301                                        value.push('}');
302                                        self.advance();
303                                        return Ok(self.make_token(
304                                            TokenType::Parameter,
305                                            value,
306                                            start,
307                                            start_line,
308                                            start_col,
309                                        ));
310                                    }
311                                    depth -= 1;
312                                    value.push('}');
313                                    self.advance();
314                                }
315                                Some(c) => {
316                                    value.push(c);
317                                    self.advance();
318                                }
319                            }
320                        }
321                        return Err(SqlglotError::TokenizerError {
322                            message: "Unterminated parameter placeholder".into(),
323                            position: start,
324                        });
325                    }
326                }
327                Ok(self.make_token(TokenType::LBrace, "{", start, start_line, start_col))
328            }
329            '}' => Ok(self.make_token(TokenType::RBrace, "}", start, start_line, start_col)),
330            ',' => Ok(self.make_token(TokenType::Comma, ",", start, start_line, start_col)),
331            ';' => Ok(self.make_token(TokenType::Semicolon, ";", start, start_line, start_col)),
332            '.' => Ok(self.make_token(TokenType::Dot, ".", start, start_line, start_col)),
333            '+' => Ok(self.make_token(TokenType::Plus, "+", start, start_line, start_col)),
334            '~' => Ok(self.make_token(TokenType::BitwiseNot, "~", start, start_line, start_col)),
335            '@' => {
336                if self.peek() == Some('>') {
337                    self.advance();
338                    Ok(self.make_token(TokenType::AtArrow, "@>", start, start_line, start_col))
339                } else {
340                    Ok(self.make_token(TokenType::AtSign, "@", start, start_line, start_col))
341                }
342            }
343            '=' => Ok(self.make_token(TokenType::Eq, "=", start, start_line, start_col)),
344            '*' => Ok(self.make_token(TokenType::Star, "*", start, start_line, start_col)),
345            '%' => Ok(self.make_token(TokenType::Percent2, "%", start, start_line, start_col)),
346            '^' => Ok(self.make_token(TokenType::BitwiseXor, "^", start, start_line, start_col)),
347
348            // ── Colon ───────────────────────────────────────────────
349            ':' => {
350                if self.peek() == Some(':') {
351                    self.advance();
352                    Ok(self.make_token(TokenType::DoubleColon, "::", start, start_line, start_col))
353                } else {
354                    Ok(self.make_token(TokenType::Colon, ":", start, start_line, start_col))
355                }
356            }
357
358            // ── Minus / line comment / arrow ────────────────────────
359            '-' => {
360                if self.peek() == Some('-') {
361                    self.advance();
362                    let mut value = String::from("--");
363                    while self.peek().is_some_and(|c| c != '\n') {
364                        value.push(self.advance().unwrap());
365                    }
366                    Ok(
367                        self.make_token(
368                            TokenType::LineComment,
369                            value,
370                            start,
371                            start_line,
372                            start_col,
373                        ),
374                    )
375                } else if self.peek() == Some('>') {
376                    self.advance();
377                    if self.peek() == Some('>') {
378                        self.advance();
379                        Ok(self.make_token(
380                            TokenType::DoubleArrow,
381                            "->>",
382                            start,
383                            start_line,
384                            start_col,
385                        ))
386                    } else {
387                        Ok(self.make_token(TokenType::Arrow, "->", start, start_line, start_col))
388                    }
389                } else {
390                    Ok(self.make_token(TokenType::Minus, "-", start, start_line, start_col))
391                }
392            }
393
394            // ── Slash / block comment ───────────────────────────────
395            '/' => {
396                if self.peek() == Some('*') {
397                    self.advance();
398                    let mut value = String::from("/*");
399                    let mut depth = 1;
400                    while depth > 0 {
401                        match self.advance() {
402                            Some('*') if self.peek() == Some('/') => {
403                                self.advance();
404                                depth -= 1;
405                                value.push_str("*/");
406                            }
407                            Some('/') if self.peek() == Some('*') => {
408                                self.advance();
409                                depth += 1;
410                                value.push_str("/*");
411                            }
412                            Some(c) => value.push(c),
413                            None => {
414                                return Err(SqlglotError::TokenizerError {
415                                    message: "Unterminated block comment".into(),
416                                    position: start,
417                                });
418                            }
419                        }
420                    }
421                    Ok(self.make_token(
422                        TokenType::BlockComment,
423                        value,
424                        start,
425                        start_line,
426                        start_col,
427                    ))
428                } else {
429                    Ok(self.make_token(TokenType::Slash, "/", start, start_line, start_col))
430                }
431            }
432
433            // ── Less-than variants ──────────────────────────────────
434            '<' => {
435                if self.peek() == Some('=') {
436                    self.advance();
437                    Ok(self.make_token(TokenType::LtEq, "<=", start, start_line, start_col))
438                } else if self.peek() == Some('>') {
439                    self.advance();
440                    Ok(self.make_token(TokenType::Neq, "<>", start, start_line, start_col))
441                } else if self.peek() == Some('<') {
442                    self.advance();
443                    Ok(self.make_token(TokenType::ShiftLeft, "<<", start, start_line, start_col))                } else if self.peek() == Some('@') {
444                    self.advance();
445                    Ok(self.make_token(TokenType::ArrowAt, "<@", start, start_line, start_col))                } else {
446                    Ok(self.make_token(TokenType::Lt, "<", start, start_line, start_col))
447                }
448            }
449
450            // ── Greater-than variants ───────────────────────────────
451            '>' => {
452                if self.peek() == Some('=') {
453                    self.advance();
454                    Ok(self.make_token(TokenType::GtEq, ">=", start, start_line, start_col))
455                } else if self.peek() == Some('>') {
456                    self.advance();
457                    Ok(self.make_token(TokenType::ShiftRight, ">>", start, start_line, start_col))
458                } else {
459                    Ok(self.make_token(TokenType::Gt, ">", start, start_line, start_col))
460                }
461            }
462
463            // ── Bang ────────────────────────────────────────────────
464            '!' => {
465                if self.peek() == Some('=') {
466                    self.advance();
467                    Ok(self.make_token(TokenType::Neq, "!=", start, start_line, start_col))
468                } else {
469                    Err(SqlglotError::TokenizerError {
470                        message: format!("Unexpected character: {ch}"),
471                        position: start,
472                    })
473                }
474            }
475
476            // ── Pipe / BitwiseOr / Concat ───────────────────────────
477            '|' => {
478                if self.peek() == Some('|') {
479                    self.advance();
480                    Ok(self.make_token(TokenType::Concat, "||", start, start_line, start_col))
481                } else {
482                    Ok(self.make_token(TokenType::BitwiseOr, "|", start, start_line, start_col))
483                }
484            }
485
486            // ── Ampersand ───────────────────────────────────────────
487            '&' => Ok(self.make_token(TokenType::BitwiseAnd, "&", start, start_line, start_col)),
488
489            // ── Hash ────────────────────────────────────────────────
490            '#' => {
491                if self.peek() == Some('>') {
492                    self.advance();
493                    if self.peek() == Some('>') {
494                        self.advance();
495                        Ok(self.make_token(
496                            TokenType::HashDoubleArrow,
497                            "#>>",
498                            start,
499                            start_line,
500                            start_col,
501                        ))
502                    } else {
503                        Ok(self.make_token(
504                            TokenType::HashArrow,
505                            "#>",
506                            start,
507                            start_line,
508                            start_col,
509                        ))
510                    }
511                } else if self.peek() == Some('#') {
512                    // `##name##` — StackExchange Data Explorer style template
513                    // placeholder. Surface as a regular identifier so the
514                    // surrounding query parses. If we can't find a matching
515                    // closing `##` on the same line, fall through to the
516                    // line-comment behavior below.
517                    let save_pos = self.pos;
518                    let save_line = self.line;
519                    let save_col = self.col;
520                    self.advance(); // consume second `#`
521                    let inner_start = self.pos;
522                    let mut found_close = false;
523                    while let Some(c) = self.peek() {
524                        if c == '\n' {
525                            break;
526                        }
527                        if c == '#' && self.peek_at(1) == Some('#') {
528                            found_close = true;
529                            break;
530                        }
531                        self.advance();
532                    }
533                    if found_close {
534                        let value: String = self.input[inner_start..self.pos].iter().collect();
535                        self.advance(); // first closing `#`
536                        self.advance(); // second closing `#`
537                        return Ok(Token::with_quote(
538                            TokenType::Identifier,
539                            value,
540                            start,
541                            start_line,
542                            start_col,
543                            '#',
544                        ));
545                    }
546                    // Rewind and fall through to line-comment handling.
547                    self.pos = save_pos;
548                    self.line = save_line;
549                    self.col = save_col;
550                    let mut value = String::from("#");
551                    while self.peek().is_some_and(|c| c != '\n') {
552                        value.push(self.advance().unwrap());
553                    }
554                    Ok(
555                        self.make_token(
556                            TokenType::LineComment,
557                            value,
558                            start,
559                            start_line,
560                            start_col,
561                        ),
562                    )
563                } else if self.peek().is_some_and(|c| c.is_ascii_digit()) {
564                    // DuckDB `#N` positional column reference. Emit as a
565                    // Parameter so it parses inside expressions / ORDER BY.
566                    let mut value = String::from("#");
567                    while self.peek().is_some_and(|c| c.is_ascii_digit()) {
568                        value.push(self.advance().unwrap());
569                    }
570                    Ok(self.make_token(
571                        TokenType::Parameter,
572                        value,
573                        start,
574                        start_line,
575                        start_col,
576                    ))
577                } else {
578                    let mut value = String::from("#");
579                    while self.peek().is_some_and(|c| c != '\n') {
580                        value.push(self.advance().unwrap());
581                    }
582                    Ok(
583                        self.make_token(
584                            TokenType::LineComment,
585                            value,
586                            start,
587                            start_line,
588                            start_col,
589                        ),
590                    )
591                }
592            }
593
594            // ── String literals ─────────────────────────────────────
595            '\'' => self.read_string(start, start_line, start_col),
596
597            // ── Numbers ─────────────────────────────────────────────
598            c if c.is_ascii_digit() => self.read_number(start, start_line, start_col, c),
599
600            // ── Identifiers and keywords ────────────────────────────
601            c if is_identifier_start(c) => {
602                self.read_identifier(start, start_line, start_col, c)
603            }
604
605            // ── Quoted identifiers (double quote) ───────────────────
606            '"' => self.read_quoted_identifier(start, start_line, start_col, '"'),
607
608            // ── Backtick identifiers (MySQL, BigQuery) ──────────────
609            '`' => self.read_quoted_identifier(start, start_line, start_col, '`'),
610
611            // ── Parameter markers ───────────────────────────────────
612            '$' => {
613                // PostgreSQL dollar-quoted string literal: `$$body$$` or
614                // `$tag$body$tag$`. The tag is an optional identifier. We
615                // detect the opening sequence and scan to the matching
616                // closing sequence; the body may contain any characters.
617                if self.peek() == Some('$') {
618                    self.advance(); // closing $ of opening $$
619                    let mut value = String::new();
620                    while let Some(c) = self.peek() {
621                        if c == '$' && self.peek_at(1) == Some('$') {
622                            self.advance();
623                            self.advance();
624                            return Ok(self.make_token(
625                                TokenType::String,
626                                value,
627                                start,
628                                start_line,
629                                start_col,
630                            ));
631                        }
632                        value.push(self.advance().unwrap());
633                    }
634                    // Unterminated — fall back to the captured body as String.
635                    return Ok(self.make_token(
636                        TokenType::String,
637                        value,
638                        start,
639                        start_line,
640                        start_col,
641                    ));
642                }
643                // Speculative `$tag$ … $tag$` form. Only treat as a
644                // dollar-quote if the tokens after the tag actually form
645                // a valid closing sequence; otherwise fall through to
646                // the identifier / parameter handling below.
647                if self.peek().is_some_and(is_identifier_start) {
648                    let save_pos = self.pos;
649                    let save_line = self.line;
650                    let save_col = self.col;
651                    let mut tag = String::new();
652                    while self.peek().is_some_and(is_identifier_continue) {
653                        tag.push(self.advance().unwrap());
654                    }
655                    if self.peek() == Some('$') {
656                        self.advance();
657                        // Look ahead for matching `$tag$` close.
658                        let mut value = String::new();
659                        let mut closed = false;
660                        while let Some(c) = self.peek() {
661                            if c == '$' {
662                                // Test for the closing tag.
663                                let mut matched = true;
664                                for (i, ch) in tag.chars().enumerate() {
665                                    if self.peek_at(i + 1) != Some(ch) {
666                                        matched = false;
667                                        break;
668                                    }
669                                }
670                                if matched && self.peek_at(tag.len() + 1) == Some('$') {
671                                    // Consume `$tag$`.
672                                    for _ in 0..(tag.len() + 2) {
673                                        self.advance();
674                                    }
675                                    closed = true;
676                                    break;
677                                }
678                            }
679                            value.push(self.advance().unwrap());
680                        }
681                        if closed {
682                            return Ok(self.make_token(
683                                TokenType::String,
684                                value,
685                                start,
686                                start_line,
687                                start_col,
688                            ));
689                        }
690                    }
691                    // Not a dollar-quote; rewind and fall through to the
692                    // identifier path.
693                    self.pos = save_pos;
694                    self.line = save_line;
695                    self.col = save_col;
696                }
697                if self.peek() == Some('{') {
698                    // `${name}` template variable (DuckDB / shell-style). Consume
699                    // through the closing `}` and emit as a single Parameter token.
700                    let mut value = String::from("$");
701                    value.push(self.advance().unwrap()); // '{'
702                    while let Some(c) = self.peek() {
703                        value.push(self.advance().unwrap());
704                        if c == '}' {
705                            break;
706                        }
707                    }
708                    Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
709                } else if self.peek().is_some_and(|c| c.is_ascii_digit()) {
710                    let mut value = String::from("$");
711                    while self.peek().is_some_and(|c| c.is_ascii_digit()) {
712                        value.push(self.advance().unwrap());
713                    }
714                    Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
715                } else if self.peek().is_some_and(is_identifier_start) {
716                    // `$alias` / `$_`: identifier with a leading `$`. Appears
717                    // in auto-generated column names (e.g. `purse__$__`) and as
718                    // SELECT aliases (`AS $__`). PostgreSQL prepared-statement
719                    // parameters (`$1`, `$2`) keep the digits-only fast path
720                    // above; the `$<digit>` form cannot start an identifier so
721                    // there is no ambiguity.
722                    let mut value = String::from("$");
723                    while self.peek().is_some_and(is_identifier_continue) {
724                        value.push(self.advance().unwrap());
725                    }
726                    Ok(self.make_token(TokenType::Identifier, value, start, start_line, start_col))
727                } else {
728                    Ok(self.make_token(TokenType::Parameter, "$", start, start_line, start_col))
729                }
730            }
731
732            '?' => Ok(self.make_token(TokenType::Parameter, "?", start, start_line, start_col)),
733
734            _ => Err(SqlglotError::TokenizerError {
735                message: format!("Unexpected character: {ch}"),
736                position: start,
737            }),
738        }
739    }
740
741    fn read_string(&mut self, start: usize, start_line: usize, start_col: usize) -> Result<Token> {
742        let mut value = String::new();
743        loop {
744            match self.advance() {
745                Some('\'') => {
746                    if self.peek() == Some('\'') {
747                        self.advance();
748                        value.push('\'');
749                    } else {
750                        return Ok(self.make_token(
751                            TokenType::String,
752                            value,
753                            start,
754                            start_line,
755                            start_col,
756                        ));
757                    }
758                }
759                Some('\\') => match self.peek() {
760                    Some('\\') => {
761                        self.advance();
762                        value.push('\\');
763                    }
764                    Some('n') => {
765                        self.advance();
766                        value.push('\n');
767                    }
768                    Some('t') => {
769                        self.advance();
770                        value.push('\t');
771                    }
772                    Some('r') => {
773                        self.advance();
774                        value.push('\r');
775                    }
776                    Some('\'') => {
777                        self.advance();
778                        value.push('\'');
779                    }
780                    Some('"') => {
781                        self.advance();
782                        value.push('"');
783                    }
784                    Some('0') => {
785                        self.advance();
786                        value.push('\0');
787                    }
788                    Some('b') => {
789                        self.advance();
790                        value.push('\u{0008}');
791                    }
792                    Some('f') => {
793                        self.advance();
794                        value.push('\u{000C}');
795                    }
796                    Some('v') => {
797                        self.advance();
798                        value.push('\u{000B}');
799                    }
800                    Some('a') => {
801                        self.advance();
802                        value.push('\u{0007}');
803                    }
804                    Some(c) if c.is_ascii_alphanumeric() || c == '?' => {
805                        // Tolerate other escape sequences (e.g. ClickHouse
806                        // \xAA, \uXXXX, \?) by consuming the introducer
807                        // and keeping the literal character in the string.
808                        self.advance();
809                        value.push('\\');
810                        value.push(c);
811                    }
812                    _ => {
813                        value.push('\\');
814                    }
815                },
816                Some(c) => value.push(c),
817                None => {
818                    return Err(SqlglotError::TokenizerError {
819                        message: "Unterminated string literal".into(),
820                        position: start,
821                    });
822                }
823            }
824        }
825    }
826
827    fn read_number(
828        &mut self,
829        start: usize,
830        start_line: usize,
831        start_col: usize,
832        first: char,
833    ) -> Result<Token> {
834        let mut value = String::new();
835        value.push(first);
836
837        if first == '0' && self.peek().is_some_and(|c| c == 'x' || c == 'X') {
838            value.push(self.advance().unwrap());
839            while self
840                .peek()
841                .is_some_and(|c| c.is_ascii_hexdigit() || c == '_')
842            {
843                value.push(self.advance().unwrap());
844            }
845            // Optional binary-exponent suffix `pN` / `PN` for hex floats
846            // (`0x1p-1022`, `0x123p4`).
847            if self.peek().is_some_and(|c| c == 'p' || c == 'P') {
848                value.push(self.advance().unwrap());
849                if self.peek().is_some_and(|c| c == '+' || c == '-') {
850                    value.push(self.advance().unwrap());
851                }
852                while self.peek().is_some_and(|c| c.is_ascii_digit()) {
853                    value.push(self.advance().unwrap());
854                }
855            }
856            return Ok(self.make_token(TokenType::HexString, value, start, start_line, start_col));
857        }
858
859        while self
860            .peek()
861            .is_some_and(|c| c.is_ascii_digit() || c == '_')
862        {
863            value.push(self.advance().unwrap());
864        }
865
866        if self.peek() == Some('.')
867            && (self.peek_at(1).is_some_and(|c| c.is_ascii_digit())
868                || !self.peek_at(1).is_some_and(is_identifier_start))
869        {
870            value.push(self.advance().unwrap());
871            while self
872                .peek()
873                .is_some_and(|c| c.is_ascii_digit() || c == '_')
874            {
875                value.push(self.advance().unwrap());
876            }
877        }
878
879        if self.peek().is_some_and(|c| c == 'e' || c == 'E') {
880            value.push(self.advance().unwrap());
881            if self.peek().is_some_and(|c| c == '+' || c == '-') {
882                value.push(self.advance().unwrap());
883            }
884            while self.peek().is_some_and(|c| c.is_ascii_digit()) {
885                value.push(self.advance().unwrap());
886            }
887        }
888
889        // ClickHouse / Hive allow identifiers that start with digits
890        // (`03720_test_alter`, `1_table`). If the run of digits is butted
891        // directly against an identifier-continue character, treat the
892        // whole span as an identifier.
893        if !value.contains('.')
894            && !value.contains('e')
895            && !value.contains('E')
896            && self.peek().is_some_and(is_identifier_continue)
897        {
898            while self.peek().is_some_and(is_identifier_continue) {
899                value.push(self.advance().unwrap());
900            }
901            let token_type = Self::keyword_type(&value);
902            return Ok(self.make_token(token_type, value, start, start_line, start_col));
903        }
904
905        Ok(self.make_token(TokenType::Number, value, start, start_line, start_col))
906    }
907
908    fn read_identifier(
909        &mut self,
910        start: usize,
911        start_line: usize,
912        start_col: usize,
913        first: char,
914    ) -> Result<Token> {
915        let mut value = String::new();
916        value.push(first);
917        while self
918            .peek()
919            .is_some_and(is_identifier_continue)
920        {
921            // Don't swallow a `$` that starts a template variable
922            // (`${name}`) or a numbered parameter (`$1`) — those need to
923            // tokenize as their own Parameter token.
924            if self.peek() == Some('$') {
925                let next = self.peek_at(1);
926                if matches!(next, Some('{')) || next.is_some_and(|c| c.is_ascii_digit()) {
927                    break;
928                }
929            }
930            value.push(self.advance().unwrap());
931        }
932
933        // Phase 1 support: treat N'...' / n'...' as a string literal token.
934        // This unblocks Oracle/TSQL national string parsing without AST changes.
935        if value.len() == 1
936            && value
937                .as_bytes()
938                .first()
939                .is_some_and(|b| b.eq_ignore_ascii_case(&b'n'))
940            && self.peek() == Some('\'')
941        {
942            self.advance(); // consume opening quote
943            let mut token = self.read_string(start, start_line, start_col)?;
944            token.token_type = TokenType::NationalString;
945            return Ok(token);
946        }
947
948        // PostgreSQL / SQL standard string-literal prefixes:
949        //   E'...'  — escape string (backslash escapes processed)
950        //   B'...'  — bit string
951        //   X'...'  — hex / byte string
952        //   U&'...' — Unicode escape string (we accept the prefix and string;
953        //             the trailing `UESCAPE 'x'` clause is parser-side noise)
954        // Each prefix tokenizes as a single-char identifier; merge with the
955        // following `'...'` literal into a String token so the SQL parses.
956        if value.len() == 1
957            && value
958                .as_bytes()
959                .first()
960                .is_some_and(|b| matches!(b.to_ascii_uppercase(), b'E' | b'B' | b'X'))
961            && self.peek() == Some('\'')
962        {
963            self.advance();
964            return self.read_string(start, start_line, start_col);
965        }
966        // U&'...' — Unicode escape literal.
967        if value.len() == 1
968            && value
969                .as_bytes()
970                .first()
971                .is_some_and(|b| b.eq_ignore_ascii_case(&b'u'))
972            && self.peek() == Some('&')
973            && self.peek_at(1) == Some('\'')
974        {
975            self.advance(); // &
976            self.advance(); // '
977            return self.read_string(start, start_line, start_col);
978        }
979
980        let token_type = Self::keyword_type(&value);
981        Ok(self.make_token(token_type, value, start, start_line, start_col))
982    }
983
984    /// Map a word to its keyword token type, or `Identifier` if not a keyword.
985    fn keyword_type(word: &str) -> TokenType {
986        match word.to_uppercase().as_str() {
987            "SELECT" => TokenType::Select,
988            "FROM" => TokenType::From,
989            "WHERE" => TokenType::Where,
990            "AND" => TokenType::And,
991            "OR" => TokenType::Or,
992            "NOT" => TokenType::Not,
993            "AS" => TokenType::As,
994            "JOIN" => TokenType::Join,
995            "INNER" => TokenType::Inner,
996            "LEFT" => TokenType::Left,
997            "RIGHT" => TokenType::Right,
998            "FULL" => TokenType::Full,
999            "OUTER" => TokenType::Outer,
1000            "CROSS" => TokenType::Cross,
1001            "ON" => TokenType::On,
1002            "INSERT" => TokenType::Insert,
1003            "INTO" => TokenType::Into,
1004            "VALUES" => TokenType::Values,
1005            "UPDATE" => TokenType::Update,
1006            "SET" => TokenType::Set,
1007            "DELETE" => TokenType::Delete,
1008            "CREATE" => TokenType::Create,
1009            "TABLE" => TokenType::Table,
1010            "DROP" => TokenType::Drop,
1011            "ALTER" => TokenType::Alter,
1012            "INDEX" => TokenType::Index,
1013            "IF" => TokenType::If,
1014            "EXISTS" => TokenType::Exists,
1015            "IN" => TokenType::In,
1016            "IS" => TokenType::Is,
1017            "NULL" => TokenType::Null,
1018            "LIKE" => TokenType::Like,
1019            "ILIKE" => TokenType::ILike,
1020            "ESCAPE" => TokenType::Escape,
1021            "BETWEEN" => TokenType::Between,
1022            "CASE" => TokenType::Case,
1023            "WHEN" => TokenType::When,
1024            "THEN" => TokenType::Then,
1025            "ELSE" => TokenType::Else,
1026            "END" => TokenType::End,
1027            "ORDER" => TokenType::Order,
1028            "BY" => TokenType::By,
1029            "ASC" => TokenType::Asc,
1030            "DESC" => TokenType::Desc,
1031            "GROUP" => TokenType::Group,
1032            "HAVING" => TokenType::Having,
1033            "LIMIT" => TokenType::Limit,
1034            "OFFSET" => TokenType::Offset,
1035            "UNION" => TokenType::Union,
1036            "ALL" => TokenType::All,
1037            "DISTINCT" => TokenType::Distinct,
1038            "TRUE" => TokenType::True,
1039            "FALSE" => TokenType::False,
1040            "INTERSECT" => TokenType::Intersect,
1041            "EXCEPT" => TokenType::Except,
1042            "WITH" => TokenType::With,
1043            "RECURSIVE" => TokenType::Recursive,
1044            "ANY" => TokenType::Any,
1045            "SOME" => TokenType::Some,
1046            "CAST" => TokenType::Cast,
1047            "OVER" => TokenType::Over,
1048            "PARTITION" => TokenType::Partition,
1049            "WINDOW" => TokenType::Window,
1050            "ROWS" => TokenType::Rows,
1051            "RANGE" => TokenType::Range,
1052            "UNBOUNDED" => TokenType::Unbounded,
1053            "PRECEDING" => TokenType::Preceding,
1054            "FOLLOWING" => TokenType::Following,
1055            "FILTER" => TokenType::Filter,
1056            "INT" => TokenType::Int,
1057            "INTEGER" => TokenType::Integer,
1058            "BIGINT" => TokenType::BigInt,
1059            "SMALLINT" => TokenType::SmallInt,
1060            "TINYINT" => TokenType::TinyInt,
1061            "FLOAT" => TokenType::Float,
1062            "DOUBLE" => TokenType::Double,
1063            "DECIMAL" => TokenType::Decimal,
1064            "NUMERIC" => TokenType::Numeric,
1065            "REAL" => TokenType::Real,
1066            "VARCHAR" => TokenType::Varchar,
1067            "CHAR" | "CHARACTER" => TokenType::Char,
1068            "TEXT" => TokenType::Text,
1069            "BOOLEAN" | "BOOL" => TokenType::Boolean,
1070            "DATE" => TokenType::Date,
1071            "TIMESTAMP" => TokenType::Timestamp,
1072            "TIMESTAMPTZ" => TokenType::TimestampTz,
1073            "TIME" => TokenType::Time,
1074            "INTERVAL" => TokenType::Interval,
1075            "BLOB" => TokenType::Blob,
1076            "BYTEA" => TokenType::Bytea,
1077            "JSON" => TokenType::Json,
1078            "JSONB" => TokenType::Jsonb,
1079            "UUID" => TokenType::Uuid,
1080            "ARRAY" => TokenType::Array,
1081            "MAP" => TokenType::Map,
1082            "STRUCT" => TokenType::Struct,
1083            "PRIMARY" => TokenType::Primary,
1084            "KEY" => TokenType::Key,
1085            "FOREIGN" => TokenType::Foreign,
1086            "REFERENCES" => TokenType::References,
1087            "UNIQUE" => TokenType::Unique,
1088            "CHECK" => TokenType::Check,
1089            "DEFAULT" => TokenType::Default,
1090            "CONSTRAINT" => TokenType::Constraint,
1091            "AUTO_INCREMENT" | "AUTOINCREMENT" => TokenType::AutoIncrement,
1092            "CASCADE" => TokenType::Cascade,
1093            "RESTRICT" => TokenType::Restrict,
1094            "RETURNING" => TokenType::Returning,
1095            "CONFLICT" => TokenType::Conflict,
1096            "DO" => TokenType::Do,
1097            "NOTHING" => TokenType::Nothing,
1098            "REPLACE" => TokenType::Replace,
1099            "IGNORE" => TokenType::Ignore,
1100            "MERGE" => TokenType::Merge,
1101            "MATCHED" => TokenType::Matched,
1102            "USING" => TokenType::Using,
1103            "TRUNCATE" => TokenType::Truncate,
1104            "SCHEMA" => TokenType::Schema,
1105            "DATABASE" => TokenType::Database,
1106            "VIEW" => TokenType::View,
1107            "MATERIALIZED" => TokenType::Materialized,
1108            "TEMPORARY" => TokenType::Temporary,
1109            "TEMP" => TokenType::Temp,
1110            "BEGIN" => TokenType::Begin,
1111            "COMMIT" => TokenType::Commit,
1112            "ROLLBACK" => TokenType::Rollback,
1113            "SAVEPOINT" => TokenType::Savepoint,
1114            "TRANSACTION" => TokenType::Transaction,
1115            "EXPLAIN" => TokenType::Explain,
1116            "ANALYZE" => TokenType::Analyze,
1117            "SHOW" => TokenType::Show,
1118            "USE" => TokenType::Use,
1119            "GRANT" => TokenType::Grant,
1120            "REVOKE" => TokenType::Revoke,
1121            "LATERAL" => TokenType::Lateral,
1122            "UNNEST" => TokenType::Unnest,
1123            "PIVOT" => TokenType::Pivot,
1124            "UNPIVOT" => TokenType::Unpivot,
1125            "TABLESAMPLE" => TokenType::Tablesample,
1126            "FETCH" => TokenType::Fetch,
1127            "FIRST" => TokenType::First,
1128            "NEXT" => TokenType::Next,
1129            "ONLY" => TokenType::Only,
1130            "NULLS" => TokenType::Nulls,
1131            "RESPECT" => TokenType::Respect,
1132            "TOP" => TokenType::Top,
1133            "COLLATE" => TokenType::Collate,
1134            "QUALIFY" => TokenType::Qualify,
1135            "CUBE" => TokenType::Cube,
1136            "ROLLUP" => TokenType::Rollup,
1137            "GROUPING" => TokenType::Grouping,
1138            "SETS" => TokenType::Sets,
1139            "XOR" => TokenType::Xor,
1140            "EXTRACT" => TokenType::Extract,
1141            "EPOCH" => TokenType::Epoch,
1142            "YEAR" => TokenType::Year,
1143            "MONTH" => TokenType::Month,
1144            "DAY" => TokenType::Day,
1145            "HOUR" => TokenType::Hour,
1146            "MINUTE" => TokenType::Minute,
1147            "SECOND" => TokenType::Second,
1148            _ => TokenType::Identifier,
1149        }
1150    }
1151
1152    fn read_quoted_identifier(
1153        &mut self,
1154        start: usize,
1155        start_line: usize,
1156        start_col: usize,
1157        quote: char,
1158    ) -> Result<Token> {
1159        let end_char = if quote == '[' { ']' } else { quote };
1160        let mut value = String::new();
1161        loop {
1162            match self.advance() {
1163                Some(c) if c == end_char => {
1164                    if self.peek() == Some(end_char) && end_char != ']' {
1165                        self.advance();
1166                        value.push(end_char);
1167                    } else {
1168                        return Ok(Token::with_quote(
1169                            TokenType::Identifier,
1170                            value,
1171                            start,
1172                            start_line,
1173                            start_col,
1174                            quote,
1175                        ));
1176                    }
1177                }
1178                Some(c) => value.push(c),
1179                None => {
1180                    return Err(SqlglotError::TokenizerError {
1181                        message: format!("Unterminated quoted identifier (expected {end_char})"),
1182                        position: start,
1183                    });
1184                }
1185            }
1186        }
1187    }
1188}
1189
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a tokenizer with the named constructor (`new` or
    /// `with_comments`), tokenizes `$sql`, and unwraps the result.
    macro_rules! lex {
        ($ctor:ident, $sql:expr) => {{
            let mut tokenizer = Tokenizer::$ctor($sql);
            tokenizer.tokenize().unwrap()
        }};
    }

    // --- Basic statements and literals ---

    #[test]
    fn test_tokenize_simple_select() {
        let tokens = lex!(new, "SELECT a, b FROM t");
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[2].token_type, TokenType::Comma);
        assert_eq!(tokens[4].token_type, TokenType::From);
        assert_eq!(tokens[6].token_type, TokenType::Eof);
        // The three bare names sit at the odd positions.
        for (idx, name) in [(1, "a"), (3, "b"), (5, "t")] {
            assert_eq!(tokens[idx].token_type, TokenType::Identifier);
            assert_eq!(tokens[idx].value, name);
        }
    }

    #[test]
    fn test_tokenize_string_literal() {
        let tokens = lex!(new, "'hello world'");
        let literal = &tokens[0];
        assert_eq!(literal.token_type, TokenType::String);
        assert_eq!(literal.value, "hello world");
    }

    #[test]
    fn test_tokenize_operators() {
        let tokens = lex!(new, "a >= 1 AND b != 2");
        assert_eq!(tokens[1].token_type, TokenType::GtEq);
        assert_eq!(tokens[3].token_type, TokenType::And);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }

    #[test]
    fn test_tokenize_number() {
        let tokens = lex!(new, "123.45");
        let number = &tokens[0];
        assert_eq!(number.token_type, TokenType::Number);
        assert_eq!(number.value, "123.45");
    }

    // --- Comments (only emitted by the `with_comments` constructor) ---

    #[test]
    fn test_tokenize_line_comment() {
        let tokens = lex!(with_comments, "SELECT 1 -- comment\nFROM t");
        let found = tokens
            .iter()
            .any(|t| t.token_type == TokenType::LineComment);
        assert!(found);
    }

    #[test]
    fn test_tokenize_block_comment() {
        let tokens = lex!(with_comments, "SELECT /* hello */ 1");
        let found = tokens
            .iter()
            .any(|t| t.token_type == TokenType::BlockComment);
        assert!(found);
    }

    // --- Keywords and multi-character operators ---

    #[test]
    fn test_tokenize_cte_keywords() {
        let tokens = lex!(new, "WITH cte AS (SELECT 1) SELECT * FROM cte");
        assert_eq!(tokens[0].token_type, TokenType::With);
        assert_eq!(tokens[2].token_type, TokenType::As);
    }

    #[test]
    fn test_tokenize_double_colon() {
        let tokens = lex!(new, "x::int");
        assert_eq!(tokens[1].token_type, TokenType::DoubleColon);
    }

    #[test]
    fn test_tokenize_cast() {
        let tokens = lex!(new, "CAST(x AS INT)");
        assert_eq!(tokens[0].token_type, TokenType::Cast);
    }

    #[test]
    fn test_tokenize_window() {
        let tokens = lex!(new, "ROW_NUMBER() OVER (PARTITION BY id ORDER BY name)");
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Over));
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Partition));
    }

    #[test]
    fn test_line_tracking() {
        let tokens = lex!(new, "SELECT\n  1");
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[1].line, 2);
    }

    #[test]
    fn test_tokenize_union_intersect_except() {
        let tokens = lex!(new, "UNION INTERSECT EXCEPT");
        assert_eq!(tokens[0].token_type, TokenType::Union);
        assert_eq!(tokens[1].token_type, TokenType::Intersect);
        assert_eq!(tokens[2].token_type, TokenType::Except);
    }

    // --- N'...' national string literals ---

    #[test]
    fn test_tokenize_n_prefixed_string_literal_uppercase() {
        let tokens = lex!(new, "N'Hello'");
        let literal = &tokens[0];
        assert_eq!(literal.token_type, TokenType::NationalString);
        assert_eq!(literal.value, "Hello");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_lowercase() {
        let tokens = lex!(new, "n'hello'");
        let literal = &tokens[0];
        assert_eq!(literal.token_type, TokenType::NationalString);
        assert_eq!(literal.value, "hello");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_escaped_quote() {
        let tokens = lex!(new, "N'can''t stop'");
        let literal = &tokens[0];
        assert_eq!(literal.token_type, TokenType::NationalString);
        assert_eq!(literal.value, "can't stop");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_unicode() {
        let tokens = lex!(new, "N'テスト'");
        let literal = &tokens[0];
        assert_eq!(literal.token_type, TokenType::NationalString);
        assert_eq!(literal.value, "テスト");
    }

    // --- A bare `N` / `N…` word must stay an identifier ---

    #[test]
    fn test_tokenize_identifier_n_without_quote() {
        let tokens = lex!(new, "SELECT N FROM t");
        let column = &tokens[1];
        assert_eq!(column.token_type, TokenType::Identifier);
        assert_eq!(column.value, "N");
    }

    #[test]
    fn test_tokenize_identifier_name_starting_with_n() {
        let tokens = lex!(new, "SELECT NAME FROM t");
        let column = &tokens[1];
        assert_eq!(column.token_type, TokenType::Identifier);
        assert_eq!(column.value, "NAME");
    }
}