Skip to main content

sqlglot_rust/tokens/
tokenizer.rs

1use crate::errors::{Result, SqlglotError};
2use crate::tokens::{Token, TokenType};
3
/// Returns `true` when `c` may begin an unquoted identifier.
///
/// An identifier may start with an ASCII underscore or with any character
/// that `char::is_alphabetic` accepts, so Unicode letters are allowed. This
/// follows the SQL:2003 §5.2 rule for regular identifiers; digits and `$`
/// are not valid first characters.
#[inline]
fn is_identifier_start(c: char) -> bool {
    match c {
        '_' => true,
        other => other.is_alphabetic(),
    }
}
11
/// Returns `true` when `c` may appear after the first character of an
/// unquoted identifier.
///
/// The decision is made in this order:
/// 1. `_`, `$`, and every Unicode alphanumeric character are accepted.
/// 2. Any remaining ASCII character, any whitespace, and any control
///    character is rejected.
/// 3. Of the remaining non-ASCII characters, only the guillemets (`«`, `»`)
///    and the curly single/double quotes are rejected; everything else
///    (for example `°`, `±`, `€`) is accepted so identifiers such as `n°`
///    or `tag€` tokenize as one unit.
#[inline]
fn is_identifier_continue(c: char) -> bool {
    match c {
        '_' | '$' => true,
        _ if c.is_alphanumeric() => true,
        _ if c.is_ascii() || c.is_whitespace() || c.is_control() => false,
        // Typographic quote characters are kept out of identifier tails.
        '\u{00AB}' | '\u{00BB}' => false, // « »
        '\u{2018}' | '\u{2019}' => false, // ‘ ’
        '\u{201C}' | '\u{201D}' => false, // “ ”
        // Any other non-ASCII printable symbol folds into the identifier.
        _ => true,
    }
}
36
37/// SQL tokenizer that converts a SQL string into a stream of tokens.
38///
39/// Tracks line and column numbers for error reporting. Supports:
40/// - Single-line comments (`--`)
41/// - Block comments (`/* ... */`)
42/// - Quoted identifiers (`"..."` and backtick)
43/// - String literals with escape handling
44/// - Multi-character operators (`<=`, `>=`, `<>`, `!=`, `||`, `::`, `->`, `->>`)
45pub struct Tokenizer {
46    input: Vec<char>,
47    pos: usize,
48    line: usize,
49    col: usize,
50    /// Whether to preserve comments as tokens.
51    pub preserve_comments: bool,
52    /// Last non-whitespace / non-comment token type emitted. Used by the
53    /// `[` handler to disambiguate bracket-quoted identifiers from array
54    /// subscripts.
55    prev_token_type: Option<TokenType>,
56}
57
58impl Tokenizer {
59    /// Create a new tokenizer for the given SQL input.
60    #[must_use]
61    pub fn new(input: &str) -> Self {
62        Self {
63            input: input.chars().collect(),
64            pos: 0,
65            line: 1,
66            col: 1,
67            preserve_comments: false,
68            prev_token_type: None,
69        }
70    }
71
72    /// Create a tokenizer that preserves comment tokens.
73    #[must_use]
74    pub fn with_comments(input: &str) -> Self {
75        Self {
76            input: input.chars().collect(),
77            pos: 0,
78            line: 1,
79            col: 1,
80            preserve_comments: true,
81            prev_token_type: None,
82        }
83    }
84
85    /// Tokenize the entire input and return a vector of tokens.
86    ///
87    /// Whitespace tokens are skipped. Comments are optionally preserved.
88    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
89        let mut tokens = Vec::new();
90        loop {
91            let token = self.next_token()?;
92            match token.token_type {
93                TokenType::Eof => {
94                    tokens.push(token);
95                    break;
96                }
97                TokenType::Whitespace => continue,
98                TokenType::LineComment | TokenType::BlockComment => {
99                    if self.preserve_comments {
100                        tokens.push(token);
101                    }
102                }
103                _ => {
104                    self.prev_token_type = Some(token.token_type.clone());
105                    tokens.push(token);
106                }
107            }
108        }
109        Ok(tokens)
110    }
111
112    fn peek(&self) -> Option<char> {
113        self.input.get(self.pos).copied()
114    }
115
116    fn peek_at(&self, offset: usize) -> Option<char> {
117        self.input.get(self.pos + offset).copied()
118    }
119
120    fn advance(&mut self) -> Option<char> {
121        let ch = self.input.get(self.pos).copied();
122        if let Some(c) = ch {
123            self.pos += 1;
124            if c == '\n' {
125                self.line += 1;
126                self.col = 1;
127            } else {
128                self.col += 1;
129            }
130        }
131        ch
132    }
133
134    fn make_token(
135        &self,
136        token_type: TokenType,
137        value: impl Into<String>,
138        start: usize,
139        start_line: usize,
140        start_col: usize,
141    ) -> Token {
142        Token::with_location(token_type, value, start, start_line, start_col)
143    }
144
145    fn next_token(&mut self) -> Result<Token> {
146        // Skip whitespace
147        while self.peek().is_some_and(|c| c.is_whitespace()) {
148            self.advance();
149        }
150
151        let start = self.pos;
152        let start_line = self.line;
153        let start_col = self.col;
154
155        let Some(ch) = self.advance() else {
156            return Ok(self.make_token(TokenType::Eof, "", start, start_line, start_col));
157        };
158
159        match ch {
160            // ── Punctuation ─────────────────────────────────────────
161            '(' => Ok(self.make_token(TokenType::LParen, "(", start, start_line, start_col)),
162            ')' => Ok(self.make_token(TokenType::RParen, ")", start, start_line, start_col)),
163            '[' => {
164                // Decide between two readings of `[`:
165                //   1. Bracket-quoted identifier (T-SQL / SQLite style): `[name]`,
166                //      `[#]`, `[1]`, `[User Link]`. Inner content may be anything
167                //      except `]` or newline.
168                //   2. Array subscript / element selector: `arr[1]`, `arr[1:5]`.
169                //
170                // Disambiguate on the previously emitted token: array subscript
171                // requires a subscriptable value on its left (closing paren /
172                // closing bracket / identifier / string / number). After
173                // statement-start, `AS`, `(`, `,`, operators, `BY`, etc. the
174                // bracket can only be a quoted identifier.
175                let prev_is_subscriptable = matches!(
176                    self.prev_token_type,
177                    Some(
178                        TokenType::Identifier
179                            | TokenType::RParen
180                            | TokenType::RBracket
181                            | TokenType::String
182                            | TokenType::Number
183                            // Type keywords commonly preceding array modifier `TYPE[N]`
184                            | TokenType::Int
185                            | TokenType::Integer
186                            | TokenType::BigInt
187                            | TokenType::SmallInt
188                            | TokenType::TinyInt
189                            | TokenType::Float
190                            | TokenType::Double
191                            | TokenType::Decimal
192                            | TokenType::Numeric
193                            | TokenType::Real
194                            | TokenType::Varchar
195                            | TokenType::Char
196                            | TokenType::Text
197                            | TokenType::Boolean
198                            | TokenType::Bool
199                            | TokenType::Date
200                            | TokenType::Timestamp
201                            | TokenType::TimestampTz
202                            | TokenType::Time
203                            | TokenType::Interval
204                            | TokenType::Blob
205                            | TokenType::Bytea
206                            | TokenType::Json
207                            | TokenType::Jsonb
208                            | TokenType::Uuid
209                            | TokenType::Array
210                            | TokenType::Map
211                            | TokenType::Struct
212                    )
213                );
214
215                let mut looks_like_ident = false;
216                // Always try bracketed-ident interpretation when there is a
217                // space inside before `]` (e.g. `id [User Link]` — implicit
218                // alias). Real array subscripts never contain a literal space.
219                let mut has_space_inside = false;
220                let mut has_operator_inside = false;
221                if prev_is_subscriptable {
222                    let mut scan = self.pos;
223                    while scan < self.input.len() {
224                        let c = self.input[scan];
225                        if c == ']' {
226                            break;
227                        }
228                        if c == '\n' || c == '[' || c == ',' {
229                            break;
230                        }
231                        if c == ' ' || c == '\t' {
232                            has_space_inside = true;
233                        }
234                        if matches!(
235                            c,
236                            '+' | '-' | '*' | '/' | '%' | '=' | '<' | '>' | '!' | '&' | '|' | '^'
237                        ) {
238                            has_operator_inside = true;
239                        }
240                        scan += 1;
241                    }
242                }
243                if !prev_is_subscriptable || (has_space_inside && !has_operator_inside) {
244                    let mut scan = self.pos;
245                    let mut saw_quote = false;
246                    while scan < self.input.len() {
247                        let c = self.input[scan];
248                        if c == ']' {
249                            // For ARRAY/typed subscripts, a `'` inside means
250                            // it's a string literal cast (`array['lit'::T]`),
251                            // not a bracket identifier. For non-subscriptable
252                            // contexts (TSQL `[user's name]`), accept quotes.
253                            looks_like_ident =
254                                scan > self.pos && (!prev_is_subscriptable || !saw_quote);
255                            break;
256                        }
257                        // `,` rules out `ARRAY[1,2,3]` style literals.
258                        if c == '\n' || c == '[' || c == ',' {
259                            break;
260                        }
261                        if c == '\'' {
262                            saw_quote = true;
263                        }
264                        scan += 1;
265                    }
266                }
267                if looks_like_ident {
268                    self.read_quoted_identifier(start, start_line, start_col, '[')
269                } else {
270                    Ok(self.make_token(TokenType::LBracket, "[", start, start_line, start_col))
271                }
272            }
273            ']' => Ok(self.make_token(TokenType::RBracket, "]", start, start_line, start_col)),
274            '{' => {
275                // ClickHouse parameter / typed placeholder `{name:Type}`.
276                // The name is identifier-like; the type may itself contain
277                // parens (e.g. `{ids:Array(UInt64)}`). Scan until the
278                // matching `}` and emit a single Parameter token; fall back
279                // to a plain `LBrace` otherwise.
280                if self.peek().is_some_and(is_identifier_start) {
281                    let mut i = 1usize;
282                    while self.peek_at(i).is_some_and(|c| is_identifier_continue(c)) {
283                        i += 1;
284                    }
285                    if self.peek_at(i) == Some(':') {
286                        let mut value = String::from('{');
287                        let mut depth = 0usize;
288                        loop {
289                            match self.peek() {
290                                None => break,
291                                Some('{') => {
292                                    depth += 1;
293                                    value.push('{');
294                                    self.advance();
295                                }
296                                Some('}') => {
297                                    if depth == 0 {
298                                        value.push('}');
299                                        self.advance();
300                                        return Ok(self.make_token(
301                                            TokenType::Parameter,
302                                            value,
303                                            start,
304                                            start_line,
305                                            start_col,
306                                        ));
307                                    }
308                                    depth -= 1;
309                                    value.push('}');
310                                    self.advance();
311                                }
312                                Some(c) => {
313                                    value.push(c);
314                                    self.advance();
315                                }
316                            }
317                        }
318                        return Err(SqlglotError::TokenizerError {
319                            message: "Unterminated parameter placeholder".into(),
320                            position: start,
321                        });
322                    }
323                }
324                Ok(self.make_token(TokenType::LBrace, "{", start, start_line, start_col))
325            }
326            '}' => Ok(self.make_token(TokenType::RBrace, "}", start, start_line, start_col)),
327            ',' => Ok(self.make_token(TokenType::Comma, ",", start, start_line, start_col)),
328            ';' => Ok(self.make_token(TokenType::Semicolon, ";", start, start_line, start_col)),
329            '.' => Ok(self.make_token(TokenType::Dot, ".", start, start_line, start_col)),
330            '+' => Ok(self.make_token(TokenType::Plus, "+", start, start_line, start_col)),
331            '~' => Ok(self.make_token(TokenType::BitwiseNot, "~", start, start_line, start_col)),
332            '@' => {
333                if self.peek() == Some('>') {
334                    self.advance();
335                    Ok(self.make_token(TokenType::AtArrow, "@>", start, start_line, start_col))
336                } else {
337                    Ok(self.make_token(TokenType::AtSign, "@", start, start_line, start_col))
338                }
339            }
340            '=' => Ok(self.make_token(TokenType::Eq, "=", start, start_line, start_col)),
341            '*' => Ok(self.make_token(TokenType::Star, "*", start, start_line, start_col)),
342            '%' => Ok(self.make_token(TokenType::Percent2, "%", start, start_line, start_col)),
343            '^' => Ok(self.make_token(TokenType::BitwiseXor, "^", start, start_line, start_col)),
344
345            // ── Colon ───────────────────────────────────────────────
346            ':' => {
347                if self.peek() == Some(':') {
348                    self.advance();
349                    Ok(self.make_token(TokenType::DoubleColon, "::", start, start_line, start_col))
350                } else {
351                    Ok(self.make_token(TokenType::Colon, ":", start, start_line, start_col))
352                }
353            }
354
355            // ── Minus / line comment / arrow ────────────────────────
356            '-' => {
357                if self.peek() == Some('-') {
358                    self.advance();
359                    let mut value = String::from("--");
360                    while self.peek().is_some_and(|c| c != '\n') {
361                        value.push(self.advance().unwrap());
362                    }
363                    Ok(
364                        self.make_token(
365                            TokenType::LineComment,
366                            value,
367                            start,
368                            start_line,
369                            start_col,
370                        ),
371                    )
372                } else if self.peek() == Some('>') {
373                    self.advance();
374                    if self.peek() == Some('>') {
375                        self.advance();
376                        Ok(self.make_token(
377                            TokenType::DoubleArrow,
378                            "->>",
379                            start,
380                            start_line,
381                            start_col,
382                        ))
383                    } else {
384                        Ok(self.make_token(TokenType::Arrow, "->", start, start_line, start_col))
385                    }
386                } else {
387                    Ok(self.make_token(TokenType::Minus, "-", start, start_line, start_col))
388                }
389            }
390
391            // ── Slash / block comment ───────────────────────────────
392            '/' => {
393                if self.peek() == Some('*') {
394                    self.advance();
395                    let mut value = String::from("/*");
396                    let mut depth = 1;
397                    while depth > 0 {
398                        match self.advance() {
399                            Some('*') if self.peek() == Some('/') => {
400                                self.advance();
401                                depth -= 1;
402                                value.push_str("*/");
403                            }
404                            Some('/') if self.peek() == Some('*') => {
405                                self.advance();
406                                depth += 1;
407                                value.push_str("/*");
408                            }
409                            Some(c) => value.push(c),
410                            None => {
411                                return Err(SqlglotError::TokenizerError {
412                                    message: "Unterminated block comment".into(),
413                                    position: start,
414                                });
415                            }
416                        }
417                    }
418                    Ok(self.make_token(
419                        TokenType::BlockComment,
420                        value,
421                        start,
422                        start_line,
423                        start_col,
424                    ))
425                } else {
426                    Ok(self.make_token(TokenType::Slash, "/", start, start_line, start_col))
427                }
428            }
429
430            // ── Less-than variants ──────────────────────────────────
431            '<' => {
432                if self.peek() == Some('=') {
433                    self.advance();
434                    Ok(self.make_token(TokenType::LtEq, "<=", start, start_line, start_col))
435                } else if self.peek() == Some('>') {
436                    self.advance();
437                    Ok(self.make_token(TokenType::Neq, "<>", start, start_line, start_col))
438                } else if self.peek() == Some('<') {
439                    self.advance();
440                    Ok(self.make_token(TokenType::ShiftLeft, "<<", start, start_line, start_col))
441                } else if self.peek() == Some('@') {
442                    self.advance();
443                    Ok(self.make_token(TokenType::ArrowAt, "<@", start, start_line, start_col))
444                } else {
445                    Ok(self.make_token(TokenType::Lt, "<", start, start_line, start_col))
446                }
447            }
448
449            // ── Greater-than variants ───────────────────────────────
450            '>' => {
451                if self.peek() == Some('=') {
452                    self.advance();
453                    Ok(self.make_token(TokenType::GtEq, ">=", start, start_line, start_col))
454                } else if self.peek() == Some('>') {
455                    self.advance();
456                    Ok(self.make_token(TokenType::ShiftRight, ">>", start, start_line, start_col))
457                } else {
458                    Ok(self.make_token(TokenType::Gt, ">", start, start_line, start_col))
459                }
460            }
461
462            // ── Bang ────────────────────────────────────────────────
463            '!' => {
464                if self.peek() == Some('=') {
465                    self.advance();
466                    Ok(self.make_token(TokenType::Neq, "!=", start, start_line, start_col))
467                } else {
468                    Err(SqlglotError::TokenizerError {
469                        message: format!("Unexpected character: {ch}"),
470                        position: start,
471                    })
472                }
473            }
474
475            // ── Pipe / BitwiseOr / Concat ───────────────────────────
476            '|' => {
477                if self.peek() == Some('|') {
478                    self.advance();
479                    Ok(self.make_token(TokenType::Concat, "||", start, start_line, start_col))
480                } else {
481                    Ok(self.make_token(TokenType::BitwiseOr, "|", start, start_line, start_col))
482                }
483            }
484
485            // ── Ampersand ───────────────────────────────────────────
486            '&' => Ok(self.make_token(TokenType::BitwiseAnd, "&", start, start_line, start_col)),
487
488            // ── Hash ────────────────────────────────────────────────
489            '#' => {
490                if self.peek() == Some('>') {
491                    self.advance();
492                    if self.peek() == Some('>') {
493                        self.advance();
494                        Ok(self.make_token(
495                            TokenType::HashDoubleArrow,
496                            "#>>",
497                            start,
498                            start_line,
499                            start_col,
500                        ))
501                    } else {
502                        Ok(self.make_token(
503                            TokenType::HashArrow,
504                            "#>",
505                            start,
506                            start_line,
507                            start_col,
508                        ))
509                    }
510                } else if self.peek() == Some('#') {
511                    // `##name##` — StackExchange Data Explorer style template
512                    // placeholder. Surface as a regular identifier so the
513                    // surrounding query parses. If we can't find a matching
514                    // closing `##` on the same line, fall through to the
515                    // line-comment behavior below.
516                    let save_pos = self.pos;
517                    let save_line = self.line;
518                    let save_col = self.col;
519                    self.advance(); // consume second `#`
520                    let inner_start = self.pos;
521                    let mut found_close = false;
522                    while let Some(c) = self.peek() {
523                        if c == '\n' {
524                            break;
525                        }
526                        if c == '#' && self.peek_at(1) == Some('#') {
527                            found_close = true;
528                            break;
529                        }
530                        self.advance();
531                    }
532                    if found_close {
533                        let value: String = self.input[inner_start..self.pos].iter().collect();
534                        self.advance(); // first closing `#`
535                        self.advance(); // second closing `#`
536                        return Ok(Token::with_quote(
537                            TokenType::Identifier,
538                            value,
539                            start,
540                            start_line,
541                            start_col,
542                            '#',
543                        ));
544                    }
545                    // Rewind and fall through to line-comment handling.
546                    self.pos = save_pos;
547                    self.line = save_line;
548                    self.col = save_col;
549                    let mut value = String::from("#");
550                    while self.peek().is_some_and(|c| c != '\n') {
551                        value.push(self.advance().unwrap());
552                    }
553                    Ok(
554                        self.make_token(
555                            TokenType::LineComment,
556                            value,
557                            start,
558                            start_line,
559                            start_col,
560                        ),
561                    )
562                } else if self.peek().is_some_and(|c| c.is_ascii_digit()) {
563                    // DuckDB `#N` positional column reference. Emit as a
564                    // Parameter so it parses inside expressions / ORDER BY.
565                    let mut value = String::from("#");
566                    while self.peek().is_some_and(|c| c.is_ascii_digit()) {
567                        value.push(self.advance().unwrap());
568                    }
569                    Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
570                } else {
571                    let mut value = String::from("#");
572                    while self.peek().is_some_and(|c| c != '\n') {
573                        value.push(self.advance().unwrap());
574                    }
575                    Ok(
576                        self.make_token(
577                            TokenType::LineComment,
578                            value,
579                            start,
580                            start_line,
581                            start_col,
582                        ),
583                    )
584                }
585            }
586
587            // ── String literals ─────────────────────────────────────
588            '\'' => self.read_string(start, start_line, start_col),
589
590            // ── Numbers ─────────────────────────────────────────────
591            c if c.is_ascii_digit() => self.read_number(start, start_line, start_col, c),
592
593            // ── Identifiers and keywords ────────────────────────────
594            c if is_identifier_start(c) => self.read_identifier(start, start_line, start_col, c),
595
596            // ── Quoted identifiers (double quote) ───────────────────
597            '"' => self.read_quoted_identifier(start, start_line, start_col, '"'),
598
599            // ── Backtick identifiers (MySQL, BigQuery) ──────────────
600            '`' => self.read_quoted_identifier(start, start_line, start_col, '`'),
601
602            // ── Parameter markers ───────────────────────────────────
603            '$' => {
604                // PostgreSQL dollar-quoted string literal: `$$body$$` or
605                // `$tag$body$tag$`. The tag is an optional identifier. We
606                // detect the opening sequence and scan to the matching
607                // closing sequence; the body may contain any characters.
608                if self.peek() == Some('$') {
609                    self.advance(); // closing $ of opening $$
610                    let mut value = String::new();
611                    while let Some(c) = self.peek() {
612                        if c == '$' && self.peek_at(1) == Some('$') {
613                            self.advance();
614                            self.advance();
615                            return Ok(self.make_token(
616                                TokenType::String,
617                                value,
618                                start,
619                                start_line,
620                                start_col,
621                            ));
622                        }
623                        value.push(self.advance().unwrap());
624                    }
625                    // Unterminated — fall back to the captured body as String.
626                    return Ok(self.make_token(
627                        TokenType::String,
628                        value,
629                        start,
630                        start_line,
631                        start_col,
632                    ));
633                }
634                // Speculative `$tag$ … $tag$` form. Only treat as a
635                // dollar-quote if the tokens after the tag actually form
636                // a valid closing sequence; otherwise fall through to
637                // the identifier / parameter handling below.
638                if self.peek().is_some_and(is_identifier_start) {
639                    let save_pos = self.pos;
640                    let save_line = self.line;
641                    let save_col = self.col;
642                    let mut tag = String::new();
643                    while self.peek().is_some_and(is_identifier_continue) {
644                        tag.push(self.advance().unwrap());
645                    }
646                    if self.peek() == Some('$') {
647                        self.advance();
648                        // Look ahead for matching `$tag$` close.
649                        let mut value = String::new();
650                        let mut closed = false;
651                        while let Some(c) = self.peek() {
652                            if c == '$' {
653                                // Test for the closing tag.
654                                let mut matched = true;
655                                for (i, ch) in tag.chars().enumerate() {
656                                    if self.peek_at(i + 1) != Some(ch) {
657                                        matched = false;
658                                        break;
659                                    }
660                                }
661                                if matched && self.peek_at(tag.len() + 1) == Some('$') {
662                                    // Consume `$tag$`.
663                                    for _ in 0..(tag.len() + 2) {
664                                        self.advance();
665                                    }
666                                    closed = true;
667                                    break;
668                                }
669                            }
670                            value.push(self.advance().unwrap());
671                        }
672                        if closed {
673                            return Ok(self.make_token(
674                                TokenType::String,
675                                value,
676                                start,
677                                start_line,
678                                start_col,
679                            ));
680                        }
681                    }
682                    // Not a dollar-quote; rewind and fall through to the
683                    // identifier path.
684                    self.pos = save_pos;
685                    self.line = save_line;
686                    self.col = save_col;
687                }
688                if self.peek() == Some('{') {
689                    // `${name}` template variable (DuckDB / shell-style). Consume
690                    // through the closing `}` and emit as a single Parameter token.
691                    let mut value = String::from("$");
692                    value.push(self.advance().unwrap()); // '{'
693                    while let Some(c) = self.peek() {
694                        value.push(self.advance().unwrap());
695                        if c == '}' {
696                            break;
697                        }
698                    }
699                    Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
700                } else if self.peek().is_some_and(|c| c.is_ascii_digit()) {
701                    let mut value = String::from("$");
702                    while self.peek().is_some_and(|c| c.is_ascii_digit()) {
703                        value.push(self.advance().unwrap());
704                    }
705                    Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
706                } else if self.peek().is_some_and(is_identifier_start) {
707                    // `$alias` / `$_`: identifier with a leading `$`. Appears
708                    // in auto-generated column names (e.g. `purse__$__`) and as
709                    // SELECT aliases (`AS $__`). PostgreSQL prepared-statement
710                    // parameters (`$1`, `$2`) keep the digits-only fast path
711                    // above; the `$<digit>` form cannot start an identifier so
712                    // there is no ambiguity.
713                    let mut value = String::from("$");
714                    while self.peek().is_some_and(is_identifier_continue) {
715                        value.push(self.advance().unwrap());
716                    }
717                    Ok(self.make_token(TokenType::Identifier, value, start, start_line, start_col))
718                } else {
719                    Ok(self.make_token(TokenType::Parameter, "$", start, start_line, start_col))
720                }
721            }
722
723            '?' => Ok(self.make_token(TokenType::Parameter, "?", start, start_line, start_col)),
724
725            _ => Err(SqlglotError::TokenizerError {
726                message: format!("Unexpected character: {ch}"),
727                position: start,
728            }),
729        }
730    }
731
732    fn read_string(&mut self, start: usize, start_line: usize, start_col: usize) -> Result<Token> {
733        let mut value = String::new();
734        loop {
735            match self.advance() {
736                Some('\'') => {
737                    if self.peek() == Some('\'') {
738                        self.advance();
739                        value.push('\'');
740                    } else {
741                        return Ok(self.make_token(
742                            TokenType::String,
743                            value,
744                            start,
745                            start_line,
746                            start_col,
747                        ));
748                    }
749                }
750                Some('\\') => match self.peek() {
751                    Some('\\') => {
752                        self.advance();
753                        value.push('\\');
754                    }
755                    Some('n') => {
756                        self.advance();
757                        value.push('\n');
758                    }
759                    Some('t') => {
760                        self.advance();
761                        value.push('\t');
762                    }
763                    Some('r') => {
764                        self.advance();
765                        value.push('\r');
766                    }
767                    Some('\'') => {
768                        self.advance();
769                        value.push('\'');
770                    }
771                    Some('"') => {
772                        self.advance();
773                        value.push('"');
774                    }
775                    Some('0') => {
776                        self.advance();
777                        value.push('\0');
778                    }
779                    Some('b') => {
780                        self.advance();
781                        value.push('\u{0008}');
782                    }
783                    Some('f') => {
784                        self.advance();
785                        value.push('\u{000C}');
786                    }
787                    Some('v') => {
788                        self.advance();
789                        value.push('\u{000B}');
790                    }
791                    Some('a') => {
792                        self.advance();
793                        value.push('\u{0007}');
794                    }
795                    Some(c) if c.is_ascii_alphanumeric() || c == '?' => {
796                        // Tolerate other escape sequences (e.g. ClickHouse
797                        // \xAA, \uXXXX, \?) by consuming the introducer
798                        // and keeping the literal character in the string.
799                        self.advance();
800                        value.push('\\');
801                        value.push(c);
802                    }
803                    _ => {
804                        value.push('\\');
805                    }
806                },
807                Some(c) => value.push(c),
808                None => {
809                    return Err(SqlglotError::TokenizerError {
810                        message: "Unterminated string literal".into(),
811                        position: start,
812                    });
813                }
814            }
815        }
816    }
817
818    fn read_number(
819        &mut self,
820        start: usize,
821        start_line: usize,
822        start_col: usize,
823        first: char,
824    ) -> Result<Token> {
825        let mut value = String::new();
826        value.push(first);
827
828        if first == '0' && self.peek().is_some_and(|c| c == 'x' || c == 'X') {
829            value.push(self.advance().unwrap());
830            while self
831                .peek()
832                .is_some_and(|c| c.is_ascii_hexdigit() || c == '_')
833            {
834                value.push(self.advance().unwrap());
835            }
836            // Optional binary-exponent suffix `pN` / `PN` for hex floats
837            // (`0x1p-1022`, `0x123p4`).
838            if self.peek().is_some_and(|c| c == 'p' || c == 'P') {
839                value.push(self.advance().unwrap());
840                if self.peek().is_some_and(|c| c == '+' || c == '-') {
841                    value.push(self.advance().unwrap());
842                }
843                while self.peek().is_some_and(|c| c.is_ascii_digit()) {
844                    value.push(self.advance().unwrap());
845                }
846            }
847            return Ok(self.make_token(TokenType::HexString, value, start, start_line, start_col));
848        }
849
850        while self.peek().is_some_and(|c| c.is_ascii_digit() || c == '_') {
851            value.push(self.advance().unwrap());
852        }
853
854        if self.peek() == Some('.')
855            && (self.peek_at(1).is_some_and(|c| c.is_ascii_digit())
856                || !self.peek_at(1).is_some_and(is_identifier_start))
857        {
858            value.push(self.advance().unwrap());
859            while self.peek().is_some_and(|c| c.is_ascii_digit() || c == '_') {
860                value.push(self.advance().unwrap());
861            }
862        }
863
864        if self.peek().is_some_and(|c| c == 'e' || c == 'E') {
865            value.push(self.advance().unwrap());
866            if self.peek().is_some_and(|c| c == '+' || c == '-') {
867                value.push(self.advance().unwrap());
868            }
869            while self.peek().is_some_and(|c| c.is_ascii_digit()) {
870                value.push(self.advance().unwrap());
871            }
872        }
873
874        // ClickHouse / Hive allow identifiers that start with digits
875        // (`03720_test_alter`, `1_table`). If the run of digits is butted
876        // directly against an identifier-continue character, treat the
877        // whole span as an identifier.
878        if !value.contains('.')
879            && !value.contains('e')
880            && !value.contains('E')
881            && self.peek().is_some_and(is_identifier_continue)
882        {
883            while self.peek().is_some_and(is_identifier_continue) {
884                value.push(self.advance().unwrap());
885            }
886            let token_type = Self::keyword_type(&value);
887            return Ok(self.make_token(token_type, value, start, start_line, start_col));
888        }
889
890        Ok(self.make_token(TokenType::Number, value, start, start_line, start_col))
891    }
892
893    fn read_identifier(
894        &mut self,
895        start: usize,
896        start_line: usize,
897        start_col: usize,
898        first: char,
899    ) -> Result<Token> {
900        let mut value = String::new();
901        value.push(first);
902        while self.peek().is_some_and(is_identifier_continue) {
903            // Don't swallow a `$` that starts a template variable
904            // (`${name}`) or a numbered parameter (`$1`) — those need to
905            // tokenize as their own Parameter token.
906            if self.peek() == Some('$') {
907                let next = self.peek_at(1);
908                if matches!(next, Some('{')) || next.is_some_and(|c| c.is_ascii_digit()) {
909                    break;
910                }
911            }
912            value.push(self.advance().unwrap());
913        }
914
915        // Phase 1 support: treat N'...' / n'...' as a string literal token.
916        // This unblocks Oracle/TSQL national string parsing without AST changes.
917        if value.len() == 1
918            && value
919                .as_bytes()
920                .first()
921                .is_some_and(|b| b.eq_ignore_ascii_case(&b'n'))
922            && self.peek() == Some('\'')
923        {
924            self.advance(); // consume opening quote
925            let mut token = self.read_string(start, start_line, start_col)?;
926            token.token_type = TokenType::NationalString;
927            return Ok(token);
928        }
929
930        // PostgreSQL / SQL standard string-literal prefixes:
931        //   E'...'  — escape string (backslash escapes processed)
932        //   B'...'  — bit string
933        //   X'...'  — hex / byte string
934        //   U&'...' — Unicode escape string (we accept the prefix and string;
935        //             the trailing `UESCAPE 'x'` clause is parser-side noise)
936        // Each prefix tokenizes as a single-char identifier; merge with the
937        // following `'...'` literal into a String token so the SQL parses.
938        if value.len() == 1
939            && value
940                .as_bytes()
941                .first()
942                .is_some_and(|b| matches!(b.to_ascii_uppercase(), b'E' | b'B' | b'X'))
943            && self.peek() == Some('\'')
944        {
945            self.advance();
946            return self.read_string(start, start_line, start_col);
947        }
948        // U&'...' — Unicode escape literal.
949        if value.len() == 1
950            && value
951                .as_bytes()
952                .first()
953                .is_some_and(|b| b.eq_ignore_ascii_case(&b'u'))
954            && self.peek() == Some('&')
955            && self.peek_at(1) == Some('\'')
956        {
957            self.advance(); // &
958            self.advance(); // '
959            return self.read_string(start, start_line, start_col);
960        }
961
962        let token_type = Self::keyword_type(&value);
963        Ok(self.make_token(token_type, value, start, start_line, start_col))
964    }
965
966    /// Map a word to its keyword token type, or `Identifier` if not a keyword.
967    fn keyword_type(word: &str) -> TokenType {
968        match word.to_uppercase().as_str() {
969            "SELECT" => TokenType::Select,
970            "FROM" => TokenType::From,
971            "WHERE" => TokenType::Where,
972            "AND" => TokenType::And,
973            "OR" => TokenType::Or,
974            "NOT" => TokenType::Not,
975            "AS" => TokenType::As,
976            "JOIN" => TokenType::Join,
977            "INNER" => TokenType::Inner,
978            "LEFT" => TokenType::Left,
979            "RIGHT" => TokenType::Right,
980            "FULL" => TokenType::Full,
981            "OUTER" => TokenType::Outer,
982            "CROSS" => TokenType::Cross,
983            "ON" => TokenType::On,
984            "INSERT" => TokenType::Insert,
985            "INTO" => TokenType::Into,
986            "VALUES" => TokenType::Values,
987            "UPDATE" => TokenType::Update,
988            "SET" => TokenType::Set,
989            "DELETE" => TokenType::Delete,
990            "CREATE" => TokenType::Create,
991            "TABLE" => TokenType::Table,
992            "DROP" => TokenType::Drop,
993            "ALTER" => TokenType::Alter,
994            "INDEX" => TokenType::Index,
995            "IF" => TokenType::If,
996            "EXISTS" => TokenType::Exists,
997            "IN" => TokenType::In,
998            "IS" => TokenType::Is,
999            "NULL" => TokenType::Null,
1000            "LIKE" => TokenType::Like,
1001            "ILIKE" => TokenType::ILike,
1002            "ESCAPE" => TokenType::Escape,
1003            "BETWEEN" => TokenType::Between,
1004            "CASE" => TokenType::Case,
1005            "WHEN" => TokenType::When,
1006            "THEN" => TokenType::Then,
1007            "ELSE" => TokenType::Else,
1008            "END" => TokenType::End,
1009            "ORDER" => TokenType::Order,
1010            "BY" => TokenType::By,
1011            "ASC" => TokenType::Asc,
1012            "DESC" => TokenType::Desc,
1013            "GROUP" => TokenType::Group,
1014            "HAVING" => TokenType::Having,
1015            "LIMIT" => TokenType::Limit,
1016            "OFFSET" => TokenType::Offset,
1017            "UNION" => TokenType::Union,
1018            "ALL" => TokenType::All,
1019            "DISTINCT" => TokenType::Distinct,
1020            "TRUE" => TokenType::True,
1021            "FALSE" => TokenType::False,
1022            "INTERSECT" => TokenType::Intersect,
1023            "EXCEPT" => TokenType::Except,
1024            "WITH" => TokenType::With,
1025            "RECURSIVE" => TokenType::Recursive,
1026            "ANY" => TokenType::Any,
1027            "SOME" => TokenType::Some,
1028            "CAST" => TokenType::Cast,
1029            "OVER" => TokenType::Over,
1030            "PARTITION" => TokenType::Partition,
1031            "WINDOW" => TokenType::Window,
1032            "ROWS" => TokenType::Rows,
1033            "RANGE" => TokenType::Range,
1034            "UNBOUNDED" => TokenType::Unbounded,
1035            "PRECEDING" => TokenType::Preceding,
1036            "FOLLOWING" => TokenType::Following,
1037            "FILTER" => TokenType::Filter,
1038            "INT" => TokenType::Int,
1039            "INTEGER" => TokenType::Integer,
1040            "BIGINT" => TokenType::BigInt,
1041            "SMALLINT" => TokenType::SmallInt,
1042            "TINYINT" => TokenType::TinyInt,
1043            "FLOAT" => TokenType::Float,
1044            "DOUBLE" => TokenType::Double,
1045            "DECIMAL" => TokenType::Decimal,
1046            "NUMERIC" => TokenType::Numeric,
1047            "REAL" => TokenType::Real,
1048            "VARCHAR" => TokenType::Varchar,
1049            "CHAR" | "CHARACTER" => TokenType::Char,
1050            "TEXT" => TokenType::Text,
1051            "BOOLEAN" | "BOOL" => TokenType::Boolean,
1052            "DATE" => TokenType::Date,
1053            "TIMESTAMP" => TokenType::Timestamp,
1054            "TIMESTAMPTZ" => TokenType::TimestampTz,
1055            "TIME" => TokenType::Time,
1056            "INTERVAL" => TokenType::Interval,
1057            "BLOB" => TokenType::Blob,
1058            "BYTEA" => TokenType::Bytea,
1059            "JSON" => TokenType::Json,
1060            "JSONB" => TokenType::Jsonb,
1061            "UUID" => TokenType::Uuid,
1062            "ARRAY" => TokenType::Array,
1063            "MAP" => TokenType::Map,
1064            "STRUCT" => TokenType::Struct,
1065            "PRIMARY" => TokenType::Primary,
1066            "KEY" => TokenType::Key,
1067            "FOREIGN" => TokenType::Foreign,
1068            "REFERENCES" => TokenType::References,
1069            "UNIQUE" => TokenType::Unique,
1070            "CHECK" => TokenType::Check,
1071            "DEFAULT" => TokenType::Default,
1072            "CONSTRAINT" => TokenType::Constraint,
1073            "AUTO_INCREMENT" | "AUTOINCREMENT" => TokenType::AutoIncrement,
1074            "CASCADE" => TokenType::Cascade,
1075            "RESTRICT" => TokenType::Restrict,
1076            "RETURNING" => TokenType::Returning,
1077            "CONFLICT" => TokenType::Conflict,
1078            "DO" => TokenType::Do,
1079            "NOTHING" => TokenType::Nothing,
1080            "REPLACE" => TokenType::Replace,
1081            "IGNORE" => TokenType::Ignore,
1082            "MERGE" => TokenType::Merge,
1083            "MATCHED" => TokenType::Matched,
1084            "USING" => TokenType::Using,
1085            "TRUNCATE" => TokenType::Truncate,
1086            "SCHEMA" => TokenType::Schema,
1087            "DATABASE" => TokenType::Database,
1088            "VIEW" => TokenType::View,
1089            "MATERIALIZED" => TokenType::Materialized,
1090            "TEMPORARY" => TokenType::Temporary,
1091            "TEMP" => TokenType::Temp,
1092            "BEGIN" => TokenType::Begin,
1093            "COMMIT" => TokenType::Commit,
1094            "ROLLBACK" => TokenType::Rollback,
1095            "SAVEPOINT" => TokenType::Savepoint,
1096            "TRANSACTION" => TokenType::Transaction,
1097            "EXPLAIN" => TokenType::Explain,
1098            "ANALYZE" => TokenType::Analyze,
1099            "SHOW" => TokenType::Show,
1100            "USE" => TokenType::Use,
1101            "GRANT" => TokenType::Grant,
1102            "REVOKE" => TokenType::Revoke,
1103            "LATERAL" => TokenType::Lateral,
1104            "UNNEST" => TokenType::Unnest,
1105            "PIVOT" => TokenType::Pivot,
1106            "UNPIVOT" => TokenType::Unpivot,
1107            "TABLESAMPLE" => TokenType::Tablesample,
1108            "FETCH" => TokenType::Fetch,
1109            "FIRST" => TokenType::First,
1110            "NEXT" => TokenType::Next,
1111            "ONLY" => TokenType::Only,
1112            "NULLS" => TokenType::Nulls,
1113            "RESPECT" => TokenType::Respect,
1114            "TOP" => TokenType::Top,
1115            "COLLATE" => TokenType::Collate,
1116            "QUALIFY" => TokenType::Qualify,
1117            "CUBE" => TokenType::Cube,
1118            "ROLLUP" => TokenType::Rollup,
1119            "GROUPING" => TokenType::Grouping,
1120            "SETS" => TokenType::Sets,
1121            "XOR" => TokenType::Xor,
1122            "EXTRACT" => TokenType::Extract,
1123            "EPOCH" => TokenType::Epoch,
1124            "YEAR" => TokenType::Year,
1125            "MONTH" => TokenType::Month,
1126            "DAY" => TokenType::Day,
1127            "HOUR" => TokenType::Hour,
1128            "MINUTE" => TokenType::Minute,
1129            "SECOND" => TokenType::Second,
1130            _ => TokenType::Identifier,
1131        }
1132    }
1133
1134    fn read_quoted_identifier(
1135        &mut self,
1136        start: usize,
1137        start_line: usize,
1138        start_col: usize,
1139        quote: char,
1140    ) -> Result<Token> {
1141        let end_char = if quote == '[' { ']' } else { quote };
1142        let mut value = String::new();
1143        loop {
1144            match self.advance() {
1145                Some(c) if c == end_char => {
1146                    if self.peek() == Some(end_char) && end_char != ']' {
1147                        self.advance();
1148                        value.push(end_char);
1149                    } else {
1150                        return Ok(Token::with_quote(
1151                            TokenType::Identifier,
1152                            value,
1153                            start,
1154                            start_line,
1155                            start_col,
1156                            quote,
1157                        ));
1158                    }
1159                }
1160                Some(c) => value.push(c),
1161                None => {
1162                    return Err(SqlglotError::TokenizerError {
1163                        message: format!("Unterminated quoted identifier (expected {end_char})"),
1164                        position: start,
1165                    });
1166                }
1167            }
1168        }
1169    }
1170}
1171
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `sql` and checks that its first token is a national string
    /// literal whose value equals `expected`.
    fn assert_national_string(sql: &str, expected: &str) {
        let mut lexer = Tokenizer::new(sql);
        let toks = lexer.tokenize().unwrap();
        assert_eq!(toks[0].token_type, TokenType::NationalString);
        assert_eq!(toks[0].value, expected);
    }

    /// Tokenizes `sql` and checks that the token at index 1 is a plain
    /// identifier spelled `expected`.
    fn assert_second_token_is_identifier(sql: &str, expected: &str) {
        let mut lexer = Tokenizer::new(sql);
        let toks = lexer.tokenize().unwrap();
        assert_eq!(toks[1].token_type, TokenType::Identifier);
        assert_eq!(toks[1].value, expected);
    }

    #[test]
    fn test_tokenize_simple_select() {
        let mut lexer = Tokenizer::new("SELECT a, b FROM t");
        let toks = lexer.tokenize().unwrap();

        // Expected token kind per position, with the expected text where the
        // token is an identifier.
        let expected = [
            (TokenType::Select, None),
            (TokenType::Identifier, Some("a")),
            (TokenType::Comma, None),
            (TokenType::Identifier, Some("b")),
            (TokenType::From, None),
            (TokenType::Identifier, Some("t")),
            (TokenType::Eof, None),
        ];
        for (i, (kind, text)) in expected.iter().enumerate() {
            assert_eq!(&toks[i].token_type, kind);
            if let Some(text) = text {
                assert_eq!(toks[i].value, *text);
            }
        }
    }

    #[test]
    fn test_tokenize_string_literal() {
        let mut lexer = Tokenizer::new("'hello world'");
        let toks = lexer.tokenize().unwrap();
        let literal = &toks[0];
        assert_eq!(literal.token_type, TokenType::String);
        assert_eq!(literal.value, "hello world");
    }

    #[test]
    fn test_tokenize_operators() {
        let mut lexer = Tokenizer::new("a >= 1 AND b != 2");
        let toks = lexer.tokenize().unwrap();
        let expected = [
            (1, TokenType::GtEq),
            (3, TokenType::And),
            (5, TokenType::Neq),
        ];
        for (i, kind) in expected.iter() {
            assert_eq!(&toks[*i].token_type, kind);
        }
    }

    #[test]
    fn test_tokenize_number() {
        let mut lexer = Tokenizer::new("123.45");
        let toks = lexer.tokenize().unwrap();
        let number = &toks[0];
        assert_eq!(number.token_type, TokenType::Number);
        assert_eq!(number.value, "123.45");
    }

    #[test]
    fn test_tokenize_line_comment() {
        let mut lexer = Tokenizer::with_comments("SELECT 1 -- comment\nFROM t");
        let toks = lexer.tokenize().unwrap();
        let has_line_comment = toks
            .iter()
            .any(|t| t.token_type == TokenType::LineComment);
        assert!(has_line_comment);
    }

    #[test]
    fn test_tokenize_block_comment() {
        let mut lexer = Tokenizer::with_comments("SELECT /* hello */ 1");
        let toks = lexer.tokenize().unwrap();
        let has_block_comment = toks
            .iter()
            .any(|t| t.token_type == TokenType::BlockComment);
        assert!(has_block_comment);
    }

    #[test]
    fn test_tokenize_cte_keywords() {
        let mut lexer = Tokenizer::new("WITH cte AS (SELECT 1) SELECT * FROM cte");
        let toks = lexer.tokenize().unwrap();
        assert_eq!(toks[0].token_type, TokenType::With);
        assert_eq!(toks[2].token_type, TokenType::As);
    }

    #[test]
    fn test_tokenize_double_colon() {
        let mut lexer = Tokenizer::new("x::int");
        let toks = lexer.tokenize().unwrap();
        assert_eq!(toks[1].token_type, TokenType::DoubleColon);
    }

    #[test]
    fn test_tokenize_cast() {
        let mut lexer = Tokenizer::new("CAST(x AS INT)");
        let toks = lexer.tokenize().unwrap();
        assert_eq!(toks[0].token_type, TokenType::Cast);
    }

    #[test]
    fn test_tokenize_window() {
        let mut lexer = Tokenizer::new("ROW_NUMBER() OVER (PARTITION BY id ORDER BY name)");
        let toks = lexer.tokenize().unwrap();
        let contains = |kind: TokenType| toks.iter().any(|t| t.token_type == kind);
        assert!(contains(TokenType::Over));
        assert!(contains(TokenType::Partition));
    }

    #[test]
    fn test_line_tracking() {
        let mut lexer = Tokenizer::new("SELECT\n  1");
        let toks = lexer.tokenize().unwrap();
        assert_eq!(toks[0].line, 1);
        assert_eq!(toks[1].line, 2);
    }

    #[test]
    fn test_tokenize_union_intersect_except() {
        let mut lexer = Tokenizer::new("UNION INTERSECT EXCEPT");
        let toks = lexer.tokenize().unwrap();
        let expected = [TokenType::Union, TokenType::Intersect, TokenType::Except];
        for (i, kind) in expected.iter().enumerate() {
            assert_eq!(&toks[i].token_type, kind);
        }
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_uppercase() {
        assert_national_string("N'Hello'", "Hello");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_lowercase() {
        assert_national_string("n'hello'", "hello");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_escaped_quote() {
        assert_national_string("N'can''t stop'", "can't stop");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_unicode() {
        assert_national_string("N'テスト'", "テスト");
    }

    #[test]
    fn test_tokenize_identifier_n_without_quote() {
        // A bare `N` with no following quote stays an identifier.
        assert_second_token_is_identifier("SELECT N FROM t", "N");
    }

    #[test]
    fn test_tokenize_identifier_name_starting_with_n() {
        // A longer word that merely starts with `N` is unaffected by the
        // national-string prefix handling.
        assert_second_token_is_identifier("SELECT NAME FROM t", "NAME");
    }
}