Skip to main content

sqlglot_rust/tokens/
tokenizer.rs

1use crate::errors::{Result, SqlglotError};
2use crate::tokens::{Token, TokenType};
3
/// Returns `true` when `c` may open a regular (unquoted) identifier.
///
/// The underscore and every Unicode alphabetic character qualify, following
/// SQL:2003 §5.2 (PostgreSQL/MySQL/SQLite/Oracle/ClickHouse all accept Unicode
/// letters in regular identifiers). Digits and `$` are rejected here.
#[inline]
fn is_identifier_start(c: char) -> bool {
    match c {
        '_' => true,
        other => other.is_alphabetic(),
    }
}
11
/// Returns `true` when `c` may appear after the first character of a regular
/// (unquoted) identifier.
///
/// Accepts `_`, `$` and every Unicode alphanumeric character
/// (MySQL/Oracle/SQL Server/SQLite all permit `$` inside identifiers after
/// the first character).
#[inline]
fn is_identifier_continue(c: char) -> bool {
    match c {
        '_' | '$' => true,
        other => other.is_alphanumeric(),
    }
}
19
/// SQL tokenizer that converts a SQL string into a stream of tokens.
///
/// Tracks line and column numbers for error reporting. Supports:
/// - Single-line comments (`--` and `#`)
/// - Block comments (`/* ... */`), including nested ones
/// - Quoted identifiers (`"..."`, backtick, and T-SQL style `[...]`)
/// - String literals with escape handling
/// - Multi-character operators (`<=`, `>=`, `<>`, `!=`, `||`, `::`, `->`, `->>`)
#[derive(Debug, Clone)]
pub struct Tokenizer {
    /// The input, decoded into characters so positions are character indices.
    input: Vec<char>,
    /// Index into `input` of the next unread character.
    pos: usize,
    /// Line of the next unread character (starts at 1).
    line: usize,
    /// Column of the next unread character (starts at 1, reset after `\n`).
    col: usize,
    /// Whether to preserve comments as tokens.
    pub preserve_comments: bool,
}
36
37impl Tokenizer {
38    /// Create a new tokenizer for the given SQL input.
39    #[must_use]
40    pub fn new(input: &str) -> Self {
41        Self {
42            input: input.chars().collect(),
43            pos: 0,
44            line: 1,
45            col: 1,
46            preserve_comments: false,
47        }
48    }
49
50    /// Create a tokenizer that preserves comment tokens.
51    #[must_use]
52    pub fn with_comments(input: &str) -> Self {
53        Self {
54            input: input.chars().collect(),
55            pos: 0,
56            line: 1,
57            col: 1,
58            preserve_comments: true,
59        }
60    }
61
62    /// Tokenize the entire input and return a vector of tokens.
63    ///
64    /// Whitespace tokens are skipped. Comments are optionally preserved.
65    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
66        let mut tokens = Vec::new();
67        loop {
68            let token = self.next_token()?;
69            match token.token_type {
70                TokenType::Eof => {
71                    tokens.push(token);
72                    break;
73                }
74                TokenType::Whitespace => continue,
75                TokenType::LineComment | TokenType::BlockComment => {
76                    if self.preserve_comments {
77                        tokens.push(token);
78                    }
79                }
80                _ => tokens.push(token),
81            }
82        }
83        Ok(tokens)
84    }
85
86    fn peek(&self) -> Option<char> {
87        self.input.get(self.pos).copied()
88    }
89
90    fn peek_at(&self, offset: usize) -> Option<char> {
91        self.input.get(self.pos + offset).copied()
92    }
93
94    fn advance(&mut self) -> Option<char> {
95        let ch = self.input.get(self.pos).copied();
96        if let Some(c) = ch {
97            self.pos += 1;
98            if c == '\n' {
99                self.line += 1;
100                self.col = 1;
101            } else {
102                self.col += 1;
103            }
104        }
105        ch
106    }
107
108    fn make_token(
109        &self,
110        token_type: TokenType,
111        value: impl Into<String>,
112        start: usize,
113        start_line: usize,
114        start_col: usize,
115    ) -> Token {
116        Token::with_location(token_type, value, start, start_line, start_col)
117    }
118
119    fn next_token(&mut self) -> Result<Token> {
120        // Skip whitespace
121        while self.peek().is_some_and(|c| c.is_whitespace()) {
122            self.advance();
123        }
124
125        let start = self.pos;
126        let start_line = self.line;
127        let start_col = self.col;
128
129        let Some(ch) = self.advance() else {
130            return Ok(self.make_token(TokenType::Eof, "", start, start_line, start_col));
131        };
132
133        match ch {
134            // ── Punctuation ─────────────────────────────────────────
135            '(' => Ok(self.make_token(TokenType::LParen, "(", start, start_line, start_col)),
136            ')' => Ok(self.make_token(TokenType::RParen, ")", start, start_line, start_col)),
137            '[' => {
138                // Check if this is a bracket-quoted identifier (T-SQL style: [identifier])
139                // Only treat as quoted identifier if the content between [ and ] looks like
140                // an identifier (starts with a letter or underscore, no commas inside).
141                let mut looks_like_ident = false;
142                if let Some(first_inner) = self.peek()
143                    && (first_inner.is_ascii_alphabetic() || first_inner == '_')
144                {
145                    let mut scan = self.pos;
146                    while scan < self.input.len() {
147                        if self.input[scan] == ']' {
148                            looks_like_ident = scan > self.pos;
149                            break;
150                        }
151                        if self.input[scan] == ',' || self.input[scan] == '\n' {
152                            break;
153                        }
154                        scan += 1;
155                    }
156                }
157                if looks_like_ident {
158                    self.read_quoted_identifier(start, start_line, start_col, '[')
159                } else {
160                    Ok(self.make_token(TokenType::LBracket, "[", start, start_line, start_col))
161                }
162            }
163            ']' => Ok(self.make_token(TokenType::RBracket, "]", start, start_line, start_col)),
164            '{' => Ok(self.make_token(TokenType::LBrace, "{", start, start_line, start_col)),
165            '}' => Ok(self.make_token(TokenType::RBrace, "}", start, start_line, start_col)),
166            ',' => Ok(self.make_token(TokenType::Comma, ",", start, start_line, start_col)),
167            ';' => Ok(self.make_token(TokenType::Semicolon, ";", start, start_line, start_col)),
168            '.' => Ok(self.make_token(TokenType::Dot, ".", start, start_line, start_col)),
169            '+' => Ok(self.make_token(TokenType::Plus, "+", start, start_line, start_col)),
170            '~' => Ok(self.make_token(TokenType::BitwiseNot, "~", start, start_line, start_col)),
171            '@' => {
172                if self.peek() == Some('>') {
173                    self.advance();
174                    Ok(self.make_token(TokenType::AtArrow, "@>", start, start_line, start_col))
175                } else {
176                    Ok(self.make_token(TokenType::AtSign, "@", start, start_line, start_col))
177                }
178            }
179            '=' => Ok(self.make_token(TokenType::Eq, "=", start, start_line, start_col)),
180            '*' => Ok(self.make_token(TokenType::Star, "*", start, start_line, start_col)),
181            '%' => Ok(self.make_token(TokenType::Percent2, "%", start, start_line, start_col)),
182            '^' => Ok(self.make_token(TokenType::BitwiseXor, "^", start, start_line, start_col)),
183
184            // ── Colon ───────────────────────────────────────────────
185            ':' => {
186                if self.peek() == Some(':') {
187                    self.advance();
188                    Ok(self.make_token(TokenType::DoubleColon, "::", start, start_line, start_col))
189                } else {
190                    Ok(self.make_token(TokenType::Colon, ":", start, start_line, start_col))
191                }
192            }
193
194            // ── Minus / line comment / arrow ────────────────────────
195            '-' => {
196                if self.peek() == Some('-') {
197                    self.advance();
198                    let mut value = String::from("--");
199                    while self.peek().is_some_and(|c| c != '\n') {
200                        value.push(self.advance().unwrap());
201                    }
202                    Ok(
203                        self.make_token(
204                            TokenType::LineComment,
205                            value,
206                            start,
207                            start_line,
208                            start_col,
209                        ),
210                    )
211                } else if self.peek() == Some('>') {
212                    self.advance();
213                    if self.peek() == Some('>') {
214                        self.advance();
215                        Ok(self.make_token(
216                            TokenType::DoubleArrow,
217                            "->>",
218                            start,
219                            start_line,
220                            start_col,
221                        ))
222                    } else {
223                        Ok(self.make_token(TokenType::Arrow, "->", start, start_line, start_col))
224                    }
225                } else {
226                    Ok(self.make_token(TokenType::Minus, "-", start, start_line, start_col))
227                }
228            }
229
230            // ── Slash / block comment ───────────────────────────────
231            '/' => {
232                if self.peek() == Some('*') {
233                    self.advance();
234                    let mut value = String::from("/*");
235                    let mut depth = 1;
236                    while depth > 0 {
237                        match self.advance() {
238                            Some('*') if self.peek() == Some('/') => {
239                                self.advance();
240                                depth -= 1;
241                                value.push_str("*/");
242                            }
243                            Some('/') if self.peek() == Some('*') => {
244                                self.advance();
245                                depth += 1;
246                                value.push_str("/*");
247                            }
248                            Some(c) => value.push(c),
249                            None => {
250                                return Err(SqlglotError::TokenizerError {
251                                    message: "Unterminated block comment".into(),
252                                    position: start,
253                                });
254                            }
255                        }
256                    }
257                    Ok(self.make_token(
258                        TokenType::BlockComment,
259                        value,
260                        start,
261                        start_line,
262                        start_col,
263                    ))
264                } else {
265                    Ok(self.make_token(TokenType::Slash, "/", start, start_line, start_col))
266                }
267            }
268
269            // ── Less-than variants ──────────────────────────────────
270            '<' => {
271                if self.peek() == Some('=') {
272                    self.advance();
273                    Ok(self.make_token(TokenType::LtEq, "<=", start, start_line, start_col))
274                } else if self.peek() == Some('>') {
275                    self.advance();
276                    Ok(self.make_token(TokenType::Neq, "<>", start, start_line, start_col))
277                } else if self.peek() == Some('<') {
278                    self.advance();
279                    Ok(self.make_token(TokenType::ShiftLeft, "<<", start, start_line, start_col))                } else if self.peek() == Some('@') {
280                    self.advance();
281                    Ok(self.make_token(TokenType::ArrowAt, "<@", start, start_line, start_col))                } else {
282                    Ok(self.make_token(TokenType::Lt, "<", start, start_line, start_col))
283                }
284            }
285
286            // ── Greater-than variants ───────────────────────────────
287            '>' => {
288                if self.peek() == Some('=') {
289                    self.advance();
290                    Ok(self.make_token(TokenType::GtEq, ">=", start, start_line, start_col))
291                } else if self.peek() == Some('>') {
292                    self.advance();
293                    Ok(self.make_token(TokenType::ShiftRight, ">>", start, start_line, start_col))
294                } else {
295                    Ok(self.make_token(TokenType::Gt, ">", start, start_line, start_col))
296                }
297            }
298
299            // ── Bang ────────────────────────────────────────────────
300            '!' => {
301                if self.peek() == Some('=') {
302                    self.advance();
303                    Ok(self.make_token(TokenType::Neq, "!=", start, start_line, start_col))
304                } else {
305                    Err(SqlglotError::TokenizerError {
306                        message: format!("Unexpected character: {ch}"),
307                        position: start,
308                    })
309                }
310            }
311
312            // ── Pipe / BitwiseOr / Concat ───────────────────────────
313            '|' => {
314                if self.peek() == Some('|') {
315                    self.advance();
316                    Ok(self.make_token(TokenType::Concat, "||", start, start_line, start_col))
317                } else {
318                    Ok(self.make_token(TokenType::BitwiseOr, "|", start, start_line, start_col))
319                }
320            }
321
322            // ── Ampersand ───────────────────────────────────────────
323            '&' => Ok(self.make_token(TokenType::BitwiseAnd, "&", start, start_line, start_col)),
324
325            // ── Hash ────────────────────────────────────────────────
326            '#' => {
327                if self.peek() == Some('>') {
328                    self.advance();
329                    if self.peek() == Some('>') {
330                        self.advance();
331                        Ok(self.make_token(
332                            TokenType::HashDoubleArrow,
333                            "#>>",
334                            start,
335                            start_line,
336                            start_col,
337                        ))
338                    } else {
339                        Ok(self.make_token(
340                            TokenType::HashArrow,
341                            "#>",
342                            start,
343                            start_line,
344                            start_col,
345                        ))
346                    }
347                } else {
348                    let mut value = String::from("#");
349                    while self.peek().is_some_and(|c| c != '\n') {
350                        value.push(self.advance().unwrap());
351                    }
352                    Ok(
353                        self.make_token(
354                            TokenType::LineComment,
355                            value,
356                            start,
357                            start_line,
358                            start_col,
359                        ),
360                    )
361                }
362            }
363
364            // ── String literals ─────────────────────────────────────
365            '\'' => self.read_string(start, start_line, start_col),
366
367            // ── Numbers ─────────────────────────────────────────────
368            c if c.is_ascii_digit() => self.read_number(start, start_line, start_col, c),
369
370            // ── Identifiers and keywords ────────────────────────────
371            c if is_identifier_start(c) => {
372                self.read_identifier(start, start_line, start_col, c)
373            }
374
375            // ── Quoted identifiers (double quote) ───────────────────
376            '"' => self.read_quoted_identifier(start, start_line, start_col, '"'),
377
378            // ── Backtick identifiers (MySQL, BigQuery) ──────────────
379            '`' => self.read_quoted_identifier(start, start_line, start_col, '`'),
380
381            // ── Parameter markers ───────────────────────────────────
382            '$' => {
383                if self.peek().is_some_and(|c| c.is_ascii_digit()) {
384                    let mut value = String::from("$");
385                    while self.peek().is_some_and(|c| c.is_ascii_digit()) {
386                        value.push(self.advance().unwrap());
387                    }
388                    Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
389                } else {
390                    Ok(self.make_token(TokenType::Parameter, "$", start, start_line, start_col))
391                }
392            }
393
394            '?' => Ok(self.make_token(TokenType::Parameter, "?", start, start_line, start_col)),
395
396            _ => Err(SqlglotError::TokenizerError {
397                message: format!("Unexpected character: {ch}"),
398                position: start,
399            }),
400        }
401    }
402
403    fn read_string(&mut self, start: usize, start_line: usize, start_col: usize) -> Result<Token> {
404        let mut value = String::new();
405        loop {
406            match self.advance() {
407                Some('\'') => {
408                    if self.peek() == Some('\'') {
409                        self.advance();
410                        value.push('\'');
411                    } else {
412                        return Ok(self.make_token(
413                            TokenType::String,
414                            value,
415                            start,
416                            start_line,
417                            start_col,
418                        ));
419                    }
420                }
421                Some('\\') => match self.peek() {
422                    Some('\\') => {
423                        self.advance();
424                        value.push('\\');
425                    }
426                    Some('n') => {
427                        self.advance();
428                        value.push('\n');
429                    }
430                    Some('t') => {
431                        self.advance();
432                        value.push('\t');
433                    }
434                    Some('r') => {
435                        self.advance();
436                        value.push('\r');
437                    }
438                    _ => {
439                        value.push('\\');
440                    }
441                },
442                Some(c) => value.push(c),
443                None => {
444                    return Err(SqlglotError::TokenizerError {
445                        message: "Unterminated string literal".into(),
446                        position: start,
447                    });
448                }
449            }
450        }
451    }
452
453    fn read_number(
454        &mut self,
455        start: usize,
456        start_line: usize,
457        start_col: usize,
458        first: char,
459    ) -> Result<Token> {
460        let mut value = String::new();
461        value.push(first);
462
463        if first == '0' && self.peek().is_some_and(|c| c == 'x' || c == 'X') {
464            value.push(self.advance().unwrap());
465            while self.peek().is_some_and(|c| c.is_ascii_hexdigit()) {
466                value.push(self.advance().unwrap());
467            }
468            return Ok(self.make_token(TokenType::HexString, value, start, start_line, start_col));
469        }
470
471        while self.peek().is_some_and(|c| c.is_ascii_digit()) {
472            value.push(self.advance().unwrap());
473        }
474
475        if self.peek() == Some('.') && self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) {
476            value.push(self.advance().unwrap());
477            while self.peek().is_some_and(|c| c.is_ascii_digit()) {
478                value.push(self.advance().unwrap());
479            }
480        }
481
482        if self.peek().is_some_and(|c| c == 'e' || c == 'E') {
483            value.push(self.advance().unwrap());
484            if self.peek().is_some_and(|c| c == '+' || c == '-') {
485                value.push(self.advance().unwrap());
486            }
487            while self.peek().is_some_and(|c| c.is_ascii_digit()) {
488                value.push(self.advance().unwrap());
489            }
490        }
491
492        Ok(self.make_token(TokenType::Number, value, start, start_line, start_col))
493    }
494
495    fn read_identifier(
496        &mut self,
497        start: usize,
498        start_line: usize,
499        start_col: usize,
500        first: char,
501    ) -> Result<Token> {
502        let mut value = String::new();
503        value.push(first);
504        while self
505            .peek()
506            .is_some_and(is_identifier_continue)
507        {
508            value.push(self.advance().unwrap());
509        }
510
511        // Phase 1 support: treat N'...' / n'...' as a string literal token.
512        // This unblocks Oracle/TSQL national string parsing without AST changes.
513        if value.len() == 1
514            && value
515                .as_bytes()
516                .first()
517                .is_some_and(|b| b.eq_ignore_ascii_case(&b'n'))
518            && self.peek() == Some('\'')
519        {
520            self.advance(); // consume opening quote
521            let mut token = self.read_string(start, start_line, start_col)?;
522            token.token_type = TokenType::NationalString;
523            return Ok(token);
524        }
525
526        let token_type = Self::keyword_type(&value);
527        Ok(self.make_token(token_type, value, start, start_line, start_col))
528    }
529
530    /// Map a word to its keyword token type, or `Identifier` if not a keyword.
531    fn keyword_type(word: &str) -> TokenType {
532        match word.to_uppercase().as_str() {
533            "SELECT" => TokenType::Select,
534            "FROM" => TokenType::From,
535            "WHERE" => TokenType::Where,
536            "AND" => TokenType::And,
537            "OR" => TokenType::Or,
538            "NOT" => TokenType::Not,
539            "AS" => TokenType::As,
540            "JOIN" => TokenType::Join,
541            "INNER" => TokenType::Inner,
542            "LEFT" => TokenType::Left,
543            "RIGHT" => TokenType::Right,
544            "FULL" => TokenType::Full,
545            "OUTER" => TokenType::Outer,
546            "CROSS" => TokenType::Cross,
547            "ON" => TokenType::On,
548            "INSERT" => TokenType::Insert,
549            "INTO" => TokenType::Into,
550            "VALUES" => TokenType::Values,
551            "UPDATE" => TokenType::Update,
552            "SET" => TokenType::Set,
553            "DELETE" => TokenType::Delete,
554            "CREATE" => TokenType::Create,
555            "TABLE" => TokenType::Table,
556            "DROP" => TokenType::Drop,
557            "ALTER" => TokenType::Alter,
558            "INDEX" => TokenType::Index,
559            "IF" => TokenType::If,
560            "EXISTS" => TokenType::Exists,
561            "IN" => TokenType::In,
562            "IS" => TokenType::Is,
563            "NULL" => TokenType::Null,
564            "LIKE" => TokenType::Like,
565            "ILIKE" => TokenType::ILike,
566            "ESCAPE" => TokenType::Escape,
567            "BETWEEN" => TokenType::Between,
568            "CASE" => TokenType::Case,
569            "WHEN" => TokenType::When,
570            "THEN" => TokenType::Then,
571            "ELSE" => TokenType::Else,
572            "END" => TokenType::End,
573            "ORDER" => TokenType::Order,
574            "BY" => TokenType::By,
575            "ASC" => TokenType::Asc,
576            "DESC" => TokenType::Desc,
577            "GROUP" => TokenType::Group,
578            "HAVING" => TokenType::Having,
579            "LIMIT" => TokenType::Limit,
580            "OFFSET" => TokenType::Offset,
581            "UNION" => TokenType::Union,
582            "ALL" => TokenType::All,
583            "DISTINCT" => TokenType::Distinct,
584            "TRUE" => TokenType::True,
585            "FALSE" => TokenType::False,
586            "INTERSECT" => TokenType::Intersect,
587            "EXCEPT" => TokenType::Except,
588            "WITH" => TokenType::With,
589            "RECURSIVE" => TokenType::Recursive,
590            "ANY" => TokenType::Any,
591            "SOME" => TokenType::Some,
592            "CAST" => TokenType::Cast,
593            "OVER" => TokenType::Over,
594            "PARTITION" => TokenType::Partition,
595            "WINDOW" => TokenType::Window,
596            "ROWS" => TokenType::Rows,
597            "RANGE" => TokenType::Range,
598            "UNBOUNDED" => TokenType::Unbounded,
599            "PRECEDING" => TokenType::Preceding,
600            "FOLLOWING" => TokenType::Following,
601            "FILTER" => TokenType::Filter,
602            "INT" => TokenType::Int,
603            "INTEGER" => TokenType::Integer,
604            "BIGINT" => TokenType::BigInt,
605            "SMALLINT" => TokenType::SmallInt,
606            "TINYINT" => TokenType::TinyInt,
607            "FLOAT" => TokenType::Float,
608            "DOUBLE" => TokenType::Double,
609            "DECIMAL" => TokenType::Decimal,
610            "NUMERIC" => TokenType::Numeric,
611            "REAL" => TokenType::Real,
612            "VARCHAR" => TokenType::Varchar,
613            "CHAR" | "CHARACTER" => TokenType::Char,
614            "TEXT" => TokenType::Text,
615            "BOOLEAN" | "BOOL" => TokenType::Boolean,
616            "DATE" => TokenType::Date,
617            "TIMESTAMP" => TokenType::Timestamp,
618            "TIMESTAMPTZ" => TokenType::TimestampTz,
619            "TIME" => TokenType::Time,
620            "INTERVAL" => TokenType::Interval,
621            "BLOB" => TokenType::Blob,
622            "BYTEA" => TokenType::Bytea,
623            "JSON" => TokenType::Json,
624            "JSONB" => TokenType::Jsonb,
625            "UUID" => TokenType::Uuid,
626            "ARRAY" => TokenType::Array,
627            "MAP" => TokenType::Map,
628            "STRUCT" => TokenType::Struct,
629            "PRIMARY" => TokenType::Primary,
630            "KEY" => TokenType::Key,
631            "FOREIGN" => TokenType::Foreign,
632            "REFERENCES" => TokenType::References,
633            "UNIQUE" => TokenType::Unique,
634            "CHECK" => TokenType::Check,
635            "DEFAULT" => TokenType::Default,
636            "CONSTRAINT" => TokenType::Constraint,
637            "AUTO_INCREMENT" | "AUTOINCREMENT" => TokenType::AutoIncrement,
638            "CASCADE" => TokenType::Cascade,
639            "RESTRICT" => TokenType::Restrict,
640            "RETURNING" => TokenType::Returning,
641            "CONFLICT" => TokenType::Conflict,
642            "DO" => TokenType::Do,
643            "NOTHING" => TokenType::Nothing,
644            "REPLACE" => TokenType::Replace,
645            "IGNORE" => TokenType::Ignore,
646            "MERGE" => TokenType::Merge,
647            "MATCHED" => TokenType::Matched,
648            "USING" => TokenType::Using,
649            "TRUNCATE" => TokenType::Truncate,
650            "SCHEMA" => TokenType::Schema,
651            "DATABASE" => TokenType::Database,
652            "VIEW" => TokenType::View,
653            "MATERIALIZED" => TokenType::Materialized,
654            "TEMPORARY" => TokenType::Temporary,
655            "TEMP" => TokenType::Temp,
656            "BEGIN" => TokenType::Begin,
657            "COMMIT" => TokenType::Commit,
658            "ROLLBACK" => TokenType::Rollback,
659            "SAVEPOINT" => TokenType::Savepoint,
660            "TRANSACTION" => TokenType::Transaction,
661            "EXPLAIN" => TokenType::Explain,
662            "ANALYZE" => TokenType::Analyze,
663            "SHOW" => TokenType::Show,
664            "USE" => TokenType::Use,
665            "GRANT" => TokenType::Grant,
666            "REVOKE" => TokenType::Revoke,
667            "LATERAL" => TokenType::Lateral,
668            "UNNEST" => TokenType::Unnest,
669            "PIVOT" => TokenType::Pivot,
670            "UNPIVOT" => TokenType::Unpivot,
671            "TABLESAMPLE" => TokenType::Tablesample,
672            "FETCH" => TokenType::Fetch,
673            "FIRST" => TokenType::First,
674            "NEXT" => TokenType::Next,
675            "ONLY" => TokenType::Only,
676            "NULLS" => TokenType::Nulls,
677            "RESPECT" => TokenType::Respect,
678            "TOP" => TokenType::Top,
679            "COLLATE" => TokenType::Collate,
680            "QUALIFY" => TokenType::Qualify,
681            "CUBE" => TokenType::Cube,
682            "ROLLUP" => TokenType::Rollup,
683            "GROUPING" => TokenType::Grouping,
684            "SETS" => TokenType::Sets,
685            "XOR" => TokenType::Xor,
686            "EXTRACT" => TokenType::Extract,
687            "EPOCH" => TokenType::Epoch,
688            "YEAR" => TokenType::Year,
689            "MONTH" => TokenType::Month,
690            "DAY" => TokenType::Day,
691            "HOUR" => TokenType::Hour,
692            "MINUTE" => TokenType::Minute,
693            "SECOND" => TokenType::Second,
694            _ => TokenType::Identifier,
695        }
696    }
697
698    fn read_quoted_identifier(
699        &mut self,
700        start: usize,
701        start_line: usize,
702        start_col: usize,
703        quote: char,
704    ) -> Result<Token> {
705        let end_char = if quote == '[' { ']' } else { quote };
706        let mut value = String::new();
707        loop {
708            match self.advance() {
709                Some(c) if c == end_char => {
710                    if self.peek() == Some(end_char) && end_char != ']' {
711                        self.advance();
712                        value.push(end_char);
713                    } else {
714                        return Ok(Token::with_quote(
715                            TokenType::Identifier,
716                            value,
717                            start,
718                            start_line,
719                            start_col,
720                            quote,
721                        ));
722                    }
723                }
724                Some(c) => value.push(c),
725                None => {
726                    return Err(SqlglotError::TokenizerError {
727                        message: format!("Unterminated quoted identifier (expected {end_char})"),
728                        position: start,
729                    });
730                }
731            }
732        }
733    }
734}
735
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the default tokenizer over `sql`, panicking if tokenization fails.
    fn lex(sql: &str) -> Vec<Token> {
        Tokenizer::new(sql).tokenize().unwrap()
    }

    /// Runs the tokenizer built by `Tokenizer::with_comments` over `sql`,
    /// panicking if tokenization fails.
    fn lex_keeping_comments(sql: &str) -> Vec<Token> {
        Tokenizer::with_comments(sql).tokenize().unwrap()
    }

    /// Asserts that `token` has the given type and textual value.
    #[track_caller]
    fn expect_token(token: &Token, kind: TokenType, text: &str) {
        assert_eq!(token.token_type, kind);
        assert_eq!(token.value, text);
    }

    /// Returns `true` when at least one token in `tokens` has type `kind`.
    fn has_kind(tokens: &[Token], kind: TokenType) -> bool {
        tokens.iter().any(|t| t.token_type == kind)
    }

    // --- Basic statements and literals -------------------------------------

    #[test]
    fn test_tokenize_simple_select() {
        let tokens = lex("SELECT a, b FROM t");

        // Token kinds, in stream order (the stream ends with an EOF marker).
        let expected_kinds = [
            TokenType::Select,
            TokenType::Identifier,
            TokenType::Comma,
            TokenType::Identifier,
            TokenType::From,
            TokenType::Identifier,
            TokenType::Eof,
        ];
        for (index, kind) in expected_kinds.iter().enumerate() {
            assert_eq!(&tokens[index].token_type, kind);
        }

        // Identifier payloads.
        assert_eq!(tokens[1].value, "a");
        assert_eq!(tokens[3].value, "b");
        assert_eq!(tokens[5].value, "t");
    }

    #[test]
    fn test_tokenize_string_literal() {
        let tokens = lex("'hello world'");
        expect_token(&tokens[0], TokenType::String, "hello world");
    }

    #[test]
    fn test_tokenize_operators() {
        let tokens = lex("a >= 1 AND b != 2");
        assert_eq!(tokens[1].token_type, TokenType::GtEq);
        assert_eq!(tokens[3].token_type, TokenType::And);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }

    #[test]
    fn test_tokenize_number() {
        let tokens = lex("123.45");
        expect_token(&tokens[0], TokenType::Number, "123.45");
    }

    // --- Comments -----------------------------------------------------------

    #[test]
    fn test_tokenize_line_comment() {
        let tokens = lex_keeping_comments("SELECT 1 -- comment\nFROM t");
        assert!(has_kind(&tokens, TokenType::LineComment));
    }

    #[test]
    fn test_tokenize_block_comment() {
        let tokens = lex_keeping_comments("SELECT /* hello */ 1");
        assert!(has_kind(&tokens, TokenType::BlockComment));
    }

    // --- Keywords and multi-character operators -----------------------------

    #[test]
    fn test_tokenize_cte_keywords() {
        let tokens = lex("WITH cte AS (SELECT 1) SELECT * FROM cte");
        assert_eq!(tokens[0].token_type, TokenType::With);
        assert_eq!(tokens[2].token_type, TokenType::As);
    }

    #[test]
    fn test_tokenize_double_colon() {
        let tokens = lex("x::int");
        assert_eq!(tokens[1].token_type, TokenType::DoubleColon);
    }

    #[test]
    fn test_tokenize_cast() {
        let tokens = lex("CAST(x AS INT)");
        assert_eq!(tokens[0].token_type, TokenType::Cast);
    }

    #[test]
    fn test_tokenize_window() {
        let tokens = lex("ROW_NUMBER() OVER (PARTITION BY id ORDER BY name)");
        assert!(has_kind(&tokens, TokenType::Over));
        assert!(has_kind(&tokens, TokenType::Partition));
    }

    // --- Position tracking --------------------------------------------------

    #[test]
    fn test_line_tracking() {
        // The newline after SELECT moves the following token to line 2.
        let tokens = lex("SELECT\n  1");
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[1].line, 2);
    }

    #[test]
    fn test_tokenize_union_intersect_except() {
        let tokens = lex("UNION INTERSECT EXCEPT");
        assert_eq!(tokens[0].token_type, TokenType::Union);
        assert_eq!(tokens[1].token_type, TokenType::Intersect);
        assert_eq!(tokens[2].token_type, TokenType::Except);
    }

    // --- N-prefixed (national) string literals ------------------------------

    #[test]
    fn test_tokenize_n_prefixed_string_literal_uppercase() {
        let tokens = lex("N'Hello'");
        expect_token(&tokens[0], TokenType::NationalString, "Hello");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_lowercase() {
        let tokens = lex("n'hello'");
        expect_token(&tokens[0], TokenType::NationalString, "hello");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_escaped_quote() {
        // A doubled single quote inside the literal yields one quote character.
        let tokens = lex("N'can''t stop'");
        expect_token(&tokens[0], TokenType::NationalString, "can't stop");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_unicode() {
        let tokens = lex("N'テスト'");
        expect_token(&tokens[0], TokenType::NationalString, "テスト");
    }

    // --- `N` must remain an ordinary identifier when no quote follows --------

    #[test]
    fn test_tokenize_identifier_n_without_quote() {
        let tokens = lex("SELECT N FROM t");
        expect_token(&tokens[1], TokenType::Identifier, "N");
    }

    #[test]
    fn test_tokenize_identifier_name_starting_with_n() {
        let tokens = lex("SELECT NAME FROM t");
        expect_token(&tokens[1], TokenType::Identifier, "NAME");
    }
}