Skip to main content

sqlglot_rust/tokens/
tokenizer.rs

1use crate::errors::{Result, SqlglotError};
2use crate::tokens::{Token, TokenType};
3
/// SQL tokenizer that converts a SQL string into a stream of tokens.
///
/// Tracks line and column numbers for error reporting. Supports:
/// - Single-line comments (`--` and `#`)
/// - Block comments (`/* ... */`, which may nest)
/// - Quoted identifiers (`"..."`, backticks and T-SQL style `[...]`)
/// - String literals with escape handling, including `N'...'` national strings
/// - Numbers (integers, decimals, exponents) and `0x...` hex literals
/// - Parameter markers (`?`, `$`, `$1`)
/// - Multi-character operators (`<=`, `>=`, `<>`, `!=`, `||`, `::`, `->`, `->>`,
///   `<<`, `>>`, `#>`, `#>>`)
#[derive(Debug, Clone)]
pub struct Tokenizer {
    /// The input, stored as `char`s so that positions are character offsets.
    input: Vec<char>,
    /// Index into `input` of the next character to be read.
    pos: usize,
    /// 1-based line number of the next character to be read.
    line: usize,
    /// 1-based column number of the next character to be read.
    col: usize,
    /// Whether to preserve comments as tokens.
    pub preserve_comments: bool,
}
20
21impl Tokenizer {
22    /// Create a new tokenizer for the given SQL input.
23    #[must_use]
24    pub fn new(input: &str) -> Self {
25        Self {
26            input: input.chars().collect(),
27            pos: 0,
28            line: 1,
29            col: 1,
30            preserve_comments: false,
31        }
32    }
33
34    /// Create a tokenizer that preserves comment tokens.
35    #[must_use]
36    pub fn with_comments(input: &str) -> Self {
37        Self {
38            input: input.chars().collect(),
39            pos: 0,
40            line: 1,
41            col: 1,
42            preserve_comments: true,
43        }
44    }
45
46    /// Tokenize the entire input and return a vector of tokens.
47    ///
48    /// Whitespace tokens are skipped. Comments are optionally preserved.
49    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
50        let mut tokens = Vec::new();
51        loop {
52            let token = self.next_token()?;
53            match token.token_type {
54                TokenType::Eof => {
55                    tokens.push(token);
56                    break;
57                }
58                TokenType::Whitespace => continue,
59                TokenType::LineComment | TokenType::BlockComment => {
60                    if self.preserve_comments {
61                        tokens.push(token);
62                    }
63                }
64                _ => tokens.push(token),
65            }
66        }
67        Ok(tokens)
68    }
69
70    fn peek(&self) -> Option<char> {
71        self.input.get(self.pos).copied()
72    }
73
74    fn peek_at(&self, offset: usize) -> Option<char> {
75        self.input.get(self.pos + offset).copied()
76    }
77
78    fn advance(&mut self) -> Option<char> {
79        let ch = self.input.get(self.pos).copied();
80        if let Some(c) = ch {
81            self.pos += 1;
82            if c == '\n' {
83                self.line += 1;
84                self.col = 1;
85            } else {
86                self.col += 1;
87            }
88        }
89        ch
90    }
91
92    fn make_token(
93        &self,
94        token_type: TokenType,
95        value: impl Into<String>,
96        start: usize,
97        start_line: usize,
98        start_col: usize,
99    ) -> Token {
100        Token::with_location(token_type, value, start, start_line, start_col)
101    }
102
103    fn next_token(&mut self) -> Result<Token> {
104        // Skip whitespace
105        while self.peek().is_some_and(|c| c.is_whitespace()) {
106            self.advance();
107        }
108
109        let start = self.pos;
110        let start_line = self.line;
111        let start_col = self.col;
112
113        let Some(ch) = self.advance() else {
114            return Ok(self.make_token(TokenType::Eof, "", start, start_line, start_col));
115        };
116
117        match ch {
118            // ── Punctuation ─────────────────────────────────────────
119            '(' => Ok(self.make_token(TokenType::LParen, "(", start, start_line, start_col)),
120            ')' => Ok(self.make_token(TokenType::RParen, ")", start, start_line, start_col)),
121            '[' => {
122                // Check if this is a bracket-quoted identifier (T-SQL style: [identifier])
123                // Only treat as quoted identifier if the content between [ and ] looks like
124                // an identifier (starts with a letter or underscore, no commas inside).
125                let mut looks_like_ident = false;
126                if let Some(first_inner) = self.peek()
127                    && (first_inner.is_ascii_alphabetic() || first_inner == '_')
128                {
129                    let mut scan = self.pos;
130                    while scan < self.input.len() {
131                        if self.input[scan] == ']' {
132                            looks_like_ident = scan > self.pos;
133                            break;
134                        }
135                        if self.input[scan] == ',' || self.input[scan] == '\n' {
136                            break;
137                        }
138                        scan += 1;
139                    }
140                }
141                if looks_like_ident {
142                    self.read_quoted_identifier(start, start_line, start_col, '[')
143                } else {
144                    Ok(self.make_token(TokenType::LBracket, "[", start, start_line, start_col))
145                }
146            }
147            ']' => Ok(self.make_token(TokenType::RBracket, "]", start, start_line, start_col)),
148            '{' => Ok(self.make_token(TokenType::LBrace, "{", start, start_line, start_col)),
149            '}' => Ok(self.make_token(TokenType::RBrace, "}", start, start_line, start_col)),
150            ',' => Ok(self.make_token(TokenType::Comma, ",", start, start_line, start_col)),
151            ';' => Ok(self.make_token(TokenType::Semicolon, ";", start, start_line, start_col)),
152            '.' => Ok(self.make_token(TokenType::Dot, ".", start, start_line, start_col)),
153            '+' => Ok(self.make_token(TokenType::Plus, "+", start, start_line, start_col)),
154            '~' => Ok(self.make_token(TokenType::BitwiseNot, "~", start, start_line, start_col)),
155            '@' => Ok(self.make_token(TokenType::AtSign, "@", start, start_line, start_col)),
156            '=' => Ok(self.make_token(TokenType::Eq, "=", start, start_line, start_col)),
157            '*' => Ok(self.make_token(TokenType::Star, "*", start, start_line, start_col)),
158            '%' => Ok(self.make_token(TokenType::Percent2, "%", start, start_line, start_col)),
159            '^' => Ok(self.make_token(TokenType::BitwiseXor, "^", start, start_line, start_col)),
160
161            // ── Colon ───────────────────────────────────────────────
162            ':' => {
163                if self.peek() == Some(':') {
164                    self.advance();
165                    Ok(self.make_token(TokenType::DoubleColon, "::", start, start_line, start_col))
166                } else {
167                    Ok(self.make_token(TokenType::Colon, ":", start, start_line, start_col))
168                }
169            }
170
171            // ── Minus / line comment / arrow ────────────────────────
172            '-' => {
173                if self.peek() == Some('-') {
174                    self.advance();
175                    let mut value = String::from("--");
176                    while self.peek().is_some_and(|c| c != '\n') {
177                        value.push(self.advance().unwrap());
178                    }
179                    Ok(
180                        self.make_token(
181                            TokenType::LineComment,
182                            value,
183                            start,
184                            start_line,
185                            start_col,
186                        ),
187                    )
188                } else if self.peek() == Some('>') {
189                    self.advance();
190                    if self.peek() == Some('>') {
191                        self.advance();
192                        Ok(self.make_token(
193                            TokenType::DoubleArrow,
194                            "->>",
195                            start,
196                            start_line,
197                            start_col,
198                        ))
199                    } else {
200                        Ok(self.make_token(TokenType::Arrow, "->", start, start_line, start_col))
201                    }
202                } else {
203                    Ok(self.make_token(TokenType::Minus, "-", start, start_line, start_col))
204                }
205            }
206
207            // ── Slash / block comment ───────────────────────────────
208            '/' => {
209                if self.peek() == Some('*') {
210                    self.advance();
211                    let mut value = String::from("/*");
212                    let mut depth = 1;
213                    while depth > 0 {
214                        match self.advance() {
215                            Some('*') if self.peek() == Some('/') => {
216                                self.advance();
217                                depth -= 1;
218                                value.push_str("*/");
219                            }
220                            Some('/') if self.peek() == Some('*') => {
221                                self.advance();
222                                depth += 1;
223                                value.push_str("/*");
224                            }
225                            Some(c) => value.push(c),
226                            None => {
227                                return Err(SqlglotError::TokenizerError {
228                                    message: "Unterminated block comment".into(),
229                                    position: start,
230                                });
231                            }
232                        }
233                    }
234                    Ok(self.make_token(
235                        TokenType::BlockComment,
236                        value,
237                        start,
238                        start_line,
239                        start_col,
240                    ))
241                } else {
242                    Ok(self.make_token(TokenType::Slash, "/", start, start_line, start_col))
243                }
244            }
245
246            // ── Less-than variants ──────────────────────────────────
247            '<' => {
248                if self.peek() == Some('=') {
249                    self.advance();
250                    Ok(self.make_token(TokenType::LtEq, "<=", start, start_line, start_col))
251                } else if self.peek() == Some('>') {
252                    self.advance();
253                    Ok(self.make_token(TokenType::Neq, "<>", start, start_line, start_col))
254                } else if self.peek() == Some('<') {
255                    self.advance();
256                    Ok(self.make_token(TokenType::ShiftLeft, "<<", start, start_line, start_col))
257                } else {
258                    Ok(self.make_token(TokenType::Lt, "<", start, start_line, start_col))
259                }
260            }
261
262            // ── Greater-than variants ───────────────────────────────
263            '>' => {
264                if self.peek() == Some('=') {
265                    self.advance();
266                    Ok(self.make_token(TokenType::GtEq, ">=", start, start_line, start_col))
267                } else if self.peek() == Some('>') {
268                    self.advance();
269                    Ok(self.make_token(TokenType::ShiftRight, ">>", start, start_line, start_col))
270                } else {
271                    Ok(self.make_token(TokenType::Gt, ">", start, start_line, start_col))
272                }
273            }
274
275            // ── Bang ────────────────────────────────────────────────
276            '!' => {
277                if self.peek() == Some('=') {
278                    self.advance();
279                    Ok(self.make_token(TokenType::Neq, "!=", start, start_line, start_col))
280                } else {
281                    Err(SqlglotError::TokenizerError {
282                        message: format!("Unexpected character: {ch}"),
283                        position: start,
284                    })
285                }
286            }
287
288            // ── Pipe / BitwiseOr / Concat ───────────────────────────
289            '|' => {
290                if self.peek() == Some('|') {
291                    self.advance();
292                    Ok(self.make_token(TokenType::Concat, "||", start, start_line, start_col))
293                } else {
294                    Ok(self.make_token(TokenType::BitwiseOr, "|", start, start_line, start_col))
295                }
296            }
297
298            // ── Ampersand ───────────────────────────────────────────
299            '&' => Ok(self.make_token(TokenType::BitwiseAnd, "&", start, start_line, start_col)),
300
301            // ── Hash ────────────────────────────────────────────────
302            '#' => {
303                if self.peek() == Some('>') {
304                    self.advance();
305                    if self.peek() == Some('>') {
306                        self.advance();
307                        Ok(self.make_token(
308                            TokenType::HashDoubleArrow,
309                            "#>>",
310                            start,
311                            start_line,
312                            start_col,
313                        ))
314                    } else {
315                        Ok(self.make_token(
316                            TokenType::HashArrow,
317                            "#>",
318                            start,
319                            start_line,
320                            start_col,
321                        ))
322                    }
323                } else {
324                    let mut value = String::from("#");
325                    while self.peek().is_some_and(|c| c != '\n') {
326                        value.push(self.advance().unwrap());
327                    }
328                    Ok(
329                        self.make_token(
330                            TokenType::LineComment,
331                            value,
332                            start,
333                            start_line,
334                            start_col,
335                        ),
336                    )
337                }
338            }
339
340            // ── String literals ─────────────────────────────────────
341            '\'' => self.read_string(start, start_line, start_col),
342
343            // ── Numbers ─────────────────────────────────────────────
344            c if c.is_ascii_digit() => self.read_number(start, start_line, start_col, c),
345
346            // ── Identifiers and keywords ────────────────────────────
347            c if c.is_ascii_alphabetic() || c == '_' => {
348                self.read_identifier(start, start_line, start_col, c)
349            }
350
351            // ── Quoted identifiers (double quote) ───────────────────
352            '"' => self.read_quoted_identifier(start, start_line, start_col, '"'),
353
354            // ── Backtick identifiers (MySQL, BigQuery) ──────────────
355            '`' => self.read_quoted_identifier(start, start_line, start_col, '`'),
356
357            // ── Parameter markers ───────────────────────────────────
358            '$' => {
359                if self.peek().is_some_and(|c| c.is_ascii_digit()) {
360                    let mut value = String::from("$");
361                    while self.peek().is_some_and(|c| c.is_ascii_digit()) {
362                        value.push(self.advance().unwrap());
363                    }
364                    Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
365                } else {
366                    Ok(self.make_token(TokenType::Parameter, "$", start, start_line, start_col))
367                }
368            }
369
370            '?' => Ok(self.make_token(TokenType::Parameter, "?", start, start_line, start_col)),
371
372            _ => Err(SqlglotError::TokenizerError {
373                message: format!("Unexpected character: {ch}"),
374                position: start,
375            }),
376        }
377    }
378
379    fn read_string(&mut self, start: usize, start_line: usize, start_col: usize) -> Result<Token> {
380        let mut value = String::new();
381        loop {
382            match self.advance() {
383                Some('\'') => {
384                    if self.peek() == Some('\'') {
385                        self.advance();
386                        value.push('\'');
387                    } else {
388                        return Ok(self.make_token(
389                            TokenType::String,
390                            value,
391                            start,
392                            start_line,
393                            start_col,
394                        ));
395                    }
396                }
397                Some('\\') => match self.peek() {
398                    Some('\\') => {
399                        self.advance();
400                        value.push('\\');
401                    }
402                    Some('n') => {
403                        self.advance();
404                        value.push('\n');
405                    }
406                    Some('t') => {
407                        self.advance();
408                        value.push('\t');
409                    }
410                    Some('r') => {
411                        self.advance();
412                        value.push('\r');
413                    }
414                    _ => {
415                        value.push('\\');
416                    }
417                },
418                Some(c) => value.push(c),
419                None => {
420                    return Err(SqlglotError::TokenizerError {
421                        message: "Unterminated string literal".into(),
422                        position: start,
423                    });
424                }
425            }
426        }
427    }
428
429    fn read_number(
430        &mut self,
431        start: usize,
432        start_line: usize,
433        start_col: usize,
434        first: char,
435    ) -> Result<Token> {
436        let mut value = String::new();
437        value.push(first);
438
439        if first == '0' && self.peek().is_some_and(|c| c == 'x' || c == 'X') {
440            value.push(self.advance().unwrap());
441            while self.peek().is_some_and(|c| c.is_ascii_hexdigit()) {
442                value.push(self.advance().unwrap());
443            }
444            return Ok(self.make_token(TokenType::HexString, value, start, start_line, start_col));
445        }
446
447        while self.peek().is_some_and(|c| c.is_ascii_digit()) {
448            value.push(self.advance().unwrap());
449        }
450
451        if self.peek() == Some('.') && self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) {
452            value.push(self.advance().unwrap());
453            while self.peek().is_some_and(|c| c.is_ascii_digit()) {
454                value.push(self.advance().unwrap());
455            }
456        }
457
458        if self.peek().is_some_and(|c| c == 'e' || c == 'E') {
459            value.push(self.advance().unwrap());
460            if self.peek().is_some_and(|c| c == '+' || c == '-') {
461                value.push(self.advance().unwrap());
462            }
463            while self.peek().is_some_and(|c| c.is_ascii_digit()) {
464                value.push(self.advance().unwrap());
465            }
466        }
467
468        Ok(self.make_token(TokenType::Number, value, start, start_line, start_col))
469    }
470
471    fn read_identifier(
472        &mut self,
473        start: usize,
474        start_line: usize,
475        start_col: usize,
476        first: char,
477    ) -> Result<Token> {
478        let mut value = String::new();
479        value.push(first);
480        while self
481            .peek()
482            .is_some_and(|c| c.is_ascii_alphanumeric() || c == '_')
483        {
484            value.push(self.advance().unwrap());
485        }
486
487        // Phase 1 support: treat N'...' / n'...' as a string literal token.
488        // This unblocks Oracle/TSQL national string parsing without AST changes.
489        if value.len() == 1
490            && value
491                .as_bytes()
492                .first()
493                .is_some_and(|b| b.eq_ignore_ascii_case(&b'n'))
494            && self.peek() == Some('\'')
495        {
496            self.advance(); // consume opening quote
497            let mut token = self.read_string(start, start_line, start_col)?;
498            token.token_type = TokenType::NationalString;
499            return Ok(token);
500        }
501
502        let token_type = Self::keyword_type(&value);
503        Ok(self.make_token(token_type, value, start, start_line, start_col))
504    }
505
506    /// Map a word to its keyword token type, or `Identifier` if not a keyword.
507    fn keyword_type(word: &str) -> TokenType {
508        match word.to_uppercase().as_str() {
509            "SELECT" => TokenType::Select,
510            "FROM" => TokenType::From,
511            "WHERE" => TokenType::Where,
512            "AND" => TokenType::And,
513            "OR" => TokenType::Or,
514            "NOT" => TokenType::Not,
515            "AS" => TokenType::As,
516            "JOIN" => TokenType::Join,
517            "INNER" => TokenType::Inner,
518            "LEFT" => TokenType::Left,
519            "RIGHT" => TokenType::Right,
520            "FULL" => TokenType::Full,
521            "OUTER" => TokenType::Outer,
522            "CROSS" => TokenType::Cross,
523            "ON" => TokenType::On,
524            "INSERT" => TokenType::Insert,
525            "INTO" => TokenType::Into,
526            "VALUES" => TokenType::Values,
527            "UPDATE" => TokenType::Update,
528            "SET" => TokenType::Set,
529            "DELETE" => TokenType::Delete,
530            "CREATE" => TokenType::Create,
531            "TABLE" => TokenType::Table,
532            "DROP" => TokenType::Drop,
533            "ALTER" => TokenType::Alter,
534            "INDEX" => TokenType::Index,
535            "IF" => TokenType::If,
536            "EXISTS" => TokenType::Exists,
537            "IN" => TokenType::In,
538            "IS" => TokenType::Is,
539            "NULL" => TokenType::Null,
540            "LIKE" => TokenType::Like,
541            "ILIKE" => TokenType::ILike,
542            "ESCAPE" => TokenType::Escape,
543            "BETWEEN" => TokenType::Between,
544            "CASE" => TokenType::Case,
545            "WHEN" => TokenType::When,
546            "THEN" => TokenType::Then,
547            "ELSE" => TokenType::Else,
548            "END" => TokenType::End,
549            "ORDER" => TokenType::Order,
550            "BY" => TokenType::By,
551            "ASC" => TokenType::Asc,
552            "DESC" => TokenType::Desc,
553            "GROUP" => TokenType::Group,
554            "HAVING" => TokenType::Having,
555            "LIMIT" => TokenType::Limit,
556            "OFFSET" => TokenType::Offset,
557            "UNION" => TokenType::Union,
558            "ALL" => TokenType::All,
559            "DISTINCT" => TokenType::Distinct,
560            "TRUE" => TokenType::True,
561            "FALSE" => TokenType::False,
562            "INTERSECT" => TokenType::Intersect,
563            "EXCEPT" => TokenType::Except,
564            "WITH" => TokenType::With,
565            "RECURSIVE" => TokenType::Recursive,
566            "ANY" => TokenType::Any,
567            "SOME" => TokenType::Some,
568            "CAST" => TokenType::Cast,
569            "OVER" => TokenType::Over,
570            "PARTITION" => TokenType::Partition,
571            "WINDOW" => TokenType::Window,
572            "ROWS" => TokenType::Rows,
573            "RANGE" => TokenType::Range,
574            "UNBOUNDED" => TokenType::Unbounded,
575            "PRECEDING" => TokenType::Preceding,
576            "FOLLOWING" => TokenType::Following,
577            "FILTER" => TokenType::Filter,
578            "INT" => TokenType::Int,
579            "INTEGER" => TokenType::Integer,
580            "BIGINT" => TokenType::BigInt,
581            "SMALLINT" => TokenType::SmallInt,
582            "TINYINT" => TokenType::TinyInt,
583            "FLOAT" => TokenType::Float,
584            "DOUBLE" => TokenType::Double,
585            "DECIMAL" => TokenType::Decimal,
586            "NUMERIC" => TokenType::Numeric,
587            "REAL" => TokenType::Real,
588            "VARCHAR" => TokenType::Varchar,
589            "CHAR" | "CHARACTER" => TokenType::Char,
590            "TEXT" => TokenType::Text,
591            "BOOLEAN" | "BOOL" => TokenType::Boolean,
592            "DATE" => TokenType::Date,
593            "TIMESTAMP" => TokenType::Timestamp,
594            "TIMESTAMPTZ" => TokenType::TimestampTz,
595            "TIME" => TokenType::Time,
596            "INTERVAL" => TokenType::Interval,
597            "BLOB" => TokenType::Blob,
598            "BYTEA" => TokenType::Bytea,
599            "JSON" => TokenType::Json,
600            "JSONB" => TokenType::Jsonb,
601            "UUID" => TokenType::Uuid,
602            "ARRAY" => TokenType::Array,
603            "MAP" => TokenType::Map,
604            "STRUCT" => TokenType::Struct,
605            "PRIMARY" => TokenType::Primary,
606            "KEY" => TokenType::Key,
607            "FOREIGN" => TokenType::Foreign,
608            "REFERENCES" => TokenType::References,
609            "UNIQUE" => TokenType::Unique,
610            "CHECK" => TokenType::Check,
611            "DEFAULT" => TokenType::Default,
612            "CONSTRAINT" => TokenType::Constraint,
613            "AUTO_INCREMENT" | "AUTOINCREMENT" => TokenType::AutoIncrement,
614            "CASCADE" => TokenType::Cascade,
615            "RESTRICT" => TokenType::Restrict,
616            "RETURNING" => TokenType::Returning,
617            "CONFLICT" => TokenType::Conflict,
618            "DO" => TokenType::Do,
619            "NOTHING" => TokenType::Nothing,
620            "REPLACE" => TokenType::Replace,
621            "IGNORE" => TokenType::Ignore,
622            "MERGE" => TokenType::Merge,
623            "MATCHED" => TokenType::Matched,
624            "USING" => TokenType::Using,
625            "TRUNCATE" => TokenType::Truncate,
626            "SCHEMA" => TokenType::Schema,
627            "DATABASE" => TokenType::Database,
628            "VIEW" => TokenType::View,
629            "MATERIALIZED" => TokenType::Materialized,
630            "TEMPORARY" => TokenType::Temporary,
631            "TEMP" => TokenType::Temp,
632            "BEGIN" => TokenType::Begin,
633            "COMMIT" => TokenType::Commit,
634            "ROLLBACK" => TokenType::Rollback,
635            "SAVEPOINT" => TokenType::Savepoint,
636            "TRANSACTION" => TokenType::Transaction,
637            "EXPLAIN" => TokenType::Explain,
638            "ANALYZE" => TokenType::Analyze,
639            "SHOW" => TokenType::Show,
640            "USE" => TokenType::Use,
641            "GRANT" => TokenType::Grant,
642            "REVOKE" => TokenType::Revoke,
643            "LATERAL" => TokenType::Lateral,
644            "UNNEST" => TokenType::Unnest,
645            "PIVOT" => TokenType::Pivot,
646            "UNPIVOT" => TokenType::Unpivot,
647            "TABLESAMPLE" => TokenType::Tablesample,
648            "FETCH" => TokenType::Fetch,
649            "FIRST" => TokenType::First,
650            "NEXT" => TokenType::Next,
651            "ONLY" => TokenType::Only,
652            "NULLS" => TokenType::Nulls,
653            "RESPECT" => TokenType::Respect,
654            "TOP" => TokenType::Top,
655            "COLLATE" => TokenType::Collate,
656            "QUALIFY" => TokenType::Qualify,
657            "CUBE" => TokenType::Cube,
658            "ROLLUP" => TokenType::Rollup,
659            "GROUPING" => TokenType::Grouping,
660            "SETS" => TokenType::Sets,
661            "XOR" => TokenType::Xor,
662            "EXTRACT" => TokenType::Extract,
663            "EPOCH" => TokenType::Epoch,
664            "YEAR" => TokenType::Year,
665            "MONTH" => TokenType::Month,
666            "DAY" => TokenType::Day,
667            "HOUR" => TokenType::Hour,
668            "MINUTE" => TokenType::Minute,
669            "SECOND" => TokenType::Second,
670            _ => TokenType::Identifier,
671        }
672    }
673
674    fn read_quoted_identifier(
675        &mut self,
676        start: usize,
677        start_line: usize,
678        start_col: usize,
679        quote: char,
680    ) -> Result<Token> {
681        let end_char = if quote == '[' { ']' } else { quote };
682        let mut value = String::new();
683        loop {
684            match self.advance() {
685                Some(c) if c == end_char => {
686                    if self.peek() == Some(end_char) && end_char != ']' {
687                        self.advance();
688                        value.push(end_char);
689                    } else {
690                        return Ok(Token::with_quote(
691                            TokenType::Identifier,
692                            value,
693                            start,
694                            start_line,
695                            start_col,
696                            quote,
697                        ));
698                    }
699                }
700                Some(c) => value.push(c),
701                None => {
702                    return Err(SqlglotError::TokenizerError {
703                        message: format!("Unterminated quoted identifier (expected {end_char})"),
704                        position: start,
705                    });
706                }
707            }
708        }
709    }
710}
711
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a tokenizer built with `Tokenizer::new` over `sql`, panicking if
    /// tokenization returns an error.
    fn lex(sql: &str) -> Vec<Token> {
        Tokenizer::new(sql).tokenize().unwrap()
    }

    /// Same as [`lex`], but builds the tokenizer with `Tokenizer::with_comments`.
    fn lex_keeping_comments(sql: &str) -> Vec<Token> {
        Tokenizer::with_comments(sql).tokenize().unwrap()
    }

    /// Asserts that the token at `index` has the given type.
    ///
    /// Indexes directly so that a too-short token stream fails the test.
    #[track_caller]
    fn expect_kind(tokens: &[Token], index: usize, kind: TokenType) {
        assert_eq!(tokens[index].token_type, kind);
    }

    /// Asserts both the type and the textual value of the token at `index`.
    #[track_caller]
    fn expect_token(tokens: &[Token], index: usize, kind: TokenType, text: &str) {
        expect_kind(tokens, index, kind);
        assert_eq!(tokens[index].value, text);
    }

    /// Reports whether any token in the stream has the given type.
    fn has_kind(tokens: &[Token], kind: TokenType) -> bool {
        tokens.iter().any(|tok| tok.token_type == kind)
    }

    // ── Basic statements, literals and operators ────────────────────────

    #[test]
    fn test_tokenize_simple_select() {
        let tokens = lex("SELECT a, b FROM t");
        expect_kind(&tokens, 0, TokenType::Select);
        expect_token(&tokens, 1, TokenType::Identifier, "a");
        expect_kind(&tokens, 2, TokenType::Comma);
        expect_token(&tokens, 3, TokenType::Identifier, "b");
        expect_kind(&tokens, 4, TokenType::From);
        expect_token(&tokens, 5, TokenType::Identifier, "t");
        expect_kind(&tokens, 6, TokenType::Eof);
    }

    #[test]
    fn test_tokenize_string_literal() {
        let tokens = lex("'hello world'");
        expect_token(&tokens, 0, TokenType::String, "hello world");
    }

    #[test]
    fn test_tokenize_operators() {
        let tokens = lex("a >= 1 AND b != 2");
        expect_kind(&tokens, 1, TokenType::GtEq);
        expect_kind(&tokens, 3, TokenType::And);
        expect_kind(&tokens, 5, TokenType::Neq);
    }

    #[test]
    fn test_tokenize_number() {
        let tokens = lex("123.45");
        expect_token(&tokens, 0, TokenType::Number, "123.45");
    }

    // ── Comments (only visible when the tokenizer preserves them) ───────

    #[test]
    fn test_tokenize_line_comment() {
        let tokens = lex_keeping_comments("SELECT 1 -- comment\nFROM t");
        assert!(has_kind(&tokens, TokenType::LineComment));
    }

    #[test]
    fn test_tokenize_block_comment() {
        let tokens = lex_keeping_comments("SELECT /* hello */ 1");
        assert!(has_kind(&tokens, TokenType::BlockComment));
    }

    // ── Keywords and multi-character operators ──────────────────────────

    #[test]
    fn test_tokenize_cte_keywords() {
        let tokens = lex("WITH cte AS (SELECT 1) SELECT * FROM cte");
        expect_kind(&tokens, 0, TokenType::With);
        expect_kind(&tokens, 2, TokenType::As);
    }

    #[test]
    fn test_tokenize_double_colon() {
        let tokens = lex("x::int");
        expect_kind(&tokens, 1, TokenType::DoubleColon);
    }

    #[test]
    fn test_tokenize_cast() {
        let tokens = lex("CAST(x AS INT)");
        expect_kind(&tokens, 0, TokenType::Cast);
    }

    #[test]
    fn test_tokenize_window() {
        let tokens = lex("ROW_NUMBER() OVER (PARTITION BY id ORDER BY name)");
        assert!(has_kind(&tokens, TokenType::Over));
        assert!(has_kind(&tokens, TokenType::Partition));
    }

    // ── Position tracking ───────────────────────────────────────────────

    #[test]
    fn test_line_tracking() {
        let tokens = lex("SELECT\n  1");
        let (first, second) = (&tokens[0], &tokens[1]);
        assert_eq!(first.line, 1);
        assert_eq!(second.line, 2);
    }

    #[test]
    fn test_tokenize_union_intersect_except() {
        let tokens = lex("UNION INTERSECT EXCEPT");
        expect_kind(&tokens, 0, TokenType::Union);
        expect_kind(&tokens, 1, TokenType::Intersect);
        expect_kind(&tokens, 2, TokenType::Except);
    }

    // ── N-prefixed (national) string literals ───────────────────────────

    #[test]
    fn test_tokenize_n_prefixed_string_literal_uppercase() {
        let tokens = lex("N'Hello'");
        expect_token(&tokens, 0, TokenType::NationalString, "Hello");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_lowercase() {
        let tokens = lex("n'hello'");
        expect_token(&tokens, 0, TokenType::NationalString, "hello");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_escaped_quote() {
        // A doubled single quote inside the literal is expected to yield one quote.
        let tokens = lex("N'can''t stop'");
        expect_token(&tokens, 0, TokenType::NationalString, "can't stop");
    }

    #[test]
    fn test_tokenize_n_prefixed_string_literal_unicode() {
        let tokens = lex("N'テスト'");
        expect_token(&tokens, 0, TokenType::NationalString, "テスト");
    }

    // ── `N` must stay an ordinary identifier when no quote follows ──────

    #[test]
    fn test_tokenize_identifier_n_without_quote() {
        let tokens = lex("SELECT N FROM t");
        expect_token(&tokens, 1, TokenType::Identifier, "N");
    }

    #[test]
    fn test_tokenize_identifier_name_starting_with_n() {
        let tokens = lex("SELECT NAME FROM t");
        expect_token(&tokens, 1, TokenType::Identifier, "NAME");
    }
}