//! SQL tokenizer (`sqlglot_rust/tokens/tokenizer.rs`).

1use crate::errors::{Result, SqlglotError};
2use crate::tokens::{Token, TokenType};
3
/// SQL tokenizer that converts a SQL string into a stream of tokens.
///
/// Tracks line and column numbers for error reporting. Supports:
/// - Single-line comments (`--`)
/// - Block comments (`/* ... */`)
/// - Quoted identifiers (`"..."` and backtick)
/// - String literals with escape handling
/// - Multi-character operators (`<=`, `>=`, `<>`, `!=`, `||`, `::`, `->`, `->>`)
pub struct Tokenizer {
    // Input decoded into a char vector so positions index whole characters,
    // not UTF-8 bytes.
    input: Vec<char>,
    // Index of the next unread char in `input`.
    pos: usize,
    // Current line number, 1-based; advanced on every '\n' consumed.
    line: usize,
    // Current column number, 1-based; reset to 1 after a newline.
    col: usize,
    /// Whether to preserve comments as tokens.
    pub preserve_comments: bool,
}
20
21impl Tokenizer {
22    /// Create a new tokenizer for the given SQL input.
23    #[must_use]
24    pub fn new(input: &str) -> Self {
25        Self {
26            input: input.chars().collect(),
27            pos: 0,
28            line: 1,
29            col: 1,
30            preserve_comments: false,
31        }
32    }
33
34    /// Create a tokenizer that preserves comment tokens.
35    #[must_use]
36    pub fn with_comments(input: &str) -> Self {
37        Self {
38            input: input.chars().collect(),
39            pos: 0,
40            line: 1,
41            col: 1,
42            preserve_comments: true,
43        }
44    }
45
46    /// Tokenize the entire input and return a vector of tokens.
47    ///
48    /// Whitespace tokens are skipped. Comments are optionally preserved.
49    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
50        let mut tokens = Vec::new();
51        loop {
52            let token = self.next_token()?;
53            match token.token_type {
54                TokenType::Eof => {
55                    tokens.push(token);
56                    break;
57                }
58                TokenType::Whitespace => continue,
59                TokenType::LineComment | TokenType::BlockComment => {
60                    if self.preserve_comments {
61                        tokens.push(token);
62                    }
63                }
64                _ => tokens.push(token),
65            }
66        }
67        Ok(tokens)
68    }
69
70    fn peek(&self) -> Option<char> {
71        self.input.get(self.pos).copied()
72    }
73
    /// Look `offset` chars past the cursor without consuming anything.
    /// Returns `None` when the lookahead runs off the end of input.
    fn peek_at(&self, offset: usize) -> Option<char> {
        self.input.get(self.pos + offset).copied()
    }
77
78    fn advance(&mut self) -> Option<char> {
79        let ch = self.input.get(self.pos).copied();
80        if let Some(c) = ch {
81            self.pos += 1;
82            if c == '\n' {
83                self.line += 1;
84                self.col = 1;
85            } else {
86                self.col += 1;
87            }
88        }
89        ch
90    }
91
92    fn make_token(&self, token_type: TokenType, value: impl Into<String>, start: usize, start_line: usize, start_col: usize) -> Token {
93        Token::with_location(token_type, value, start, start_line, start_col)
94    }
95
    /// Produce the next token from the input stream.
    ///
    /// Skips leading whitespace, records the token's start location, then
    /// dispatches on the first consumed character. Returns an `Eof` token
    /// once input is exhausted.
    ///
    /// # Errors
    /// Returns `TokenizerError` for an unterminated block comment, a bare
    /// `!` not followed by `=`, or any character that starts no known token.
    fn next_token(&mut self) -> Result<Token> {
        // Skip whitespace
        while self.peek().is_some_and(|c| c.is_whitespace()) {
            self.advance();
        }

        // Capture the location of the token's first char before consuming it.
        let start = self.pos;
        let start_line = self.line;
        let start_col = self.col;

        let Some(ch) = self.advance() else {
            return Ok(self.make_token(TokenType::Eof, "", start, start_line, start_col));
        };

        match ch {
            // ── Punctuation ─────────────────────────────────────────
            '(' => Ok(self.make_token(TokenType::LParen, "(", start, start_line, start_col)),
            ')' => Ok(self.make_token(TokenType::RParen, ")", start, start_line, start_col)),
            '[' => {
                // Check if this is a bracket-quoted identifier (T-SQL style: [identifier])
                // Only treat as quoted identifier if the content between [ and ] looks like
                // an identifier (starts with a letter or underscore, no commas inside).
                let mut looks_like_ident = false;
                if let Some(first_inner) = self.peek()
                    && (first_inner.is_ascii_alphabetic() || first_inner == '_')
                {
                    // Lookahead scan only — `self.pos` is not advanced here, so
                    // the non-identifier path still sees the bracket contents.
                    let mut scan = self.pos;
                    while scan < self.input.len() {
                        if self.input[scan] == ']' {
                            // Require at least one char between the brackets.
                            looks_like_ident = scan > self.pos;
                            break;
                        }
                        if self.input[scan] == ',' || self.input[scan] == '\n' {
                            break;
                        }
                        scan += 1;
                    }
                }
                if looks_like_ident {
                    self.read_quoted_identifier(start, start_line, start_col, '[')
                } else {
                    Ok(self.make_token(TokenType::LBracket, "[", start, start_line, start_col))
                }
            }
            ']' => Ok(self.make_token(TokenType::RBracket, "]", start, start_line, start_col)),
            '{' => Ok(self.make_token(TokenType::LBrace, "{", start, start_line, start_col)),
            '}' => Ok(self.make_token(TokenType::RBrace, "}", start, start_line, start_col)),
            ',' => Ok(self.make_token(TokenType::Comma, ",", start, start_line, start_col)),
            ';' => Ok(self.make_token(TokenType::Semicolon, ";", start, start_line, start_col)),
            '.' => Ok(self.make_token(TokenType::Dot, ".", start, start_line, start_col)),
            '+' => Ok(self.make_token(TokenType::Plus, "+", start, start_line, start_col)),
            '~' => Ok(self.make_token(TokenType::BitwiseNot, "~", start, start_line, start_col)),
            '@' => Ok(self.make_token(TokenType::AtSign, "@", start, start_line, start_col)),
            '=' => Ok(self.make_token(TokenType::Eq, "=", start, start_line, start_col)),
            '*' => Ok(self.make_token(TokenType::Star, "*", start, start_line, start_col)),
            // NOTE(review): variant is named `Percent2` — confirm this is the
            // intended token type for the modulo operator.
            '%' => Ok(self.make_token(TokenType::Percent2, "%", start, start_line, start_col)),
            '^' => Ok(self.make_token(TokenType::BitwiseXor, "^", start, start_line, start_col)),

            // ── Colon ───────────────────────────────────────────────
            // `::` (cast) must be checked before the single `:`.
            ':' => {
                if self.peek() == Some(':') {
                    self.advance();
                    Ok(self.make_token(TokenType::DoubleColon, "::", start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::Colon, ":", start, start_line, start_col))
                }
            }

            // ── Minus / line comment / arrow ────────────────────────
            '-' => {
                if self.peek() == Some('-') {
                    // `--` line comment: consume to end of line (newline not included).
                    self.advance();
                    let mut value = String::from("--");
                    while self.peek().is_some_and(|c| c != '\n') {
                        value.push(self.advance().unwrap());
                    }
                    Ok(self.make_token(TokenType::LineComment, value, start, start_line, start_col))
                } else if self.peek() == Some('>') {
                    // JSON access operators: `->>` must be tried before `->`.
                    self.advance();
                    if self.peek() == Some('>') {
                        self.advance();
                        Ok(self.make_token(TokenType::DoubleArrow, "->>", start, start_line, start_col))
                    } else {
                        Ok(self.make_token(TokenType::Arrow, "->", start, start_line, start_col))
                    }
                } else {
                    Ok(self.make_token(TokenType::Minus, "-", start, start_line, start_col))
                }
            }

            // ── Slash / block comment ───────────────────────────────
            '/' => {
                if self.peek() == Some('*') {
                    // Block comment with nesting support: depth tracks `/*`/`*/` pairs.
                    self.advance();
                    let mut value = String::from("/*");
                    let mut depth = 1;
                    while depth > 0 {
                        match self.advance() {
                            Some('*') if self.peek() == Some('/') => {
                                self.advance();
                                depth -= 1;
                                value.push_str("*/");
                            }
                            Some('/') if self.peek() == Some('*') => {
                                self.advance();
                                depth += 1;
                                value.push_str("/*");
                            }
                            Some(c) => value.push(c),
                            None => {
                                return Err(SqlglotError::TokenizerError {
                                    message: "Unterminated block comment".into(),
                                    position: start,
                                });
                            }
                        }
                    }
                    Ok(self.make_token(TokenType::BlockComment, value, start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::Slash, "/", start, start_line, start_col))
                }
            }

            // ── Less-than variants ──────────────────────────────────
            // Two-char forms (`<=`, `<>`, `<<`) are tried before plain `<`.
            '<' => {
                if self.peek() == Some('=') {
                    self.advance();
                    Ok(self.make_token(TokenType::LtEq, "<=", start, start_line, start_col))
                } else if self.peek() == Some('>') {
                    self.advance();
                    Ok(self.make_token(TokenType::Neq, "<>", start, start_line, start_col))
                } else if self.peek() == Some('<') {
                    self.advance();
                    Ok(self.make_token(TokenType::ShiftLeft, "<<", start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::Lt, "<", start, start_line, start_col))
                }
            }

            // ── Greater-than variants ───────────────────────────────
            '>' => {
                if self.peek() == Some('=') {
                    self.advance();
                    Ok(self.make_token(TokenType::GtEq, ">=", start, start_line, start_col))
                } else if self.peek() == Some('>') {
                    self.advance();
                    Ok(self.make_token(TokenType::ShiftRight, ">>", start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::Gt, ">", start, start_line, start_col))
                }
            }

            // ── Bang ────────────────────────────────────────────────
            // A bare `!` is not a token in this grammar; only `!=` is accepted.
            '!' => {
                if self.peek() == Some('=') {
                    self.advance();
                    Ok(self.make_token(TokenType::Neq, "!=", start, start_line, start_col))
                } else {
                    Err(SqlglotError::TokenizerError {
                        message: format!("Unexpected character: {ch}"),
                        position: start,
                    })
                }
            }

            // ── Pipe / BitwiseOr / Concat ───────────────────────────
            '|' => {
                if self.peek() == Some('|') {
                    self.advance();
                    Ok(self.make_token(TokenType::Concat, "||", start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::BitwiseOr, "|", start, start_line, start_col))
                }
            }

            // ── Ampersand ───────────────────────────────────────────
            '&' => Ok(self.make_token(TokenType::BitwiseAnd, "&", start, start_line, start_col)),

            // ── Hash ────────────────────────────────────────────────
            // `#>` / `#>>` are JSON path operators; a bare `#` starts a
            // MySQL-style line comment running to end of line.
            '#' => {
                if self.peek() == Some('>') {
                    self.advance();
                    if self.peek() == Some('>') {
                        self.advance();
                        Ok(self.make_token(TokenType::HashDoubleArrow, "#>>", start, start_line, start_col))
                    } else {
                        Ok(self.make_token(TokenType::HashArrow, "#>", start, start_line, start_col))
                    }
                } else {
                    let mut value = String::from("#");
                    while self.peek().is_some_and(|c| c != '\n') {
                        value.push(self.advance().unwrap());
                    }
                    Ok(self.make_token(TokenType::LineComment, value, start, start_line, start_col))
                }
            }

            // ── String literals ─────────────────────────────────────
            '\'' => self.read_string(start, start_line, start_col),

            // ── Numbers ─────────────────────────────────────────────
            c if c.is_ascii_digit() => self.read_number(start, start_line, start_col, c),

            // ── Identifiers and keywords ────────────────────────────
            c if c.is_ascii_alphabetic() || c == '_' => self.read_identifier(start, start_line, start_col, c),

            // ── Quoted identifiers (double quote) ───────────────────
            '"' => self.read_quoted_identifier(start, start_line, start_col, '"'),

            // ── Backtick identifiers (MySQL, BigQuery) ──────────────
            '`' => self.read_quoted_identifier(start, start_line, start_col, '`'),

            // ── Parameter markers ───────────────────────────────────
            // `$1`-style numbered parameters; a lone `$` is emitted as a
            // bare Parameter token.
            '$' => {
                if self.peek().is_some_and(|c| c.is_ascii_digit()) {
                    let mut value = String::from("$");
                    while self.peek().is_some_and(|c| c.is_ascii_digit()) {
                        value.push(self.advance().unwrap());
                    }
                    Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::Parameter, "$", start, start_line, start_col))
                }
            }

            '?' => Ok(self.make_token(TokenType::Parameter, "?", start, start_line, start_col)),

            _ => Err(SqlglotError::TokenizerError {
                message: format!("Unexpected character: {ch}"),
                position: start,
            }),
        }
    }
329
    /// Read a single-quoted string literal; the opening `'` is already consumed.
    ///
    /// Escape handling:
    /// - `''` (doubled quote) yields one literal `'`
    /// - `\\`, `\n`, `\t`, `\r` are decoded to their escaped characters
    /// - any other backslash sequence keeps the backslash literally
    ///
    /// The returned token value is the decoded content without the quotes.
    ///
    /// # Errors
    /// Returns `TokenizerError` if input ends before the closing quote.
    fn read_string(&mut self, start: usize, start_line: usize, start_col: usize) -> Result<Token> {
        let mut value = String::new();
        loop {
            match self.advance() {
                Some('\'') => {
                    // A doubled quote is an escaped quote; a lone quote ends the literal.
                    if self.peek() == Some('\'') {
                        self.advance();
                        value.push('\'');
                    } else {
                        return Ok(self.make_token(TokenType::String, value, start, start_line, start_col));
                    }
                }
                Some('\\') => {
                    match self.peek() {
                        Some('\\') => {
                            self.advance();
                            value.push('\\');
                        }
                        Some('n') => {
                            self.advance();
                            value.push('\n');
                        }
                        Some('t') => {
                            self.advance();
                            value.push('\t');
                        }
                        Some('r') => {
                            self.advance();
                            value.push('\r');
                        }
                        _ => {
                            // Unrecognized escape: keep the backslash; the next
                            // char is handled normally on the following iteration.
                            // NOTE(review): this means `\'` is NOT an escape — a
                            // quote after a kept backslash still terminates (or
                            // doubles). Confirm against the target dialects.
                            value.push('\\');
                        }
                    }
                }
                Some(c) => value.push(c),
                None => {
                    return Err(SqlglotError::TokenizerError {
                        message: "Unterminated string literal".into(),
                        position: start,
                    });
                }
            }
        }
    }
375
376    fn read_number(&mut self, start: usize, start_line: usize, start_col: usize, first: char) -> Result<Token> {
377        let mut value = String::new();
378        value.push(first);
379
380        if first == '0' && self.peek().is_some_and(|c| c == 'x' || c == 'X') {
381            value.push(self.advance().unwrap());
382            while self.peek().is_some_and(|c| c.is_ascii_hexdigit()) {
383                value.push(self.advance().unwrap());
384            }
385            return Ok(self.make_token(TokenType::HexString, value, start, start_line, start_col));
386        }
387
388        while self.peek().is_some_and(|c| c.is_ascii_digit()) {
389            value.push(self.advance().unwrap());
390        }
391
392        if self.peek() == Some('.') && self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) {
393            value.push(self.advance().unwrap());
394            while self.peek().is_some_and(|c| c.is_ascii_digit()) {
395                value.push(self.advance().unwrap());
396            }
397        }
398
399        if self.peek().is_some_and(|c| c == 'e' || c == 'E') {
400            value.push(self.advance().unwrap());
401            if self.peek().is_some_and(|c| c == '+' || c == '-') {
402                value.push(self.advance().unwrap());
403            }
404            while self.peek().is_some_and(|c| c.is_ascii_digit()) {
405                value.push(self.advance().unwrap());
406            }
407        }
408
409        Ok(self.make_token(TokenType::Number, value, start, start_line, start_col))
410    }
411
412    fn read_identifier(&mut self, start: usize, start_line: usize, start_col: usize, first: char) -> Result<Token> {
413        let mut value = String::new();
414        value.push(first);
415        while self
416            .peek()
417            .is_some_and(|c| c.is_ascii_alphanumeric() || c == '_')
418        {
419            value.push(self.advance().unwrap());
420        }
421
422        let token_type = Self::keyword_type(&value);
423        Ok(self.make_token(token_type, value, start, start_line, start_col))
424    }
425
426    /// Map a word to its keyword token type, or `Identifier` if not a keyword.
427    fn keyword_type(word: &str) -> TokenType {
428        match word.to_uppercase().as_str() {
429            "SELECT" => TokenType::Select,
430            "FROM" => TokenType::From,
431            "WHERE" => TokenType::Where,
432            "AND" => TokenType::And,
433            "OR" => TokenType::Or,
434            "NOT" => TokenType::Not,
435            "AS" => TokenType::As,
436            "JOIN" => TokenType::Join,
437            "INNER" => TokenType::Inner,
438            "LEFT" => TokenType::Left,
439            "RIGHT" => TokenType::Right,
440            "FULL" => TokenType::Full,
441            "OUTER" => TokenType::Outer,
442            "CROSS" => TokenType::Cross,
443            "ON" => TokenType::On,
444            "INSERT" => TokenType::Insert,
445            "INTO" => TokenType::Into,
446            "VALUES" => TokenType::Values,
447            "UPDATE" => TokenType::Update,
448            "SET" => TokenType::Set,
449            "DELETE" => TokenType::Delete,
450            "CREATE" => TokenType::Create,
451            "TABLE" => TokenType::Table,
452            "DROP" => TokenType::Drop,
453            "ALTER" => TokenType::Alter,
454            "INDEX" => TokenType::Index,
455            "IF" => TokenType::If,
456            "EXISTS" => TokenType::Exists,
457            "IN" => TokenType::In,
458            "IS" => TokenType::Is,
459            "NULL" => TokenType::Null,
460            "LIKE" => TokenType::Like,
461            "ILIKE" => TokenType::ILike,
462            "ESCAPE" => TokenType::Escape,
463            "BETWEEN" => TokenType::Between,
464            "CASE" => TokenType::Case,
465            "WHEN" => TokenType::When,
466            "THEN" => TokenType::Then,
467            "ELSE" => TokenType::Else,
468            "END" => TokenType::End,
469            "ORDER" => TokenType::Order,
470            "BY" => TokenType::By,
471            "ASC" => TokenType::Asc,
472            "DESC" => TokenType::Desc,
473            "GROUP" => TokenType::Group,
474            "HAVING" => TokenType::Having,
475            "LIMIT" => TokenType::Limit,
476            "OFFSET" => TokenType::Offset,
477            "UNION" => TokenType::Union,
478            "ALL" => TokenType::All,
479            "DISTINCT" => TokenType::Distinct,
480            "TRUE" => TokenType::True,
481            "FALSE" => TokenType::False,
482            "INTERSECT" => TokenType::Intersect,
483            "EXCEPT" => TokenType::Except,
484            "WITH" => TokenType::With,
485            "RECURSIVE" => TokenType::Recursive,
486            "ANY" => TokenType::Any,
487            "SOME" => TokenType::Some,
488            "CAST" => TokenType::Cast,
489            "OVER" => TokenType::Over,
490            "PARTITION" => TokenType::Partition,
491            "WINDOW" => TokenType::Window,
492            "ROWS" => TokenType::Rows,
493            "RANGE" => TokenType::Range,
494            "UNBOUNDED" => TokenType::Unbounded,
495            "PRECEDING" => TokenType::Preceding,
496            "FOLLOWING" => TokenType::Following,
497            "FILTER" => TokenType::Filter,
498            "INT" => TokenType::Int,
499            "INTEGER" => TokenType::Integer,
500            "BIGINT" => TokenType::BigInt,
501            "SMALLINT" => TokenType::SmallInt,
502            "TINYINT" => TokenType::TinyInt,
503            "FLOAT" => TokenType::Float,
504            "DOUBLE" => TokenType::Double,
505            "DECIMAL" => TokenType::Decimal,
506            "NUMERIC" => TokenType::Numeric,
507            "REAL" => TokenType::Real,
508            "VARCHAR" => TokenType::Varchar,
509            "CHAR" | "CHARACTER" => TokenType::Char,
510            "TEXT" => TokenType::Text,
511            "BOOLEAN" | "BOOL" => TokenType::Boolean,
512            "DATE" => TokenType::Date,
513            "TIMESTAMP" => TokenType::Timestamp,
514            "TIMESTAMPTZ" => TokenType::TimestampTz,
515            "TIME" => TokenType::Time,
516            "INTERVAL" => TokenType::Interval,
517            "BLOB" => TokenType::Blob,
518            "BYTEA" => TokenType::Bytea,
519            "JSON" => TokenType::Json,
520            "JSONB" => TokenType::Jsonb,
521            "UUID" => TokenType::Uuid,
522            "ARRAY" => TokenType::Array,
523            "MAP" => TokenType::Map,
524            "STRUCT" => TokenType::Struct,
525            "PRIMARY" => TokenType::Primary,
526            "KEY" => TokenType::Key,
527            "FOREIGN" => TokenType::Foreign,
528            "REFERENCES" => TokenType::References,
529            "UNIQUE" => TokenType::Unique,
530            "CHECK" => TokenType::Check,
531            "DEFAULT" => TokenType::Default,
532            "CONSTRAINT" => TokenType::Constraint,
533            "AUTO_INCREMENT" | "AUTOINCREMENT" => TokenType::AutoIncrement,
534            "CASCADE" => TokenType::Cascade,
535            "RESTRICT" => TokenType::Restrict,
536            "RETURNING" => TokenType::Returning,
537            "CONFLICT" => TokenType::Conflict,
538            "DO" => TokenType::Do,
539            "NOTHING" => TokenType::Nothing,
540            "REPLACE" => TokenType::Replace,
541            "IGNORE" => TokenType::Ignore,
542            "MERGE" => TokenType::Merge,
543            "MATCHED" => TokenType::Matched,
544            "USING" => TokenType::Using,
545            "TRUNCATE" => TokenType::Truncate,
546            "SCHEMA" => TokenType::Schema,
547            "DATABASE" => TokenType::Database,
548            "VIEW" => TokenType::View,
549            "MATERIALIZED" => TokenType::Materialized,
550            "TEMPORARY" => TokenType::Temporary,
551            "TEMP" => TokenType::Temp,
552            "BEGIN" => TokenType::Begin,
553            "COMMIT" => TokenType::Commit,
554            "ROLLBACK" => TokenType::Rollback,
555            "SAVEPOINT" => TokenType::Savepoint,
556            "TRANSACTION" => TokenType::Transaction,
557            "EXPLAIN" => TokenType::Explain,
558            "ANALYZE" => TokenType::Analyze,
559            "SHOW" => TokenType::Show,
560            "USE" => TokenType::Use,
561            "GRANT" => TokenType::Grant,
562            "REVOKE" => TokenType::Revoke,
563            "LATERAL" => TokenType::Lateral,
564            "UNNEST" => TokenType::Unnest,
565            "PIVOT" => TokenType::Pivot,
566            "UNPIVOT" => TokenType::Unpivot,
567            "TABLESAMPLE" => TokenType::Tablesample,
568            "FETCH" => TokenType::Fetch,
569            "FIRST" => TokenType::First,
570            "NEXT" => TokenType::Next,
571            "ONLY" => TokenType::Only,
572            "NULLS" => TokenType::Nulls,
573            "RESPECT" => TokenType::Respect,
574            "TOP" => TokenType::Top,
575            "COLLATE" => TokenType::Collate,
576            "QUALIFY" => TokenType::Qualify,
577            "XOR" => TokenType::Xor,
578            "EXTRACT" => TokenType::Extract,
579            "EPOCH" => TokenType::Epoch,
580            "YEAR" => TokenType::Year,
581            "MONTH" => TokenType::Month,
582            "DAY" => TokenType::Day,
583            "HOUR" => TokenType::Hour,
584            "MINUTE" => TokenType::Minute,
585            "SECOND" => TokenType::Second,
586            _ => TokenType::Identifier,
587        }
588    }
589
    /// Read a quoted identifier; the opening delimiter is already consumed.
    ///
    /// `quote` is the opening delimiter (`"`, backtick, or `[`); for `[` the
    /// closing delimiter is `]`. A doubled closing delimiter (`""`, doubled
    /// backtick) escapes one literal delimiter char — except for `]`, where
    /// T-SQL-style `]]` escaping is deliberately not applied so that a `]`
    /// always closes the identifier.
    ///
    /// The returned token records `quote` so the original quoting style can
    /// be reproduced by a generator.
    ///
    /// # Errors
    /// Returns `TokenizerError` if input ends before the closing delimiter.
    fn read_quoted_identifier(&mut self, start: usize, start_line: usize, start_col: usize, quote: char) -> Result<Token> {
        let end_char = if quote == '[' { ']' } else { quote };
        let mut value = String::new();
        loop {
            match self.advance() {
                Some(c) if c == end_char => {
                    // Doubled delimiter = escaped delimiter (not for `]`).
                    if self.peek() == Some(end_char) && end_char != ']' {
                        self.advance();
                        value.push(end_char);
                    } else {
                        return Ok(Token::with_quote(
                            TokenType::Identifier,
                            value,
                            start,
                            start_line,
                            start_col,
                            quote,
                        ));
                    }
                }
                Some(c) => value.push(c),
                None => {
                    return Err(SqlglotError::TokenizerError {
                        message: format!("Unterminated quoted identifier (expected {end_char})"),
                        position: start,
                    });
                }
            }
        }
    }
620}
621
#[cfg(test)]
mod tests {
    use super::*;

    // Keywords, identifiers, and punctuation come out in source order,
    // terminated by an Eof token.
    #[test]
    fn test_tokenize_simple_select() {
        let mut tokenizer = Tokenizer::new("SELECT a, b FROM t");
        let tokens = tokenizer.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Identifier);
        assert_eq!(tokens[1].value, "a");
        assert_eq!(tokens[2].token_type, TokenType::Comma);
        assert_eq!(tokens[3].token_type, TokenType::Identifier);
        assert_eq!(tokens[3].value, "b");
        assert_eq!(tokens[4].token_type, TokenType::From);
        assert_eq!(tokens[5].token_type, TokenType::Identifier);
        assert_eq!(tokens[5].value, "t");
        assert_eq!(tokens[6].token_type, TokenType::Eof);
    }

    // String token values exclude the surrounding quotes.
    #[test]
    fn test_tokenize_string_literal() {
        let mut tokenizer = Tokenizer::new("'hello world'");
        let tokens = tokenizer.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::String);
        assert_eq!(tokens[0].value, "hello world");
    }

    // Multi-char comparison operators are recognized greedily.
    #[test]
    fn test_tokenize_operators() {
        let mut tokenizer = Tokenizer::new("a >= 1 AND b != 2");
        let tokens = tokenizer.tokenize().unwrap();
        assert_eq!(tokens[1].token_type, TokenType::GtEq);
        assert_eq!(tokens[3].token_type, TokenType::And);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }

    // Decimal literals are a single Number token including the dot.
    #[test]
    fn test_tokenize_number() {
        let mut tokenizer = Tokenizer::new("123.45");
        let tokens = tokenizer.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].value, "123.45");
    }

    // `with_comments` keeps `--` line comments in the token stream.
    #[test]
    fn test_tokenize_line_comment() {
        let mut tok = Tokenizer::with_comments("SELECT 1 -- comment\nFROM t");
        let tokens = tok.tokenize().unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::LineComment));
    }

    // `with_comments` keeps `/* ... */` block comments in the token stream.
    #[test]
    fn test_tokenize_block_comment() {
        let mut tok = Tokenizer::with_comments("SELECT /* hello */ 1");
        let tokens = tok.tokenize().unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::BlockComment));
    }

    #[test]
    fn test_tokenize_cte_keywords() {
        let mut tok = Tokenizer::new("WITH cte AS (SELECT 1) SELECT * FROM cte");
        let tokens = tok.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::With);
        assert_eq!(tokens[2].token_type, TokenType::As);
    }

    // `::` is one DoubleColon token, not two Colons.
    #[test]
    fn test_tokenize_double_colon() {
        let mut tok = Tokenizer::new("x::int");
        let tokens = tok.tokenize().unwrap();
        assert_eq!(tokens[1].token_type, TokenType::DoubleColon);
    }

    #[test]
    fn test_tokenize_cast() {
        let mut tok = Tokenizer::new("CAST(x AS INT)");
        let tokens = tok.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::Cast);
    }

    #[test]
    fn test_tokenize_window() {
        let mut tok = Tokenizer::new("ROW_NUMBER() OVER (PARTITION BY id ORDER BY name)");
        let tokens = tok.tokenize().unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Over));
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Partition));
    }

    // Tokens after a newline report the incremented line number.
    #[test]
    fn test_line_tracking() {
        let mut tok = Tokenizer::new("SELECT\n  1");
        let tokens = tok.tokenize().unwrap();
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[1].line, 2);
    }

    #[test]
    fn test_tokenize_union_intersect_except() {
        let mut tok = Tokenizer::new("UNION INTERSECT EXCEPT");
        let tokens = tok.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::Union);
        assert_eq!(tokens[1].token_type, TokenType::Intersect);
        assert_eq!(tokens[2].token_type, TokenType::Except);
    }
}