Skip to main content

sqlglot_rust/tokens/
tokenizer.rs

1use crate::errors::{Result, SqlglotError};
2use crate::tokens::{Token, TokenType};
3
/// SQL tokenizer that converts a SQL string into a stream of tokens.
///
/// Tracks line and column numbers for error reporting. Supports:
/// - Single-line comments (`--`, and MySQL-style `#`)
/// - Block comments (`/* ... */`, including nesting)
/// - Quoted identifiers (`"..."`, backtick, and T-SQL `[...]`)
/// - String literals with escape handling (`''` doubling and backslash escapes)
/// - Multi-character operators (`<=`, `>=`, `<>`, `!=`, `||`, `::`, `->`, `->>`,
///   `#>`, `#>>`, `<<`, `>>`)
pub struct Tokenizer {
    /// The input SQL decoded into chars for O(1) indexed lookahead.
    input: Vec<char>,
    /// Index of the next unread char in `input`.
    pos: usize,
    /// 1-based line of the next unread char (incremented on `\n`).
    line: usize,
    /// 1-based column of the next unread char (reset to 1 after `\n`).
    col: usize,
    /// Whether to preserve comments as tokens.
    pub preserve_comments: bool,
}
20
21impl Tokenizer {
22    /// Create a new tokenizer for the given SQL input.
23    #[must_use]
24    pub fn new(input: &str) -> Self {
25        Self {
26            input: input.chars().collect(),
27            pos: 0,
28            line: 1,
29            col: 1,
30            preserve_comments: false,
31        }
32    }
33
34    /// Create a tokenizer that preserves comment tokens.
35    #[must_use]
36    pub fn with_comments(input: &str) -> Self {
37        Self {
38            input: input.chars().collect(),
39            pos: 0,
40            line: 1,
41            col: 1,
42            preserve_comments: true,
43        }
44    }
45
46    /// Tokenize the entire input and return a vector of tokens.
47    ///
48    /// Whitespace tokens are skipped. Comments are optionally preserved.
49    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
50        let mut tokens = Vec::new();
51        loop {
52            let token = self.next_token()?;
53            match token.token_type {
54                TokenType::Eof => {
55                    tokens.push(token);
56                    break;
57                }
58                TokenType::Whitespace => continue,
59                TokenType::LineComment | TokenType::BlockComment => {
60                    if self.preserve_comments {
61                        tokens.push(token);
62                    }
63                }
64                _ => tokens.push(token),
65            }
66        }
67        Ok(tokens)
68    }
69
70    fn peek(&self) -> Option<char> {
71        self.input.get(self.pos).copied()
72    }
73
74    fn peek_at(&self, offset: usize) -> Option<char> {
75        self.input.get(self.pos + offset).copied()
76    }
77
78    fn advance(&mut self) -> Option<char> {
79        let ch = self.input.get(self.pos).copied();
80        if let Some(c) = ch {
81            self.pos += 1;
82            if c == '\n' {
83                self.line += 1;
84                self.col = 1;
85            } else {
86                self.col += 1;
87            }
88        }
89        ch
90    }
91
    /// Build a token carrying the source location (`start` offset, line,
    /// column) that was captured before the token's first character was
    /// consumed.
    fn make_token(
        &self,
        token_type: TokenType,
        value: impl Into<String>,
        start: usize,
        start_line: usize,
        start_col: usize,
    ) -> Token {
        Token::with_location(token_type, value, start, start_line, start_col)
    }
102
    /// Produce the next token from the input.
    ///
    /// Leading whitespace is skipped. Returns an `Eof` token when the input
    /// is exhausted.
    ///
    /// # Errors
    ///
    /// Returns a `TokenizerError` for an unexpected character, a bare `!`
    /// not followed by `=`, or an unterminated block comment.
    fn next_token(&mut self) -> Result<Token> {
        // Skip whitespace
        while self.peek().is_some_and(|c| c.is_whitespace()) {
            self.advance();
        }

        // Capture the start position *after* the whitespace skip so token
        // locations point at the first meaningful character.
        let start = self.pos;
        let start_line = self.line;
        let start_col = self.col;

        let Some(ch) = self.advance() else {
            return Ok(self.make_token(TokenType::Eof, "", start, start_line, start_col));
        };

        match ch {
            // ── Punctuation ─────────────────────────────────────────
            '(' => Ok(self.make_token(TokenType::LParen, "(", start, start_line, start_col)),
            ')' => Ok(self.make_token(TokenType::RParen, ")", start, start_line, start_col)),
            '[' => {
                // Check if this is a bracket-quoted identifier (T-SQL style: [identifier])
                // Only treat as quoted identifier if the content between [ and ] looks like
                // an identifier (starts with a letter or underscore, no commas inside).
                let mut looks_like_ident = false;
                if let Some(first_inner) = self.peek()
                    && (first_inner.is_ascii_alphabetic() || first_inner == '_')
                {
                    // Lookahead scan only — `self.pos` is not advanced here.
                    let mut scan = self.pos;
                    while scan < self.input.len() {
                        if self.input[scan] == ']' {
                            // Require at least one char inside, so `[]` stays LBracket.
                            looks_like_ident = scan > self.pos;
                            break;
                        }
                        if self.input[scan] == ',' || self.input[scan] == '\n' {
                            break;
                        }
                        scan += 1;
                    }
                }
                if looks_like_ident {
                    self.read_quoted_identifier(start, start_line, start_col, '[')
                } else {
                    Ok(self.make_token(TokenType::LBracket, "[", start, start_line, start_col))
                }
            }
            ']' => Ok(self.make_token(TokenType::RBracket, "]", start, start_line, start_col)),
            '{' => Ok(self.make_token(TokenType::LBrace, "{", start, start_line, start_col)),
            '}' => Ok(self.make_token(TokenType::RBrace, "}", start, start_line, start_col)),
            ',' => Ok(self.make_token(TokenType::Comma, ",", start, start_line, start_col)),
            ';' => Ok(self.make_token(TokenType::Semicolon, ";", start, start_line, start_col)),
            '.' => Ok(self.make_token(TokenType::Dot, ".", start, start_line, start_col)),
            '+' => Ok(self.make_token(TokenType::Plus, "+", start, start_line, start_col)),
            '~' => Ok(self.make_token(TokenType::BitwiseNot, "~", start, start_line, start_col)),
            '@' => Ok(self.make_token(TokenType::AtSign, "@", start, start_line, start_col)),
            '=' => Ok(self.make_token(TokenType::Eq, "=", start, start_line, start_col)),
            '*' => Ok(self.make_token(TokenType::Star, "*", start, start_line, start_col)),
            '%' => Ok(self.make_token(TokenType::Percent2, "%", start, start_line, start_col)),
            '^' => Ok(self.make_token(TokenType::BitwiseXor, "^", start, start_line, start_col)),

            // ── Colon ───────────────────────────────────────────────
            ':' => {
                // `::` is the cast operator; a lone `:` is a plain colon.
                if self.peek() == Some(':') {
                    self.advance();
                    Ok(self.make_token(TokenType::DoubleColon, "::", start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::Colon, ":", start, start_line, start_col))
                }
            }

            // ── Minus / line comment / arrow ────────────────────────
            '-' => {
                if self.peek() == Some('-') {
                    // `--` line comment: consume to end of line (newline not included).
                    self.advance();
                    let mut value = String::from("--");
                    while self.peek().is_some_and(|c| c != '\n') {
                        value.push(self.advance().unwrap());
                    }
                    Ok(
                        self.make_token(
                            TokenType::LineComment,
                            value,
                            start,
                            start_line,
                            start_col,
                        ),
                    )
                } else if self.peek() == Some('>') {
                    // JSON access operators: `->` and `->>`.
                    self.advance();
                    if self.peek() == Some('>') {
                        self.advance();
                        Ok(self.make_token(
                            TokenType::DoubleArrow,
                            "->>",
                            start,
                            start_line,
                            start_col,
                        ))
                    } else {
                        Ok(self.make_token(TokenType::Arrow, "->", start, start_line, start_col))
                    }
                } else {
                    Ok(self.make_token(TokenType::Minus, "-", start, start_line, start_col))
                }
            }

            // ── Slash / block comment ───────────────────────────────
            '/' => {
                if self.peek() == Some('*') {
                    // Block comment with nesting support via a depth counter.
                    self.advance();
                    let mut value = String::from("/*");
                    let mut depth = 1;
                    while depth > 0 {
                        match self.advance() {
                            Some('*') if self.peek() == Some('/') => {
                                self.advance();
                                depth -= 1;
                                value.push_str("*/");
                            }
                            Some('/') if self.peek() == Some('*') => {
                                self.advance();
                                depth += 1;
                                value.push_str("/*");
                            }
                            Some(c) => value.push(c),
                            None => {
                                return Err(SqlglotError::TokenizerError {
                                    message: "Unterminated block comment".into(),
                                    position: start,
                                });
                            }
                        }
                    }
                    Ok(self.make_token(
                        TokenType::BlockComment,
                        value,
                        start,
                        start_line,
                        start_col,
                    ))
                } else {
                    Ok(self.make_token(TokenType::Slash, "/", start, start_line, start_col))
                }
            }

            // ── Less-than variants ──────────────────────────────────
            '<' => {
                if self.peek() == Some('=') {
                    self.advance();
                    Ok(self.make_token(TokenType::LtEq, "<=", start, start_line, start_col))
                } else if self.peek() == Some('>') {
                    self.advance();
                    Ok(self.make_token(TokenType::Neq, "<>", start, start_line, start_col))
                } else if self.peek() == Some('<') {
                    self.advance();
                    Ok(self.make_token(TokenType::ShiftLeft, "<<", start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::Lt, "<", start, start_line, start_col))
                }
            }

            // ── Greater-than variants ───────────────────────────────
            '>' => {
                if self.peek() == Some('=') {
                    self.advance();
                    Ok(self.make_token(TokenType::GtEq, ">=", start, start_line, start_col))
                } else if self.peek() == Some('>') {
                    self.advance();
                    Ok(self.make_token(TokenType::ShiftRight, ">>", start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::Gt, ">", start, start_line, start_col))
                }
            }

            // ── Bang ────────────────────────────────────────────────
            '!' => {
                // Only `!=` is valid; a bare `!` is an error.
                if self.peek() == Some('=') {
                    self.advance();
                    Ok(self.make_token(TokenType::Neq, "!=", start, start_line, start_col))
                } else {
                    Err(SqlglotError::TokenizerError {
                        message: format!("Unexpected character: {ch}"),
                        position: start,
                    })
                }
            }

            // ── Pipe / BitwiseOr / Concat ───────────────────────────
            '|' => {
                if self.peek() == Some('|') {
                    self.advance();
                    Ok(self.make_token(TokenType::Concat, "||", start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::BitwiseOr, "|", start, start_line, start_col))
                }
            }

            // ── Ampersand ───────────────────────────────────────────
            '&' => Ok(self.make_token(TokenType::BitwiseAnd, "&", start, start_line, start_col)),

            // ── Hash ────────────────────────────────────────────────
            '#' => {
                // `#>` / `#>>` JSON path operators; otherwise a MySQL-style
                // `#` line comment running to end of line.
                if self.peek() == Some('>') {
                    self.advance();
                    if self.peek() == Some('>') {
                        self.advance();
                        Ok(self.make_token(
                            TokenType::HashDoubleArrow,
                            "#>>",
                            start,
                            start_line,
                            start_col,
                        ))
                    } else {
                        Ok(self.make_token(
                            TokenType::HashArrow,
                            "#>",
                            start,
                            start_line,
                            start_col,
                        ))
                    }
                } else {
                    let mut value = String::from("#");
                    while self.peek().is_some_and(|c| c != '\n') {
                        value.push(self.advance().unwrap());
                    }
                    Ok(
                        self.make_token(
                            TokenType::LineComment,
                            value,
                            start,
                            start_line,
                            start_col,
                        ),
                    )
                }
            }

            // ── String literals ─────────────────────────────────────
            '\'' => self.read_string(start, start_line, start_col),

            // ── Numbers ─────────────────────────────────────────────
            c if c.is_ascii_digit() => self.read_number(start, start_line, start_col, c),

            // ── Identifiers and keywords ────────────────────────────
            c if c.is_ascii_alphabetic() || c == '_' => {
                self.read_identifier(start, start_line, start_col, c)
            }

            // ── Quoted identifiers (double quote) ───────────────────
            '"' => self.read_quoted_identifier(start, start_line, start_col, '"'),

            // ── Backtick identifiers (MySQL, BigQuery) ──────────────
            '`' => self.read_quoted_identifier(start, start_line, start_col, '`'),

            // ── Parameter markers ───────────────────────────────────
            '$' => {
                // `$1`, `$2`, ... numbered parameters; a bare `$` is also
                // emitted as a parameter marker.
                if self.peek().is_some_and(|c| c.is_ascii_digit()) {
                    let mut value = String::from("$");
                    while self.peek().is_some_and(|c| c.is_ascii_digit()) {
                        value.push(self.advance().unwrap());
                    }
                    Ok(self.make_token(TokenType::Parameter, value, start, start_line, start_col))
                } else {
                    Ok(self.make_token(TokenType::Parameter, "$", start, start_line, start_col))
                }
            }

            '?' => Ok(self.make_token(TokenType::Parameter, "?", start, start_line, start_col)),

            _ => Err(SqlglotError::TokenizerError {
                message: format!("Unexpected character: {ch}"),
                position: start,
            }),
        }
    }
378
379    fn read_string(&mut self, start: usize, start_line: usize, start_col: usize) -> Result<Token> {
380        let mut value = String::new();
381        loop {
382            match self.advance() {
383                Some('\'') => {
384                    if self.peek() == Some('\'') {
385                        self.advance();
386                        value.push('\'');
387                    } else {
388                        return Ok(self.make_token(
389                            TokenType::String,
390                            value,
391                            start,
392                            start_line,
393                            start_col,
394                        ));
395                    }
396                }
397                Some('\\') => match self.peek() {
398                    Some('\\') => {
399                        self.advance();
400                        value.push('\\');
401                    }
402                    Some('n') => {
403                        self.advance();
404                        value.push('\n');
405                    }
406                    Some('t') => {
407                        self.advance();
408                        value.push('\t');
409                    }
410                    Some('r') => {
411                        self.advance();
412                        value.push('\r');
413                    }
414                    _ => {
415                        value.push('\\');
416                    }
417                },
418                Some(c) => value.push(c),
419                None => {
420                    return Err(SqlglotError::TokenizerError {
421                        message: "Unterminated string literal".into(),
422                        position: start,
423                    });
424                }
425            }
426        }
427    }
428
429    fn read_number(
430        &mut self,
431        start: usize,
432        start_line: usize,
433        start_col: usize,
434        first: char,
435    ) -> Result<Token> {
436        let mut value = String::new();
437        value.push(first);
438
439        if first == '0' && self.peek().is_some_and(|c| c == 'x' || c == 'X') {
440            value.push(self.advance().unwrap());
441            while self.peek().is_some_and(|c| c.is_ascii_hexdigit()) {
442                value.push(self.advance().unwrap());
443            }
444            return Ok(self.make_token(TokenType::HexString, value, start, start_line, start_col));
445        }
446
447        while self.peek().is_some_and(|c| c.is_ascii_digit()) {
448            value.push(self.advance().unwrap());
449        }
450
451        if self.peek() == Some('.') && self.peek_at(1).is_some_and(|c| c.is_ascii_digit()) {
452            value.push(self.advance().unwrap());
453            while self.peek().is_some_and(|c| c.is_ascii_digit()) {
454                value.push(self.advance().unwrap());
455            }
456        }
457
458        if self.peek().is_some_and(|c| c == 'e' || c == 'E') {
459            value.push(self.advance().unwrap());
460            if self.peek().is_some_and(|c| c == '+' || c == '-') {
461                value.push(self.advance().unwrap());
462            }
463            while self.peek().is_some_and(|c| c.is_ascii_digit()) {
464                value.push(self.advance().unwrap());
465            }
466        }
467
468        Ok(self.make_token(TokenType::Number, value, start, start_line, start_col))
469    }
470
471    fn read_identifier(
472        &mut self,
473        start: usize,
474        start_line: usize,
475        start_col: usize,
476        first: char,
477    ) -> Result<Token> {
478        let mut value = String::new();
479        value.push(first);
480        while self
481            .peek()
482            .is_some_and(|c| c.is_ascii_alphanumeric() || c == '_')
483        {
484            value.push(self.advance().unwrap());
485        }
486
487        let token_type = Self::keyword_type(&value);
488        Ok(self.make_token(token_type, value, start, start_line, start_col))
489    }
490
    /// Map a word to its keyword token type, or `Identifier` if not a keyword.
    ///
    /// Matching is case-insensitive (the word is uppercased first), so
    /// `select`, `Select`, and `SELECT` all map to `TokenType::Select`.
    fn keyword_type(word: &str) -> TokenType {
        match word.to_uppercase().as_str() {
            // Core query clauses and logical operators.
            "SELECT" => TokenType::Select,
            "FROM" => TokenType::From,
            "WHERE" => TokenType::Where,
            "AND" => TokenType::And,
            "OR" => TokenType::Or,
            "NOT" => TokenType::Not,
            "AS" => TokenType::As,
            // Joins.
            "JOIN" => TokenType::Join,
            "INNER" => TokenType::Inner,
            "LEFT" => TokenType::Left,
            "RIGHT" => TokenType::Right,
            "FULL" => TokenType::Full,
            "OUTER" => TokenType::Outer,
            "CROSS" => TokenType::Cross,
            "ON" => TokenType::On,
            // DML.
            "INSERT" => TokenType::Insert,
            "INTO" => TokenType::Into,
            "VALUES" => TokenType::Values,
            "UPDATE" => TokenType::Update,
            "SET" => TokenType::Set,
            "DELETE" => TokenType::Delete,
            // DDL.
            "CREATE" => TokenType::Create,
            "TABLE" => TokenType::Table,
            "DROP" => TokenType::Drop,
            "ALTER" => TokenType::Alter,
            "INDEX" => TokenType::Index,
            "IF" => TokenType::If,
            "EXISTS" => TokenType::Exists,
            // Predicates.
            "IN" => TokenType::In,
            "IS" => TokenType::Is,
            "NULL" => TokenType::Null,
            "LIKE" => TokenType::Like,
            "ILIKE" => TokenType::ILike,
            "ESCAPE" => TokenType::Escape,
            "BETWEEN" => TokenType::Between,
            // CASE expressions.
            "CASE" => TokenType::Case,
            "WHEN" => TokenType::When,
            "THEN" => TokenType::Then,
            "ELSE" => TokenType::Else,
            "END" => TokenType::End,
            // Ordering, grouping, limits.
            "ORDER" => TokenType::Order,
            "BY" => TokenType::By,
            "ASC" => TokenType::Asc,
            "DESC" => TokenType::Desc,
            "GROUP" => TokenType::Group,
            "HAVING" => TokenType::Having,
            "LIMIT" => TokenType::Limit,
            "OFFSET" => TokenType::Offset,
            // Set operations.
            "UNION" => TokenType::Union,
            "ALL" => TokenType::All,
            "DISTINCT" => TokenType::Distinct,
            "TRUE" => TokenType::True,
            "FALSE" => TokenType::False,
            "INTERSECT" => TokenType::Intersect,
            "EXCEPT" => TokenType::Except,
            // CTEs and subquery quantifiers.
            "WITH" => TokenType::With,
            "RECURSIVE" => TokenType::Recursive,
            "ANY" => TokenType::Any,
            "SOME" => TokenType::Some,
            "CAST" => TokenType::Cast,
            // Window functions.
            "OVER" => TokenType::Over,
            "PARTITION" => TokenType::Partition,
            "WINDOW" => TokenType::Window,
            "ROWS" => TokenType::Rows,
            "RANGE" => TokenType::Range,
            "UNBOUNDED" => TokenType::Unbounded,
            "PRECEDING" => TokenType::Preceding,
            "FOLLOWING" => TokenType::Following,
            "FILTER" => TokenType::Filter,
            // Data types.
            "INT" => TokenType::Int,
            "INTEGER" => TokenType::Integer,
            "BIGINT" => TokenType::BigInt,
            "SMALLINT" => TokenType::SmallInt,
            "TINYINT" => TokenType::TinyInt,
            "FLOAT" => TokenType::Float,
            "DOUBLE" => TokenType::Double,
            "DECIMAL" => TokenType::Decimal,
            "NUMERIC" => TokenType::Numeric,
            "REAL" => TokenType::Real,
            "VARCHAR" => TokenType::Varchar,
            "CHAR" | "CHARACTER" => TokenType::Char,
            "TEXT" => TokenType::Text,
            "BOOLEAN" | "BOOL" => TokenType::Boolean,
            "DATE" => TokenType::Date,
            "TIMESTAMP" => TokenType::Timestamp,
            "TIMESTAMPTZ" => TokenType::TimestampTz,
            "TIME" => TokenType::Time,
            "INTERVAL" => TokenType::Interval,
            "BLOB" => TokenType::Blob,
            "BYTEA" => TokenType::Bytea,
            "JSON" => TokenType::Json,
            "JSONB" => TokenType::Jsonb,
            "UUID" => TokenType::Uuid,
            "ARRAY" => TokenType::Array,
            "MAP" => TokenType::Map,
            "STRUCT" => TokenType::Struct,
            // Constraints.
            "PRIMARY" => TokenType::Primary,
            "KEY" => TokenType::Key,
            "FOREIGN" => TokenType::Foreign,
            "REFERENCES" => TokenType::References,
            "UNIQUE" => TokenType::Unique,
            "CHECK" => TokenType::Check,
            "DEFAULT" => TokenType::Default,
            "CONSTRAINT" => TokenType::Constraint,
            "AUTO_INCREMENT" | "AUTOINCREMENT" => TokenType::AutoIncrement,
            "CASCADE" => TokenType::Cascade,
            "RESTRICT" => TokenType::Restrict,
            // Upserts / merge.
            "RETURNING" => TokenType::Returning,
            "CONFLICT" => TokenType::Conflict,
            "DO" => TokenType::Do,
            "NOTHING" => TokenType::Nothing,
            "REPLACE" => TokenType::Replace,
            "IGNORE" => TokenType::Ignore,
            "MERGE" => TokenType::Merge,
            "MATCHED" => TokenType::Matched,
            "USING" => TokenType::Using,
            "TRUNCATE" => TokenType::Truncate,
            // Schema objects.
            "SCHEMA" => TokenType::Schema,
            "DATABASE" => TokenType::Database,
            "VIEW" => TokenType::View,
            "MATERIALIZED" => TokenType::Materialized,
            "TEMPORARY" => TokenType::Temporary,
            "TEMP" => TokenType::Temp,
            // Transactions.
            "BEGIN" => TokenType::Begin,
            "COMMIT" => TokenType::Commit,
            "ROLLBACK" => TokenType::Rollback,
            "SAVEPOINT" => TokenType::Savepoint,
            "TRANSACTION" => TokenType::Transaction,
            // Utility statements.
            "EXPLAIN" => TokenType::Explain,
            "ANALYZE" => TokenType::Analyze,
            "SHOW" => TokenType::Show,
            "USE" => TokenType::Use,
            "GRANT" => TokenType::Grant,
            "REVOKE" => TokenType::Revoke,
            // Table expressions and fetch clauses.
            "LATERAL" => TokenType::Lateral,
            "UNNEST" => TokenType::Unnest,
            "PIVOT" => TokenType::Pivot,
            "UNPIVOT" => TokenType::Unpivot,
            "TABLESAMPLE" => TokenType::Tablesample,
            "FETCH" => TokenType::Fetch,
            "FIRST" => TokenType::First,
            "NEXT" => TokenType::Next,
            "ONLY" => TokenType::Only,
            "NULLS" => TokenType::Nulls,
            "RESPECT" => TokenType::Respect,
            "TOP" => TokenType::Top,
            "COLLATE" => TokenType::Collate,
            "QUALIFY" => TokenType::Qualify,
            "XOR" => TokenType::Xor,
            // EXTRACT and date parts.
            "EXTRACT" => TokenType::Extract,
            "EPOCH" => TokenType::Epoch,
            "YEAR" => TokenType::Year,
            "MONTH" => TokenType::Month,
            "DAY" => TokenType::Day,
            "HOUR" => TokenType::Hour,
            "MINUTE" => TokenType::Minute,
            "SECOND" => TokenType::Second,
            _ => TokenType::Identifier,
        }
    }
654
    /// Read a quoted identifier whose opening quote `quote` has already been
    /// consumed. `quote` is `"` (SQL standard), a backtick (MySQL/BigQuery),
    /// or `[` (T-SQL), which closes with `]`.
    ///
    /// A doubled closing quote (`""` or doubled backtick) is an escape for a
    /// literal quote inside the identifier. Doubling is deliberately NOT
    /// applied to `]` (`end_char != ']'` below), so consecutive brackets —
    /// e.g. the tail of `x[y[1]]`, which the `[` heuristic in `next_token`
    /// can route here — are not swallowed as an escape.
    /// NOTE(review): this means T-SQL's `]]` escape inside `[...]` is
    /// unsupported — confirm that trade-off is intended.
    ///
    /// # Errors
    ///
    /// Returns a `TokenizerError` if the closing quote is never found.
    fn read_quoted_identifier(
        &mut self,
        start: usize,
        start_line: usize,
        start_col: usize,
        quote: char,
    ) -> Result<Token> {
        let end_char = if quote == '[' { ']' } else { quote };
        let mut value = String::new();
        loop {
            match self.advance() {
                Some(c) if c == end_char => {
                    if self.peek() == Some(end_char) && end_char != ']' {
                        // Doubled quote: escaped literal quote, keep reading.
                        self.advance();
                        value.push(end_char);
                    } else {
                        return Ok(Token::with_quote(
                            TokenType::Identifier,
                            value,
                            start,
                            start_line,
                            start_col,
                            quote,
                        ));
                    }
                }
                Some(c) => value.push(c),
                None => {
                    return Err(SqlglotError::TokenizerError {
                        message: format!("Unterminated quoted identifier (expected {end_char})"),
                        position: start,
                    });
                }
            }
        }
    }
691}
692
#[cfg(test)]
mod tests {
    use super::*;

    // Keywords, identifiers, punctuation, and the trailing Eof token.
    #[test]
    fn test_tokenize_simple_select() {
        let mut tokenizer = Tokenizer::new("SELECT a, b FROM t");
        let tokens = tokenizer.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::Select);
        assert_eq!(tokens[1].token_type, TokenType::Identifier);
        assert_eq!(tokens[1].value, "a");
        assert_eq!(tokens[2].token_type, TokenType::Comma);
        assert_eq!(tokens[3].token_type, TokenType::Identifier);
        assert_eq!(tokens[3].value, "b");
        assert_eq!(tokens[4].token_type, TokenType::From);
        assert_eq!(tokens[5].token_type, TokenType::Identifier);
        assert_eq!(tokens[5].value, "t");
        assert_eq!(tokens[6].token_type, TokenType::Eof);
    }

    // String token values exclude the surrounding quotes.
    #[test]
    fn test_tokenize_string_literal() {
        let mut tokenizer = Tokenizer::new("'hello world'");
        let tokens = tokenizer.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::String);
        assert_eq!(tokens[0].value, "hello world");
    }

    // Multi-character comparison operators.
    #[test]
    fn test_tokenize_operators() {
        let mut tokenizer = Tokenizer::new("a >= 1 AND b != 2");
        let tokens = tokenizer.tokenize().unwrap();
        assert_eq!(tokens[1].token_type, TokenType::GtEq);
        assert_eq!(tokens[3].token_type, TokenType::And);
        assert_eq!(tokens[5].token_type, TokenType::Neq);
    }

    // A decimal fraction is a single Number token.
    #[test]
    fn test_tokenize_number() {
        let mut tokenizer = Tokenizer::new("123.45");
        let tokens = tokenizer.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::Number);
        assert_eq!(tokens[0].value, "123.45");
    }

    // `with_comments` keeps `--` comments in the token stream.
    #[test]
    fn test_tokenize_line_comment() {
        let mut tok = Tokenizer::with_comments("SELECT 1 -- comment\nFROM t");
        let tokens = tok.tokenize().unwrap();
        assert!(
            tokens
                .iter()
                .any(|t| t.token_type == TokenType::LineComment)
        );
    }

    // `with_comments` keeps `/* ... */` comments in the token stream.
    #[test]
    fn test_tokenize_block_comment() {
        let mut tok = Tokenizer::with_comments("SELECT /* hello */ 1");
        let tokens = tok.tokenize().unwrap();
        assert!(
            tokens
                .iter()
                .any(|t| t.token_type == TokenType::BlockComment)
        );
    }

    #[test]
    fn test_tokenize_cte_keywords() {
        let mut tok = Tokenizer::new("WITH cte AS (SELECT 1) SELECT * FROM cte");
        let tokens = tok.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::With);
        assert_eq!(tokens[2].token_type, TokenType::As);
    }

    // `::` cast operator is one token, not two colons.
    #[test]
    fn test_tokenize_double_colon() {
        let mut tok = Tokenizer::new("x::int");
        let tokens = tok.tokenize().unwrap();
        assert_eq!(tokens[1].token_type, TokenType::DoubleColon);
    }

    #[test]
    fn test_tokenize_cast() {
        let mut tok = Tokenizer::new("CAST(x AS INT)");
        let tokens = tok.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::Cast);
    }

    #[test]
    fn test_tokenize_window() {
        let mut tok = Tokenizer::new("ROW_NUMBER() OVER (PARTITION BY id ORDER BY name)");
        let tokens = tok.tokenize().unwrap();
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Over));
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Partition));
    }

    // Tokens carry 1-based line numbers across newlines.
    #[test]
    fn test_line_tracking() {
        let mut tok = Tokenizer::new("SELECT\n  1");
        let tokens = tok.tokenize().unwrap();
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[1].line, 2);
    }

    #[test]
    fn test_tokenize_union_intersect_except() {
        let mut tok = Tokenizer::new("UNION INTERSECT EXCEPT");
        let tokens = tok.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::Union);
        assert_eq!(tokens[1].token_type, TokenType::Intersect);
        assert_eq!(tokens[2].token_type, TokenType::Except);
    }
}