// sql_cli/sql/parser/lexer.rs
1//! SQL Lexer - Tokenization of SQL queries
2//!
3//! This module handles the conversion of raw SQL text into tokens
4//! that can be consumed by the parser.
5
/// Lexer mode - controls whether comments are preserved or skipped.
#[derive(Debug, Clone, Copy, PartialEq, Default)]
pub enum LexerMode {
    /// Standard mode - skip comments (the default behavior).
    #[default]
    SkipComments,
    /// Preserve mode - tokenize comments as `LineComment`/`BlockComment` tokens.
    PreserveComments,
}
20
/// A single SQL token produced by the [`Lexer`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    With, // WITH clause for CTEs
    And,
    Or,
    In,
    Not,
    Between,
    Like,
    ILike, // Case-insensitive LIKE (PostgreSQL)
    Is,
    Null,
    OrderBy,
    GroupBy,
    Having,
    Qualify,
    As,
    Asc,
    Desc,
    Limit,
    Offset,
    Into,      // INTO keyword for temporary tables
    DateTime,  // DateTime constructor
    Case,      // CASE expression
    When,      // WHEN clause
    Then,      // THEN clause
    Else,      // ELSE clause
    End,       // END keyword
    Distinct,  // DISTINCT keyword for aggregate functions
    Over,      // OVER keyword for window functions
    Partition, // PARTITION keyword for window functions
    By,        // BY keyword (used with PARTITION BY, ORDER BY)
    Exclude,   // EXCLUDE keyword (for SELECT * EXCLUDE)
    // Note: REPLACE is NOT a keyword - it's handled as a function name
    // to avoid conflicting with the REPLACE() string function

    // PIVOT/UNPIVOT keywords
    Pivot,   // PIVOT keyword for row-to-column transformation
    Unpivot, // UNPIVOT keyword for column-to-row transformation
    For,     // FOR keyword (used in PIVOT: FOR column IN (...))

    // Window frame keywords
    Rows,      // ROWS frame type
    Range,     // RANGE frame type
    Unbounded, // UNBOUNDED for frame bounds
    Preceding, // PRECEDING for frame bounds
    Following, // FOLLOWING for frame bounds
    Current,   // CURRENT for CURRENT ROW
    Row,       // ROW for CURRENT ROW

    // Set operation keywords
    Union,     // UNION
    Intersect, // INTERSECT
    Except,    // EXCEPT

    // Special CTE keywords
    Web,  // WEB (for WEB CTEs)
    File, // FILE (for FILE CTEs — filesystem metadata)

    // Row expansion functions
    Unnest, // UNNEST (for expanding delimited strings into rows)

    // JOIN keywords
    Join,  // JOIN keyword
    Inner, // INNER JOIN
    Left,  // LEFT JOIN
    Right, // RIGHT JOIN
    Full,  // FULL JOIN
    Outer, // OUTER keyword (LEFT OUTER, RIGHT OUTER, FULL OUTER)
    On,    // ON keyword for join conditions
    Cross, // CROSS JOIN

    // Literals
    Identifier(String),
    QuotedIdentifier(String), // For "Customer Id" style identifiers
    StringLiteral(String),
    JsonBlock(String), // For $JSON$...$JSON$ delimited blocks
    NumberLiteral(String),
    Star,

    // Operators
    Dot,
    Comma,
    Colon,
    LeftParen,
    RightParen,
    Equal,
    NotEqual,
    LessThan,
    GreaterThan,
    LessThanOrEqual,
    GreaterThanOrEqual,

    // Arithmetic operators
    Plus,
    Minus,
    Divide,
    Modulo,

    // String operators
    Concat, // || for string concatenation

    // Comments (preserved for formatting)
    LineComment(String),  // -- comment text (without the -- prefix)
    BlockComment(String), // /* comment text */ (without delimiters)

    // Special
    Eof,
}

impl Token {
    /// Check if a string is a SQL keyword and return corresponding token.
    ///
    /// Matching is case-insensitive. Returns `None` for non-keywords.
    ///
    /// Note: RANGE maps to `Token::Range` here, while the skip-comments
    /// tokenizer deliberately treats RANGE as an identifier (it is
    /// context-sensitive: window frame keyword vs. table function).
    pub fn from_keyword(s: &str) -> Option<Token> {
        match s.to_uppercase().as_str() {
            "SELECT" => Some(Token::Select),
            "FROM" => Some(Token::From),
            "WHERE" => Some(Token::Where),
            "WITH" => Some(Token::With),
            "AND" => Some(Token::And),
            "OR" => Some(Token::Or),
            "IN" => Some(Token::In),
            "NOT" => Some(Token::Not),
            "BETWEEN" => Some(Token::Between),
            "LIKE" => Some(Token::Like),
            "ILIKE" => Some(Token::ILike),
            "IS" => Some(Token::Is),
            "NULL" => Some(Token::Null),
            "ORDER" => Some(Token::OrderBy),
            "GROUP" => Some(Token::GroupBy),
            "HAVING" => Some(Token::Having),
            "QUALIFY" => Some(Token::Qualify),
            "AS" => Some(Token::As),
            "ASC" => Some(Token::Asc),
            "DESC" => Some(Token::Desc),
            "LIMIT" => Some(Token::Limit),
            "OFFSET" => Some(Token::Offset),
            "INTO" => Some(Token::Into),
            // FIX: DATETIME was recognized by the skip-comments tokenizer but
            // missing here, so PreserveComments mode lexed it as an identifier.
            "DATETIME" => Some(Token::DateTime),
            "DISTINCT" => Some(Token::Distinct),
            "EXCLUDE" => Some(Token::Exclude),
            "PIVOT" => Some(Token::Pivot),
            "UNPIVOT" => Some(Token::Unpivot),
            "FOR" => Some(Token::For),
            "CASE" => Some(Token::Case),
            "WHEN" => Some(Token::When),
            "THEN" => Some(Token::Then),
            "ELSE" => Some(Token::Else),
            "END" => Some(Token::End),
            "OVER" => Some(Token::Over),
            "PARTITION" => Some(Token::Partition),
            "BY" => Some(Token::By),
            "ROWS" => Some(Token::Rows),
            "RANGE" => Some(Token::Range),
            "UNBOUNDED" => Some(Token::Unbounded),
            "PRECEDING" => Some(Token::Preceding),
            "FOLLOWING" => Some(Token::Following),
            "CURRENT" => Some(Token::Current),
            "ROW" => Some(Token::Row),
            "UNION" => Some(Token::Union),
            "INTERSECT" => Some(Token::Intersect),
            "EXCEPT" => Some(Token::Except),
            "WEB" => Some(Token::Web),
            "FILE" => Some(Token::File),
            "UNNEST" => Some(Token::Unnest),
            "JOIN" => Some(Token::Join),
            "INNER" => Some(Token::Inner),
            "LEFT" => Some(Token::Left),
            "RIGHT" => Some(Token::Right),
            "FULL" => Some(Token::Full),
            "OUTER" => Some(Token::Outer),
            "ON" => Some(Token::On),
            "CROSS" => Some(Token::Cross),
            _ => None,
        }
    }

    /// Check if token is a logical operator (AND / OR).
    pub fn is_logical_operator(&self) -> bool {
        matches!(self, Token::And | Token::Or)
    }

    /// Check if token is a join type qualifier (INNER/LEFT/RIGHT/FULL/CROSS).
    pub fn is_join_type(&self) -> bool {
        matches!(
            self,
            Token::Inner | Token::Left | Token::Right | Token::Full | Token::Cross
        )
    }

    /// Check if token ends a clause (starts a new clause or set operation).
    pub fn is_clause_terminator(&self) -> bool {
        matches!(
            self,
            Token::OrderBy
                | Token::GroupBy
                | Token::Having
                | Token::Limit
                | Token::Offset
                | Token::Union
                | Token::Intersect
                | Token::Except
        )
    }

    /// Get the string representation of a keyword token
    /// Returns the keyword as it would appear in SQL (uppercase)
    pub fn as_keyword_str(&self) -> Option<&'static str> {
        match self {
            Token::Select => Some("SELECT"),
            Token::From => Some("FROM"),
            Token::Where => Some("WHERE"),
            Token::With => Some("WITH"),
            Token::And => Some("AND"),
            Token::Or => Some("OR"),
            Token::In => Some("IN"),
            Token::Not => Some("NOT"),
            Token::Between => Some("BETWEEN"),
            Token::Like => Some("LIKE"),
            Token::ILike => Some("ILIKE"),
            Token::Is => Some("IS"),
            Token::Null => Some("NULL"),
            Token::OrderBy => Some("ORDER BY"),
            Token::GroupBy => Some("GROUP BY"),
            Token::Having => Some("HAVING"),
            Token::Qualify => Some("QUALIFY"),
            Token::As => Some("AS"),
            Token::Asc => Some("ASC"),
            Token::Desc => Some("DESC"),
            Token::Limit => Some("LIMIT"),
            Token::Offset => Some("OFFSET"),
            Token::Into => Some("INTO"),
            Token::Distinct => Some("DISTINCT"),
            Token::Exclude => Some("EXCLUDE"),
            Token::Pivot => Some("PIVOT"),
            Token::Unpivot => Some("UNPIVOT"),
            Token::For => Some("FOR"),
            Token::Case => Some("CASE"),
            Token::When => Some("WHEN"),
            Token::Then => Some("THEN"),
            Token::Else => Some("ELSE"),
            Token::End => Some("END"),
            Token::Join => Some("JOIN"),
            Token::Inner => Some("INNER"),
            Token::Left => Some("LEFT"),
            Token::Right => Some("RIGHT"),
            Token::Full => Some("FULL"),
            // FIX: OUTER/WEB/FILE/UNNEST are keyword tokens but were missing
            // here, so they could not be rendered back to SQL text.
            Token::Outer => Some("OUTER"),
            Token::Cross => Some("CROSS"),
            Token::On => Some("ON"),
            Token::Union => Some("UNION"),
            Token::Intersect => Some("INTERSECT"),
            Token::Except => Some("EXCEPT"),
            Token::Web => Some("WEB"),
            Token::File => Some("FILE"),
            Token::Unnest => Some("UNNEST"),
            Token::Over => Some("OVER"),
            Token::Partition => Some("PARTITION"),
            Token::By => Some("BY"),
            Token::Rows => Some("ROWS"),
            Token::Range => Some("RANGE"),
            Token::Preceding => Some("PRECEDING"),
            Token::Following => Some("FOLLOWING"),
            Token::Current => Some("CURRENT"),
            Token::Row => Some("ROW"),
            Token::Unbounded => Some("UNBOUNDED"),
            Token::DateTime => Some("DATETIME"),
            _ => None,
        }
    }
}
290
/// Character-level lexer over a SQL query string.
///
/// The input is pre-split into a `Vec<char>` so peeking at arbitrary
/// offsets is O(1) and Unicode-safe.
#[derive(Debug, Clone)]
pub struct Lexer {
    input: Vec<char>,           // the full input, one element per char
    position: usize,            // index of the current character in `input`
    current_char: Option<char>, // cached `input[position]`; None once past the end
    mode: LexerMode,            // whether comments become tokens or are discarded
}
298
299impl Lexer {
300    #[must_use]
301    pub fn new(input: &str) -> Self {
302        Self::with_mode(input, LexerMode::default())
303    }
304
305    /// Create a new lexer with specified mode
306    #[must_use]
307    pub fn with_mode(input: &str, mode: LexerMode) -> Self {
308        let chars: Vec<char> = input.chars().collect();
309        let current = chars.first().copied();
310        Self {
311            input: chars,
312            position: 0,
313            current_char: current,
314            mode,
315        }
316    }
317
    /// Advance the cursor one character; `current_char` becomes `None`
    /// once the end of input is passed.
    fn advance(&mut self) {
        self.position += 1;
        self.current_char = self.input.get(self.position).copied();
    }
322
    /// Look at the character `offset` positions ahead of the cursor
    /// (offset 0 is the current character) without consuming anything.
    fn peek(&self, offset: usize) -> Option<char> {
        self.input.get(self.position + offset).copied()
    }
326
327    /// Peek ahead n characters and return as a string
328    fn peek_string(&self, n: usize) -> String {
329        let mut result = String::new();
330        for i in 0..n {
331            if let Some(ch) = self.input.get(self.position + i) {
332                result.push(*ch);
333            } else {
334                break;
335            }
336        }
337        result
338    }
339
340    /// Read a JSON block delimited by $JSON$...$JSON$
341    /// Consumes the opening delimiter and reads until closing $JSON$
342    fn read_json_block(&mut self) -> String {
343        let mut result = String::new();
344
345        // Skip opening $JSON$
346        for _ in 0..6 {
347            self.advance();
348        }
349
350        // Read until we find closing $JSON$
351        while let Some(ch) = self.current_char {
352            // Check if we're at the closing delimiter
353            if ch == '$' && self.peek_string(6) == "$JSON$" {
354                // Skip closing $JSON$
355                for _ in 0..6 {
356                    self.advance();
357                }
358                break;
359            }
360            result.push(ch);
361            self.advance();
362        }
363
364        result
365    }
366
367    fn skip_whitespace(&mut self) {
368        while let Some(ch) = self.current_char {
369            if ch.is_whitespace() {
370                self.advance();
371            } else {
372                break;
373            }
374        }
375    }
376
377    /// Read a line comment and return its content (without the -- prefix)
378    fn read_line_comment(&mut self) -> String {
379        let mut result = String::new();
380
381        // Skip '--'
382        self.advance();
383        self.advance();
384
385        // Read until end of line or EOF
386        while let Some(ch) = self.current_char {
387            if ch == '\n' {
388                self.advance(); // consume the newline
389                break;
390            }
391            result.push(ch);
392            self.advance();
393        }
394
395        result
396    }
397
398    /// Read a block comment and return its content (without /* */ delimiters)
399    fn read_block_comment(&mut self) -> String {
400        let mut result = String::new();
401
402        // Skip '/*'
403        self.advance();
404        self.advance();
405
406        // Read until we find '*/'
407        while let Some(ch) = self.current_char {
408            if ch == '*' && self.peek(1) == Some('/') {
409                self.advance(); // skip '*'
410                self.advance(); // skip '/'
411                break;
412            }
413            result.push(ch);
414            self.advance();
415        }
416
417        result
418    }
419
    /// Skip whitespace and comments (for backwards compatibility with parser)
    /// This is the old behavior that discards comments
    ///
    /// Loops because whitespace and comments can alternate (e.g. a comment,
    /// a newline, then another comment); returns only once the cursor rests
    /// on a real token character or end of input.
    fn skip_whitespace_and_comments(&mut self) {
        loop {
            // Skip whitespace
            while let Some(ch) = self.current_char {
                if ch.is_whitespace() {
                    self.advance();
                } else {
                    break;
                }
            }

            // Check for comments
            match self.current_char {
                Some('-') if self.peek(1) == Some('-') => {
                    // Single-line comment: skip until end of line
                    self.advance(); // skip first '-'
                    self.advance(); // skip second '-'
                    // advance-then-test order means the newline itself is consumed
                    while let Some(ch) = self.current_char {
                        self.advance();
                        if ch == '\n' {
                            break;
                        }
                    }
                }
                Some('/') if self.peek(1) == Some('*') => {
                    // Multi-line comment: skip until */
                    self.advance(); // skip '/'
                    self.advance(); // skip '*'
                    while let Some(ch) = self.current_char {
                        if ch == '*' && self.peek(1) == Some('/') {
                            self.advance(); // skip '*'
                            self.advance(); // skip '/'
                            break;
                        }
                        self.advance();
                    }
                }
                _ => {
                    // No more comments or whitespace
                    break;
                }
            }
        }
    }
466
467    fn read_identifier(&mut self) -> String {
468        let mut result = String::new();
469        while let Some(ch) = self.current_char {
470            if ch.is_alphanumeric() || ch == '_' {
471                result.push(ch);
472                self.advance();
473            } else {
474                break;
475            }
476        }
477        result
478    }
479
480    fn read_string(&mut self) -> String {
481        let mut result = String::new();
482        let quote_char = self.current_char.unwrap(); // ' or "
483        self.advance(); // skip opening quote
484
485        while let Some(ch) = self.current_char {
486            if ch == quote_char {
487                self.advance(); // skip closing quote
488                break;
489            }
490            result.push(ch);
491            self.advance();
492        }
493        result
494    }
495
496    fn read_number(&mut self) -> String {
497        let mut result = String::new();
498        let has_e = false;
499
500        // Read the main number part (including decimal point)
501        while let Some(ch) = self.current_char {
502            if !has_e && (ch.is_numeric() || ch == '.') {
503                result.push(ch);
504                self.advance();
505            } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
506                // Handle scientific notation
507                result.push(ch);
508                self.advance();
509                let _ = has_e; // We don't allow multiple 'e' characters, so break after this
510
511                // Check for optional sign after 'e'
512                if let Some(sign) = self.current_char {
513                    if sign == '+' || sign == '-' {
514                        result.push(sign);
515                        self.advance();
516                    }
517                }
518
519                // Read exponent digits
520                while let Some(digit) = self.current_char {
521                    if digit.is_numeric() {
522                        result.push(digit);
523                        self.advance();
524                    } else {
525                        break;
526                    }
527                }
528                break; // Done reading the number
529            } else {
530                break;
531            }
532        }
533        result
534    }
535
536    /// Get next token while preserving comments as tokens
537    /// This is the new behavior for comment-aware formatting
538    pub fn next_token_with_comments(&mut self) -> Token {
539        // Only skip whitespace, NOT comments
540        self.skip_whitespace();
541
542        match self.current_char {
543            None => Token::Eof,
544            // Handle comments as tokens
545            Some('-') if self.peek(1) == Some('-') => {
546                let comment_text = self.read_line_comment();
547                Token::LineComment(comment_text)
548            }
549            Some('/') if self.peek(1) == Some('*') => {
550                let comment_text = self.read_block_comment();
551                Token::BlockComment(comment_text)
552            }
553            Some('*') => {
554                self.advance();
555                Token::Star
556            }
557            Some('+') => {
558                self.advance();
559                Token::Plus
560            }
561            Some('/') => {
562                // Regular division (comment case handled above)
563                self.advance();
564                Token::Divide
565            }
566            Some('%') => {
567                self.advance();
568                Token::Modulo
569            }
570            Some('.') => {
571                self.advance();
572                Token::Dot
573            }
574            Some(',') => {
575                self.advance();
576                Token::Comma
577            }
578            Some(':') => {
579                self.advance();
580                Token::Colon
581            }
582            Some('(') => {
583                self.advance();
584                Token::LeftParen
585            }
586            Some(')') => {
587                self.advance();
588                Token::RightParen
589            }
590            Some('=') => {
591                self.advance();
592                Token::Equal
593            }
594            Some('<') => {
595                self.advance();
596                if self.current_char == Some('=') {
597                    self.advance();
598                    Token::LessThanOrEqual
599                } else if self.current_char == Some('>') {
600                    self.advance();
601                    Token::NotEqual
602                } else {
603                    Token::LessThan
604                }
605            }
606            Some('>') => {
607                self.advance();
608                if self.current_char == Some('=') {
609                    self.advance();
610                    Token::GreaterThanOrEqual
611                } else {
612                    Token::GreaterThan
613                }
614            }
615            Some('!') if self.peek(1) == Some('=') => {
616                self.advance();
617                self.advance();
618                Token::NotEqual
619            }
620            Some('|') if self.peek(1) == Some('|') => {
621                self.advance();
622                self.advance();
623                Token::Concat
624            }
625            Some('"') => {
626                let ident_val = self.read_string();
627                Token::QuotedIdentifier(ident_val)
628            }
629            Some('$') => {
630                if self.peek_string(6) == "$JSON$" {
631                    let json_content = self.read_json_block();
632                    Token::JsonBlock(json_content)
633                } else {
634                    let ident = self.read_identifier();
635                    Token::Identifier(ident)
636                }
637            }
638            Some('\'') => {
639                let string_val = self.read_string();
640                Token::StringLiteral(string_val)
641            }
642            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
643                self.advance();
644                let num = self.read_number();
645                Token::NumberLiteral(format!("-{num}"))
646            }
647            Some('-') => {
648                self.advance();
649                Token::Minus
650            }
651            Some(ch) if ch.is_numeric() => {
652                let num = self.read_number();
653                Token::NumberLiteral(num)
654            }
655            Some('#') => {
656                self.advance();
657                let table_name = self.read_identifier();
658                if table_name.is_empty() {
659                    Token::Identifier("#".to_string())
660                } else {
661                    Token::Identifier(format!("#{}", table_name))
662                }
663            }
664            Some(ch) if ch.is_alphabetic() || ch == '_' => {
665                let ident = self.read_identifier();
666                // Handle multi-word keywords like GROUP BY and ORDER BY
667                match ident.to_uppercase().as_str() {
668                    "ORDER" if self.peek_keyword("BY") => {
669                        self.skip_whitespace();
670                        self.read_identifier(); // consume "BY"
671                        Token::OrderBy
672                    }
673                    "GROUP" if self.peek_keyword("BY") => {
674                        self.skip_whitespace();
675                        self.read_identifier(); // consume "BY"
676                        Token::GroupBy
677                    }
678                    _ => Token::from_keyword(&ident).unwrap_or_else(|| Token::Identifier(ident)),
679                }
680            }
681            Some(ch) => {
682                self.advance();
683                Token::Identifier(ch.to_string())
684            }
685        }
686    }
687
688    /// Get next token - dispatches based on lexer mode
689    pub fn next_token(&mut self) -> Token {
690        match self.mode {
691            LexerMode::SkipComments => self.next_token_skip_comments(),
692            LexerMode::PreserveComments => self.next_token_with_comments(),
693        }
694    }
695
696    /// Get next token skipping comments (original behavior)
697    fn next_token_skip_comments(&mut self) -> Token {
698        self.skip_whitespace_and_comments();
699
700        match self.current_char {
701            None => Token::Eof,
702            Some('*') => {
703                self.advance();
704                // Context-sensitive: could be SELECT * or multiplication
705                // The parser will distinguish based on context
706                Token::Star // We'll handle multiplication in parser
707            }
708            Some('+') => {
709                self.advance();
710                Token::Plus
711            }
712            Some('/') => {
713                // Check if this is a comment start
714                if self.peek(1) == Some('*') {
715                    // This shouldn't happen as comments are skipped above,
716                    // but handle it just in case
717                    self.skip_whitespace_and_comments();
718                    return self.next_token();
719                }
720                self.advance();
721                Token::Divide
722            }
723            Some('%') => {
724                self.advance();
725                Token::Modulo
726            }
727            Some('.') => {
728                self.advance();
729                Token::Dot
730            }
731            Some(',') => {
732                self.advance();
733                Token::Comma
734            }
735            Some(':') => {
736                self.advance();
737                Token::Colon
738            }
739            Some('(') => {
740                self.advance();
741                Token::LeftParen
742            }
743            Some(')') => {
744                self.advance();
745                Token::RightParen
746            }
747            Some('=') => {
748                self.advance();
749                Token::Equal
750            }
751            Some('<') => {
752                self.advance();
753                if self.current_char == Some('=') {
754                    self.advance();
755                    Token::LessThanOrEqual
756                } else if self.current_char == Some('>') {
757                    self.advance();
758                    Token::NotEqual
759                } else {
760                    Token::LessThan
761                }
762            }
763            Some('>') => {
764                self.advance();
765                if self.current_char == Some('=') {
766                    self.advance();
767                    Token::GreaterThanOrEqual
768                } else {
769                    Token::GreaterThan
770                }
771            }
772            Some('!') if self.peek(1) == Some('=') => {
773                self.advance();
774                self.advance();
775                Token::NotEqual
776            }
777            Some('|') if self.peek(1) == Some('|') => {
778                self.advance();
779                self.advance();
780                Token::Concat
781            }
782            Some('"') => {
783                // Double quotes = identifier
784                let ident_val = self.read_string();
785                Token::QuotedIdentifier(ident_val)
786            }
787            Some('$') => {
788                // Check if this is $JSON$ delimiter
789                if self.peek_string(6) == "$JSON$" {
790                    let json_content = self.read_json_block();
791                    Token::JsonBlock(json_content)
792                } else {
793                    // Not a JSON block, could be part of identifier or parameter
794                    // For now, treat as identifier start
795                    let ident = self.read_identifier();
796                    Token::Identifier(ident)
797                }
798            }
799            Some('\'') => {
800                // Single quotes = string literal
801                let string_val = self.read_string();
802                Token::StringLiteral(string_val)
803            }
804            Some('-') if self.peek(1) == Some('-') => {
805                // This is a comment, skip it and get next token
806                self.skip_whitespace_and_comments();
807                self.next_token()
808            }
809            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
810                // Handle negative numbers
811                self.advance(); // skip '-'
812                let num = self.read_number();
813                Token::NumberLiteral(format!("-{num}"))
814            }
815            Some('-') => {
816                // Handle subtraction operator
817                self.advance();
818                Token::Minus
819            }
820            Some(ch) if ch.is_numeric() => {
821                let num = self.read_number();
822                Token::NumberLiteral(num)
823            }
824            Some('#') => {
825                // Temporary table identifier: #tablename
826                self.advance(); // consume #
827                let table_name = self.read_identifier();
828                if table_name.is_empty() {
829                    // Just # by itself
830                    Token::Identifier("#".to_string())
831                } else {
832                    // #tablename
833                    Token::Identifier(format!("#{}", table_name))
834                }
835            }
836            Some(ch) if ch.is_alphabetic() || ch == '_' => {
837                let ident = self.read_identifier();
838                match ident.to_uppercase().as_str() {
839                    "SELECT" => Token::Select,
840                    "FROM" => Token::From,
841                    "WHERE" => Token::Where,
842                    "WITH" => Token::With,
843                    "AND" => Token::And,
844                    "OR" => Token::Or,
845                    "IN" => Token::In,
846                    "NOT" => Token::Not,
847                    "BETWEEN" => Token::Between,
848                    "LIKE" => Token::Like,
849                    "ILIKE" => Token::ILike,
850                    "IS" => Token::Is,
851                    "NULL" => Token::Null,
852                    "ORDER" if self.peek_keyword("BY") => {
853                        self.skip_whitespace();
854                        self.read_identifier(); // consume "BY"
855                        Token::OrderBy
856                    }
857                    "GROUP" if self.peek_keyword("BY") => {
858                        self.skip_whitespace();
859                        self.read_identifier(); // consume "BY"
860                        Token::GroupBy
861                    }
862                    "HAVING" => Token::Having,
863                    "QUALIFY" => Token::Qualify,
864                    "AS" => Token::As,
865                    "ASC" => Token::Asc,
866                    "DESC" => Token::Desc,
867                    "LIMIT" => Token::Limit,
868                    "OFFSET" => Token::Offset,
869                    "INTO" => Token::Into,
870                    "DATETIME" => Token::DateTime,
871                    "CASE" => Token::Case,
872                    "WHEN" => Token::When,
873                    "THEN" => Token::Then,
874                    "ELSE" => Token::Else,
875                    "END" => Token::End,
876                    "DISTINCT" => Token::Distinct,
877                    "EXCLUDE" => Token::Exclude,
878                    "PIVOT" => Token::Pivot,
879                    "UNPIVOT" => Token::Unpivot,
880                    "FOR" => Token::For,
881                    "OVER" => Token::Over,
882                    "PARTITION" => Token::Partition,
883                    "BY" => Token::By,
884                    // Window frame keywords
885                    "ROWS" => Token::Rows,
886                    // Note: RANGE is context-sensitive - it's both a window frame keyword and a table function
887                    // We'll handle this in the parser based on context
888                    "UNBOUNDED" => Token::Unbounded,
889                    "PRECEDING" => Token::Preceding,
890                    "FOLLOWING" => Token::Following,
891                    "CURRENT" => Token::Current,
892                    "ROW" => Token::Row,
893                    // Set operation keywords
894                    "UNION" => Token::Union,
895                    "INTERSECT" => Token::Intersect,
896                    "EXCEPT" => Token::Except,
897                    // Special CTE keywords
898                    "WEB" => Token::Web,
899                    "FILE" => Token::File,
900                    // Row expansion functions
901                    "UNNEST" => Token::Unnest,
902                    // JOIN keywords
903                    "JOIN" => Token::Join,
904                    "INNER" => Token::Inner,
905                    "LEFT" => Token::Left,
906                    "RIGHT" => Token::Right,
907                    "FULL" => Token::Full,
908                    "OUTER" => Token::Outer,
909                    "ON" => Token::On,
910                    "CROSS" => Token::Cross,
911                    _ => Token::Identifier(ident),
912                }
913            }
914            Some(ch) => {
915                self.advance();
916                Token::Identifier(ch.to_string())
917            }
918        }
919    }
920
921    fn peek_keyword(&mut self, keyword: &str) -> bool {
922        let saved_pos = self.position;
923        let saved_char = self.current_char;
924
925        self.skip_whitespace_and_comments();
926        let next_word = self.read_identifier();
927        let matches = next_word.to_uppercase() == keyword;
928
929        // Restore position
930        self.position = saved_pos;
931        self.current_char = saved_char;
932
933        matches
934    }
935
    /// Returns the lexer's current position within the source text.
    ///
    /// Useful to callers that need to record where a token started
    /// (see `tokenize_all_with_positions`) or report error locations.
    #[must_use]
    pub fn get_position(&self) -> usize {
        self.position
    }
940
941    pub fn tokenize_all(&mut self) -> Vec<Token> {
942        let mut tokens = Vec::new();
943        loop {
944            let token = self.next_token();
945            if matches!(token, Token::Eof) {
946                tokens.push(token);
947                break;
948            }
949            tokens.push(token);
950        }
951        tokens
952    }
953
954    pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
955        let mut tokens = Vec::new();
956        loop {
957            self.skip_whitespace_and_comments();
958            let start_pos = self.position;
959            let token = self.next_token();
960            let end_pos = self.position;
961
962            if matches!(token, Token::Eof) {
963                break;
964            }
965            tokens.push((start_pos, end_pos, token));
966        }
967        tokens
968    }
969
970    /// Tokenize all tokens including comments
971    /// This is useful for formatting tools that need to preserve comments
972    pub fn tokenize_all_with_comments(&mut self) -> Vec<Token> {
973        let mut tokens = Vec::new();
974        loop {
975            let token = self.next_token_with_comments();
976            if matches!(token, Token::Eof) {
977                tokens.push(token);
978                break;
979            }
980            tokens.push(token);
981        }
982        tokens
983    }
984}
985
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `sql` with comments preserved and returns every token.
    fn lex_with_comments(sql: &str) -> Vec<Token> {
        Lexer::new(sql).tokenize_all_with_comments()
    }

    #[test]
    fn test_line_comment_tokenization() {
        let tokens = lex_with_comments("SELECT col1, -- this is a comment\ncol2 FROM table");

        let comment = tokens
            .iter()
            .find_map(|t| match t {
                Token::LineComment(text) => Some(text),
                _ => None,
            })
            .expect("Should find line comment token");
        assert_eq!(comment.trim(), "this is a comment");
    }

    #[test]
    fn test_block_comment_tokenization() {
        let tokens = lex_with_comments("SELECT /* block comment */ col1 FROM table");

        let comment = tokens
            .iter()
            .find_map(|t| match t {
                Token::BlockComment(text) => Some(text),
                _ => None,
            })
            .expect("Should find block comment token");
        assert_eq!(comment.trim(), "block comment");
    }

    #[test]
    fn test_multiple_comments() {
        let sql = "-- First comment\nSELECT col1, /* inline */ col2\n-- Second comment\nFROM table";
        let tokens = lex_with_comments(sql);

        let line_count = tokens
            .iter()
            .filter(|t| matches!(t, Token::LineComment(_)))
            .count();
        let block_count = tokens
            .iter()
            .filter(|t| matches!(t, Token::BlockComment(_)))
            .count();

        assert_eq!(line_count, 2, "Should find 2 line comments");
        assert_eq!(block_count, 1, "Should find 1 block comment");
    }

    #[test]
    fn test_backwards_compatibility() {
        // next_token() (default mode) must keep skipping comments.
        let mut lexer = Lexer::new("SELECT -- comment\ncol1 FROM table");
        let tokens = lexer.tokenize_all();

        assert!(
            tokens
                .iter()
                .all(|t| !matches!(t, Token::LineComment(_) | Token::BlockComment(_))),
            "next_token() should skip comments for backwards compatibility"
        );

        // The surrounding query still tokenizes normally.
        assert!(tokens.iter().any(|t| matches!(t, Token::Select)));
        assert!(tokens.iter().any(|t| matches!(t, Token::From)));
    }

    // ===== Dual-Mode Lexer Tests (Phase 1) =====

    #[test]
    fn test_lexer_mode_skip_comments() {
        // SkipComments mode (default behavior): the comment vanishes.
        let mut lexer =
            Lexer::with_mode("SELECT id -- comment\nFROM table", LexerMode::SkipComments);

        let expected = [
            Token::Select,
            Token::Identifier("id".into()),
            Token::From,
            Token::Identifier("table".into()),
            Token::Eof,
        ];
        for want in expected {
            assert_eq!(lexer.next_token(), want);
        }
    }

    #[test]
    fn test_lexer_mode_preserve_comments() {
        // PreserveComments mode: the comment becomes a real token.
        let mut lexer =
            Lexer::with_mode("SELECT id -- comment\nFROM table", LexerMode::PreserveComments);

        assert_eq!(lexer.next_token(), Token::Select);
        assert_eq!(lexer.next_token(), Token::Identifier("id".into()));

        match lexer.next_token() {
            Token::LineComment(text) => assert_eq!(text.trim(), "comment"),
            other => panic!("expected line comment, got {:?}", other),
        }

        assert_eq!(lexer.next_token(), Token::From);
        assert_eq!(lexer.next_token(), Token::Identifier("table".into()));
        assert_eq!(lexer.next_token(), Token::Eof);
    }

    #[test]
    fn test_lexer_mode_default_is_skip() {
        // A lexer built with new() must behave like SkipComments mode.
        let mut lexer = Lexer::new("SELECT id -- comment\nFROM table");

        let mut seen = Vec::new();
        loop {
            match lexer.next_token() {
                Token::Eof => break,
                tok => {
                    assert!(!matches!(
                        tok,
                        Token::LineComment(_) | Token::BlockComment(_)
                    ));
                    seen.push(tok);
                }
            }
        }

        // SELECT, id, FROM, table = 4 tokens (no comment)
        assert_eq!(seen.len(), 4);
    }

    #[test]
    fn test_lexer_mode_block_comments() {
        let sql = "SELECT /* block */ id FROM table";

        // SkipComments: the block comment is invisible.
        let mut skipping = Lexer::with_mode(sql, LexerMode::SkipComments);
        for want in [Token::Select, Token::Identifier("id".into()), Token::From] {
            assert_eq!(skipping.next_token(), want);
        }

        // PreserveComments: the comment surfaces between SELECT and id.
        let mut preserving = Lexer::with_mode(sql, LexerMode::PreserveComments);
        assert_eq!(preserving.next_token(), Token::Select);

        match preserving.next_token() {
            Token::BlockComment(text) => assert_eq!(text.trim(), "block"),
            other => panic!("expected block comment, got {:?}", other),
        }

        assert_eq!(preserving.next_token(), Token::Identifier("id".into()));
    }

    #[test]
    fn test_lexer_mode_mixed_comments() {
        let mut lexer = Lexer::with_mode(
            "-- leading\nSELECT /* inline */ id -- trailing\nFROM table",
            LexerMode::PreserveComments,
        );

        assert!(matches!(lexer.next_token(), Token::LineComment(_)), "leading");
        assert_eq!(lexer.next_token(), Token::Select);
        assert!(matches!(lexer.next_token(), Token::BlockComment(_)), "inline");
        assert_eq!(lexer.next_token(), Token::Identifier("id".into()));
        assert!(matches!(lexer.next_token(), Token::LineComment(_)), "trailing");
        assert_eq!(lexer.next_token(), Token::From);
        assert_eq!(lexer.next_token(), Token::Identifier("table".into()));
        assert_eq!(lexer.next_token(), Token::Eof);
    }

    #[test]
    fn test_pivot_keywords() {
        let mut lexer = Lexer::new("PIVOT (MAX(amount) FOR month IN (val1, val2)) UNPIVOT");
        // Small pump so each expectation reads as a single line.
        let mut next = || lexer.next_token();

        assert_eq!(next(), Token::Pivot, "First token should be PIVOT");
        assert_eq!(next(), Token::LeftParen);
        assert!(matches!(next(), Token::Identifier(_))); // MAX
        assert_eq!(next(), Token::LeftParen);
        assert!(matches!(next(), Token::Identifier(_))); // amount
        assert_eq!(next(), Token::RightParen);
        assert_eq!(next(), Token::For, "Should tokenize FOR");
        assert!(matches!(next(), Token::Identifier(_))); // month
        assert_eq!(next(), Token::In, "Should tokenize IN");
        assert_eq!(next(), Token::LeftParen);
        assert!(matches!(next(), Token::Identifier(_))); // val1
        assert_eq!(next(), Token::Comma);
        assert!(matches!(next(), Token::Identifier(_))); // val2
        assert_eq!(next(), Token::RightParen);
        assert_eq!(next(), Token::RightParen);
        assert_eq!(next(), Token::Unpivot, "Should tokenize UNPIVOT");
        assert_eq!(next(), Token::Eof);
    }
}