// sql_cli/sql/parser/lexer.rs

1//! SQL Lexer - Tokenization of SQL queries
2//!
3//! This module handles the conversion of raw SQL text into tokens
4//! that can be consumed by the parser.
5
/// Lexer mode - controls whether comments are preserved or skipped.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LexerMode {
    /// Standard mode - skip comments (the default behavior).
    #[default]
    SkipComments,
    /// Preserve mode - tokenize comments as `LineComment` / `BlockComment` tokens.
    PreserveComments,
}
20
/// A single lexical token produced by [`Lexer`].
///
/// `Eq` is derived in addition to `PartialEq` (all payloads are `String`s),
/// so tokens can be used in hash-based collections and exhaustive equality
/// contexts.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    With, // WITH clause for CTEs
    And,
    Or,
    In,
    Not,
    Between,
    Like,
    Is,
    Null,
    OrderBy,
    GroupBy,
    Having,
    Qualify,
    As,
    Asc,
    Desc,
    Limit,
    Offset,
    Into,      // INTO keyword for temporary tables
    DateTime,  // DateTime constructor
    Case,      // CASE expression
    When,      // WHEN clause
    Then,      // THEN clause
    Else,      // ELSE clause
    End,       // END keyword
    Distinct,  // DISTINCT keyword for aggregate functions
    Over,      // OVER keyword for window functions
    Partition, // PARTITION keyword for window functions
    By,        // BY keyword (used with PARTITION BY, ORDER BY)

    // Window frame keywords
    Rows,      // ROWS frame type
    Range,     // RANGE frame type
    Unbounded, // UNBOUNDED for frame bounds
    Preceding, // PRECEDING for frame bounds
    Following, // FOLLOWING for frame bounds
    Current,   // CURRENT for CURRENT ROW
    Row,       // ROW for CURRENT ROW

    // Set operation keywords
    Union,     // UNION
    Intersect, // INTERSECT
    Except,    // EXCEPT

    // Special CTE keyword
    Web, // WEB (for WEB CTEs)

    // Row expansion functions
    Unnest, // UNNEST (for expanding delimited strings into rows)

    // JOIN keywords
    Join,  // JOIN keyword
    Inner, // INNER JOIN
    Left,  // LEFT JOIN
    Right, // RIGHT JOIN
    Full,  // FULL JOIN
    Outer, // OUTER keyword (LEFT OUTER, RIGHT OUTER, FULL OUTER)
    On,    // ON keyword for join conditions
    Cross, // CROSS JOIN

    // Literals
    Identifier(String),
    QuotedIdentifier(String), // For "Customer Id" style identifiers
    StringLiteral(String),
    JsonBlock(String), // For $JSON$...$JSON$ delimited blocks
    NumberLiteral(String),
    Star,

    // Operators
    Dot,
    Comma,
    Colon,
    LeftParen,
    RightParen,
    Equal,
    NotEqual,
    LessThan,
    GreaterThan,
    LessThanOrEqual,
    GreaterThanOrEqual,

    // Arithmetic operators
    Plus,
    Minus,
    Divide,
    Modulo,

    // String operators
    Concat, // || for string concatenation

    // Comments (preserved for formatting)
    LineComment(String),  // -- comment text (without the -- prefix)
    BlockComment(String), // /* comment text */ (without delimiters)

    // Special
    Eof,
}
124
125impl Token {
126    /// Check if a string is a SQL keyword and return corresponding token
127    pub fn from_keyword(s: &str) -> Option<Token> {
128        match s.to_uppercase().as_str() {
129            "SELECT" => Some(Token::Select),
130            "FROM" => Some(Token::From),
131            "WHERE" => Some(Token::Where),
132            "WITH" => Some(Token::With),
133            "AND" => Some(Token::And),
134            "OR" => Some(Token::Or),
135            "IN" => Some(Token::In),
136            "NOT" => Some(Token::Not),
137            "BETWEEN" => Some(Token::Between),
138            "LIKE" => Some(Token::Like),
139            "IS" => Some(Token::Is),
140            "NULL" => Some(Token::Null),
141            "ORDER" => Some(Token::OrderBy),
142            "GROUP" => Some(Token::GroupBy),
143            "HAVING" => Some(Token::Having),
144            "QUALIFY" => Some(Token::Qualify),
145            "AS" => Some(Token::As),
146            "ASC" => Some(Token::Asc),
147            "DESC" => Some(Token::Desc),
148            "LIMIT" => Some(Token::Limit),
149            "OFFSET" => Some(Token::Offset),
150            "INTO" => Some(Token::Into),
151            "DISTINCT" => Some(Token::Distinct),
152            "CASE" => Some(Token::Case),
153            "WHEN" => Some(Token::When),
154            "THEN" => Some(Token::Then),
155            "ELSE" => Some(Token::Else),
156            "END" => Some(Token::End),
157            "OVER" => Some(Token::Over),
158            "PARTITION" => Some(Token::Partition),
159            "BY" => Some(Token::By),
160            "ROWS" => Some(Token::Rows),
161            "RANGE" => Some(Token::Range),
162            "UNBOUNDED" => Some(Token::Unbounded),
163            "PRECEDING" => Some(Token::Preceding),
164            "FOLLOWING" => Some(Token::Following),
165            "CURRENT" => Some(Token::Current),
166            "ROW" => Some(Token::Row),
167            "UNION" => Some(Token::Union),
168            "INTERSECT" => Some(Token::Intersect),
169            "EXCEPT" => Some(Token::Except),
170            "WEB" => Some(Token::Web),
171            "UNNEST" => Some(Token::Unnest),
172            "JOIN" => Some(Token::Join),
173            "INNER" => Some(Token::Inner),
174            "LEFT" => Some(Token::Left),
175            "RIGHT" => Some(Token::Right),
176            "FULL" => Some(Token::Full),
177            "OUTER" => Some(Token::Outer),
178            "ON" => Some(Token::On),
179            "CROSS" => Some(Token::Cross),
180            _ => None,
181        }
182    }
183
184    /// Check if token is a logical operator
185    pub fn is_logical_operator(&self) -> bool {
186        matches!(self, Token::And | Token::Or)
187    }
188
189    /// Check if token is a join type
190    pub fn is_join_type(&self) -> bool {
191        matches!(
192            self,
193            Token::Inner | Token::Left | Token::Right | Token::Full | Token::Cross
194        )
195    }
196
197    /// Check if token ends a clause
198    pub fn is_clause_terminator(&self) -> bool {
199        matches!(
200            self,
201            Token::OrderBy
202                | Token::GroupBy
203                | Token::Having
204                | Token::Limit
205                | Token::Offset
206                | Token::Union
207                | Token::Intersect
208                | Token::Except
209        )
210    }
211
212    /// Get the string representation of a keyword token
213    /// Returns the keyword as it would appear in SQL (uppercase)
214    pub fn as_keyword_str(&self) -> Option<&'static str> {
215        match self {
216            Token::Select => Some("SELECT"),
217            Token::From => Some("FROM"),
218            Token::Where => Some("WHERE"),
219            Token::With => Some("WITH"),
220            Token::And => Some("AND"),
221            Token::Or => Some("OR"),
222            Token::In => Some("IN"),
223            Token::Not => Some("NOT"),
224            Token::Between => Some("BETWEEN"),
225            Token::Like => Some("LIKE"),
226            Token::Is => Some("IS"),
227            Token::Null => Some("NULL"),
228            Token::OrderBy => Some("ORDER BY"),
229            Token::GroupBy => Some("GROUP BY"),
230            Token::Having => Some("HAVING"),
231            Token::Qualify => Some("QUALIFY"),
232            Token::As => Some("AS"),
233            Token::Asc => Some("ASC"),
234            Token::Desc => Some("DESC"),
235            Token::Limit => Some("LIMIT"),
236            Token::Offset => Some("OFFSET"),
237            Token::Into => Some("INTO"),
238            Token::Distinct => Some("DISTINCT"),
239            Token::Case => Some("CASE"),
240            Token::When => Some("WHEN"),
241            Token::Then => Some("THEN"),
242            Token::Else => Some("ELSE"),
243            Token::End => Some("END"),
244            Token::Join => Some("JOIN"),
245            Token::Inner => Some("INNER"),
246            Token::Left => Some("LEFT"),
247            Token::Right => Some("RIGHT"),
248            Token::Full => Some("FULL"),
249            Token::Cross => Some("CROSS"),
250            Token::On => Some("ON"),
251            Token::Union => Some("UNION"),
252            Token::Intersect => Some("INTERSECT"),
253            Token::Except => Some("EXCEPT"),
254            Token::Over => Some("OVER"),
255            Token::Partition => Some("PARTITION"),
256            Token::By => Some("BY"),
257            Token::Rows => Some("ROWS"),
258            Token::Range => Some("RANGE"),
259            Token::Preceding => Some("PRECEDING"),
260            Token::Following => Some("FOLLOWING"),
261            Token::Current => Some("CURRENT"),
262            Token::Row => Some("ROW"),
263            Token::Unbounded => Some("UNBOUNDED"),
264            Token::DateTime => Some("DATETIME"),
265            _ => None,
266        }
267    }
268}
269
/// Streaming lexer over a SQL string.
///
/// The input is pre-decoded into a `Vec<char>` so lookahead (`peek`,
/// `peek_string`) is O(1) per character; all positions are char indices,
/// not byte offsets.
#[derive(Debug, Clone)]
pub struct Lexer {
    input: Vec<char>,           // full input, decoded to chars up front
    position: usize,            // char index of `current_char`
    current_char: Option<char>, // char at `position`; None once input is exhausted
    mode: LexerMode,            // skip comments vs. emit them as tokens
}
277
278impl Lexer {
    /// Create a lexer in the default mode (comments are skipped).
    #[must_use]
    pub fn new(input: &str) -> Self {
        Self::with_mode(input, LexerMode::default())
    }
283
284    /// Create a new lexer with specified mode
285    #[must_use]
286    pub fn with_mode(input: &str, mode: LexerMode) -> Self {
287        let chars: Vec<char> = input.chars().collect();
288        let current = chars.first().copied();
289        Self {
290            input: chars,
291            position: 0,
292            current_char: current,
293            mode,
294        }
295    }
296
297    fn advance(&mut self) {
298        self.position += 1;
299        self.current_char = self.input.get(self.position).copied();
300    }
301
302    fn peek(&self, offset: usize) -> Option<char> {
303        self.input.get(self.position + offset).copied()
304    }
305
306    /// Peek ahead n characters and return as a string
307    fn peek_string(&self, n: usize) -> String {
308        let mut result = String::new();
309        for i in 0..n {
310            if let Some(ch) = self.input.get(self.position + i) {
311                result.push(*ch);
312            } else {
313                break;
314            }
315        }
316        result
317    }
318
    /// Read a JSON block delimited by $JSON$...$JSON$
    /// Consumes the opening delimiter and reads until closing $JSON$
    ///
    /// Precondition: the caller has already verified (via `peek_string(6)`)
    /// that the current position is at the opening "$JSON$"; the first loop
    /// below blindly consumes those 6 chars.
    /// If no closing delimiter is found, the rest of the input is returned
    /// as the block content (no error is reported).
    fn read_json_block(&mut self) -> String {
        let mut result = String::new();

        // Skip opening $JSON$ (6 characters)
        for _ in 0..6 {
            self.advance();
        }

        // Read until we find closing $JSON$
        while let Some(ch) = self.current_char {
            // Check if we're at the closing delimiter
            // (`peek_string` starts at the current char, so this tests the
            // next 6 chars including `ch` itself)
            if ch == '$' && self.peek_string(6) == "$JSON$" {
                // Skip closing $JSON$
                for _ in 0..6 {
                    self.advance();
                }
                break;
            }
            result.push(ch);
            self.advance();
        }

        result
    }
345
346    fn skip_whitespace(&mut self) {
347        while let Some(ch) = self.current_char {
348            if ch.is_whitespace() {
349                self.advance();
350            } else {
351                break;
352            }
353        }
354    }
355
356    /// Read a line comment and return its content (without the -- prefix)
357    fn read_line_comment(&mut self) -> String {
358        let mut result = String::new();
359
360        // Skip '--'
361        self.advance();
362        self.advance();
363
364        // Read until end of line or EOF
365        while let Some(ch) = self.current_char {
366            if ch == '\n' {
367                self.advance(); // consume the newline
368                break;
369            }
370            result.push(ch);
371            self.advance();
372        }
373
374        result
375    }
376
377    /// Read a block comment and return its content (without /* */ delimiters)
378    fn read_block_comment(&mut self) -> String {
379        let mut result = String::new();
380
381        // Skip '/*'
382        self.advance();
383        self.advance();
384
385        // Read until we find '*/'
386        while let Some(ch) = self.current_char {
387            if ch == '*' && self.peek(1) == Some('/') {
388                self.advance(); // skip '*'
389                self.advance(); // skip '/'
390                break;
391            }
392            result.push(ch);
393            self.advance();
394        }
395
396        result
397    }
398
    /// Skip whitespace and comments (for backwards compatibility with parser)
    /// This is the old behavior that discards comments
    ///
    /// Loops because whitespace and comments can alternate freely
    /// (e.g. "  -- a\n  /* b */  SELECT"). An unterminated block comment
    /// silently consumes the rest of the input.
    fn skip_whitespace_and_comments(&mut self) {
        loop {
            // Skip whitespace
            while let Some(ch) = self.current_char {
                if ch.is_whitespace() {
                    self.advance();
                } else {
                    break;
                }
            }

            // Check for comments
            match self.current_char {
                Some('-') if self.peek(1) == Some('-') => {
                    // Single-line comment: skip until end of line
                    self.advance(); // skip first '-'
                    self.advance(); // skip second '-'
                    while let Some(ch) = self.current_char {
                        // advance first so the newline itself is consumed too
                        self.advance();
                        if ch == '\n' {
                            break;
                        }
                    }
                }
                Some('/') if self.peek(1) == Some('*') => {
                    // Multi-line comment: skip until */
                    self.advance(); // skip '/'
                    self.advance(); // skip '*'
                    while let Some(ch) = self.current_char {
                        if ch == '*' && self.peek(1) == Some('/') {
                            self.advance(); // skip '*'
                            self.advance(); // skip '/'
                            break;
                        }
                        self.advance();
                    }
                }
                _ => {
                    // No more comments or whitespace
                    break;
                }
            }
        }
    }
445
446    fn read_identifier(&mut self) -> String {
447        let mut result = String::new();
448        while let Some(ch) = self.current_char {
449            if ch.is_alphanumeric() || ch == '_' {
450                result.push(ch);
451                self.advance();
452            } else {
453                break;
454            }
455        }
456        result
457    }
458
459    fn read_string(&mut self) -> String {
460        let mut result = String::new();
461        let quote_char = self.current_char.unwrap(); // ' or "
462        self.advance(); // skip opening quote
463
464        while let Some(ch) = self.current_char {
465            if ch == quote_char {
466                self.advance(); // skip closing quote
467                break;
468            }
469            result.push(ch);
470            self.advance();
471        }
472        result
473    }
474
475    fn read_number(&mut self) -> String {
476        let mut result = String::new();
477        let has_e = false;
478
479        // Read the main number part (including decimal point)
480        while let Some(ch) = self.current_char {
481            if !has_e && (ch.is_numeric() || ch == '.') {
482                result.push(ch);
483                self.advance();
484            } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
485                // Handle scientific notation
486                result.push(ch);
487                self.advance();
488                let _ = has_e; // We don't allow multiple 'e' characters, so break after this
489
490                // Check for optional sign after 'e'
491                if let Some(sign) = self.current_char {
492                    if sign == '+' || sign == '-' {
493                        result.push(sign);
494                        self.advance();
495                    }
496                }
497
498                // Read exponent digits
499                while let Some(digit) = self.current_char {
500                    if digit.is_numeric() {
501                        result.push(digit);
502                        self.advance();
503                    } else {
504                        break;
505                    }
506                }
507                break; // Done reading the number
508            } else {
509                break;
510            }
511        }
512        result
513    }
514
    /// Get next token while preserving comments as tokens
    /// This is the new behavior for comment-aware formatting
    ///
    /// Unlike `next_token_skip_comments`, only whitespace is skipped here;
    /// `--` and `/* */` comments come back as `LineComment` / `BlockComment`
    /// tokens so a formatter can re-emit them.
    pub fn next_token_with_comments(&mut self) -> Token {
        // Only skip whitespace, NOT comments
        self.skip_whitespace();

        match self.current_char {
            None => Token::Eof,
            // Handle comments as tokens.
            // These two arms MUST come before the '-' (minus) and '/'
            // (divide) arms below, or comments would lex as operators.
            Some('-') if self.peek(1) == Some('-') => {
                let comment_text = self.read_line_comment();
                Token::LineComment(comment_text)
            }
            Some('/') if self.peek(1) == Some('*') => {
                let comment_text = self.read_block_comment();
                Token::BlockComment(comment_text)
            }
            // '*' is returned as Star for both SELECT * and multiplication;
            // the parser disambiguates by context.
            Some('*') => {
                self.advance();
                Token::Star
            }
            Some('+') => {
                self.advance();
                Token::Plus
            }
            Some('/') => {
                // Regular division (comment case handled above)
                self.advance();
                Token::Divide
            }
            Some('%') => {
                self.advance();
                Token::Modulo
            }
            Some('.') => {
                self.advance();
                Token::Dot
            }
            Some(',') => {
                self.advance();
                Token::Comma
            }
            Some(':') => {
                self.advance();
                Token::Colon
            }
            Some('(') => {
                self.advance();
                Token::LeftParen
            }
            Some(')') => {
                self.advance();
                Token::RightParen
            }
            Some('=') => {
                self.advance();
                Token::Equal
            }
            // '<' may start '<=', '<>' (not-equal) or plain '<'.
            Some('<') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::LessThanOrEqual
                } else if self.current_char == Some('>') {
                    self.advance();
                    Token::NotEqual
                } else {
                    Token::LessThan
                }
            }
            // '>' may start '>=' or plain '>'.
            Some('>') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::GreaterThanOrEqual
                } else {
                    Token::GreaterThan
                }
            }
            Some('!') if self.peek(1) == Some('=') => {
                self.advance();
                self.advance();
                Token::NotEqual
            }
            Some('|') if self.peek(1) == Some('|') => {
                self.advance();
                self.advance();
                Token::Concat
            }
            // Double quotes delimit identifiers ("Customer Id"), not strings.
            Some('"') => {
                let ident_val = self.read_string();
                Token::QuotedIdentifier(ident_val)
            }
            Some('$') => {
                if self.peek_string(6) == "$JSON$" {
                    let json_content = self.read_json_block();
                    Token::JsonBlock(json_content)
                } else {
                    let ident = self.read_identifier();
                    Token::Identifier(ident)
                }
            }
            // Single quotes delimit string literals.
            Some('\'') => {
                let string_val = self.read_string();
                Token::StringLiteral(string_val)
            }
            // '-' directly followed by a digit lexes as a negative literal.
            // NOTE(review): this fires even after an identifier ("a -1"
            // yields Identifier, NumberLiteral(-1), not a Minus) — the
            // parser must tolerate this; confirm intended.
            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
                self.advance();
                let num = self.read_number();
                Token::NumberLiteral(format!("-{num}"))
            }
            Some('-') => {
                self.advance();
                Token::Minus
            }
            Some(ch) if ch.is_numeric() => {
                let num = self.read_number();
                Token::NumberLiteral(num)
            }
            // '#name' is a temporary-table identifier; the '#' is kept in
            // the identifier text.
            Some('#') => {
                self.advance();
                let table_name = self.read_identifier();
                if table_name.is_empty() {
                    Token::Identifier("#".to_string())
                } else {
                    Token::Identifier(format!("#{}", table_name))
                }
            }
            Some(ch) if ch.is_alphabetic() || ch == '_' => {
                let ident = self.read_identifier();
                // Handle multi-word keywords like GROUP BY and ORDER BY
                match ident.to_uppercase().as_str() {
                    "ORDER" if self.peek_keyword("BY") => {
                        self.skip_whitespace();
                        self.read_identifier(); // consume "BY"
                        Token::OrderBy
                    }
                    "GROUP" if self.peek_keyword("BY") => {
                        self.skip_whitespace();
                        self.read_identifier(); // consume "BY"
                        Token::GroupBy
                    }
                    // Single-word keywords via the shared table; anything
                    // else is a plain identifier.
                    _ => Token::from_keyword(&ident).unwrap_or_else(|| Token::Identifier(ident)),
                }
            }
            // Unknown character: emitted as a one-char identifier rather
            // than an error token.
            Some(ch) => {
                self.advance();
                Token::Identifier(ch.to_string())
            }
        }
    }
666
667    /// Get next token - dispatches based on lexer mode
668    pub fn next_token(&mut self) -> Token {
669        match self.mode {
670            LexerMode::SkipComments => self.next_token_skip_comments(),
671            LexerMode::PreserveComments => self.next_token_with_comments(),
672        }
673    }
674
    /// Get next token skipping comments (original behavior)
    ///
    /// Whitespace and comments are discarded up front, then a single token
    /// is produced. The keyword table below is a hand-inlined copy of
    /// `Token::from_keyword`; NOTE(review): keep the two in sync (DATETIME
    /// is recognized here, and RANGE is deliberately NOT — it is
    /// context-sensitive and left to the parser).
    fn next_token_skip_comments(&mut self) -> Token {
        self.skip_whitespace_and_comments();

        match self.current_char {
            None => Token::Eof,
            Some('*') => {
                self.advance();
                // Context-sensitive: could be SELECT * or multiplication
                // The parser will distinguish based on context
                Token::Star // We'll handle multiplication in parser
            }
            Some('+') => {
                self.advance();
                Token::Plus
            }
            Some('/') => {
                // Check if this is a comment start
                if self.peek(1) == Some('*') {
                    // This shouldn't happen as comments are skipped above,
                    // but handle it just in case
                    self.skip_whitespace_and_comments();
                    return self.next_token();
                }
                self.advance();
                Token::Divide
            }
            Some('%') => {
                self.advance();
                Token::Modulo
            }
            Some('.') => {
                self.advance();
                Token::Dot
            }
            Some(',') => {
                self.advance();
                Token::Comma
            }
            Some(':') => {
                self.advance();
                Token::Colon
            }
            Some('(') => {
                self.advance();
                Token::LeftParen
            }
            Some(')') => {
                self.advance();
                Token::RightParen
            }
            Some('=') => {
                self.advance();
                Token::Equal
            }
            // '<' may start '<=', '<>' (not-equal) or plain '<'.
            Some('<') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::LessThanOrEqual
                } else if self.current_char == Some('>') {
                    self.advance();
                    Token::NotEqual
                } else {
                    Token::LessThan
                }
            }
            // '>' may start '>=' or plain '>'.
            Some('>') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::GreaterThanOrEqual
                } else {
                    Token::GreaterThan
                }
            }
            Some('!') if self.peek(1) == Some('=') => {
                self.advance();
                self.advance();
                Token::NotEqual
            }
            Some('|') if self.peek(1) == Some('|') => {
                self.advance();
                self.advance();
                Token::Concat
            }
            Some('"') => {
                // Double quotes = identifier
                let ident_val = self.read_string();
                Token::QuotedIdentifier(ident_val)
            }
            Some('$') => {
                // Check if this is $JSON$ delimiter
                if self.peek_string(6) == "$JSON$" {
                    let json_content = self.read_json_block();
                    Token::JsonBlock(json_content)
                } else {
                    // Not a JSON block, could be part of identifier or parameter
                    // For now, treat as identifier start
                    let ident = self.read_identifier();
                    Token::Identifier(ident)
                }
            }
            Some('\'') => {
                // Single quotes = string literal
                let string_val = self.read_string();
                Token::StringLiteral(string_val)
            }
            Some('-') if self.peek(1) == Some('-') => {
                // This is a comment, skip it and get next token
                // (defensive: skip_whitespace_and_comments above should have
                // consumed it already)
                self.skip_whitespace_and_comments();
                self.next_token()
            }
            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
                // Handle negative numbers
                // NOTE(review): fires even after an identifier ("a -1"
                // yields NumberLiteral(-1), not Minus) — confirm the parser
                // tolerates this.
                self.advance(); // skip '-'
                let num = self.read_number();
                Token::NumberLiteral(format!("-{num}"))
            }
            Some('-') => {
                // Handle subtraction operator
                self.advance();
                Token::Minus
            }
            Some(ch) if ch.is_numeric() => {
                let num = self.read_number();
                Token::NumberLiteral(num)
            }
            Some('#') => {
                // Temporary table identifier: #tablename
                self.advance(); // consume #
                let table_name = self.read_identifier();
                if table_name.is_empty() {
                    // Just # by itself
                    Token::Identifier("#".to_string())
                } else {
                    // #tablename
                    Token::Identifier(format!("#{}", table_name))
                }
            }
            Some(ch) if ch.is_alphabetic() || ch == '_' => {
                let ident = self.read_identifier();
                // Keyword lookup is case-insensitive.
                match ident.to_uppercase().as_str() {
                    "SELECT" => Token::Select,
                    "FROM" => Token::From,
                    "WHERE" => Token::Where,
                    "WITH" => Token::With,
                    "AND" => Token::And,
                    "OR" => Token::Or,
                    "IN" => Token::In,
                    "NOT" => Token::Not,
                    "BETWEEN" => Token::Between,
                    "LIKE" => Token::Like,
                    "IS" => Token::Is,
                    "NULL" => Token::Null,
                    // Multi-word keywords: only ORDER/GROUP immediately
                    // followed by BY collapse into a single token; a bare
                    // ORDER or GROUP falls through to Identifier.
                    "ORDER" if self.peek_keyword("BY") => {
                        self.skip_whitespace();
                        self.read_identifier(); // consume "BY"
                        Token::OrderBy
                    }
                    "GROUP" if self.peek_keyword("BY") => {
                        self.skip_whitespace();
                        self.read_identifier(); // consume "BY"
                        Token::GroupBy
                    }
                    "HAVING" => Token::Having,
                    "QUALIFY" => Token::Qualify,
                    "AS" => Token::As,
                    "ASC" => Token::Asc,
                    "DESC" => Token::Desc,
                    "LIMIT" => Token::Limit,
                    "OFFSET" => Token::Offset,
                    "INTO" => Token::Into,
                    "DATETIME" => Token::DateTime,
                    "CASE" => Token::Case,
                    "WHEN" => Token::When,
                    "THEN" => Token::Then,
                    "ELSE" => Token::Else,
                    "END" => Token::End,
                    "DISTINCT" => Token::Distinct,
                    "OVER" => Token::Over,
                    "PARTITION" => Token::Partition,
                    "BY" => Token::By,
                    // Window frame keywords
                    "ROWS" => Token::Rows,
                    // Note: RANGE is context-sensitive - it's both a window frame keyword and a table function
                    // We'll handle this in the parser based on context
                    "UNBOUNDED" => Token::Unbounded,
                    "PRECEDING" => Token::Preceding,
                    "FOLLOWING" => Token::Following,
                    "CURRENT" => Token::Current,
                    "ROW" => Token::Row,
                    // Set operation keywords
                    "UNION" => Token::Union,
                    "INTERSECT" => Token::Intersect,
                    "EXCEPT" => Token::Except,
                    // Special CTE keyword
                    "WEB" => Token::Web,
                    // Row expansion functions
                    "UNNEST" => Token::Unnest,
                    // JOIN keywords
                    "JOIN" => Token::Join,
                    "INNER" => Token::Inner,
                    "LEFT" => Token::Left,
                    "RIGHT" => Token::Right,
                    "FULL" => Token::Full,
                    "OUTER" => Token::Outer,
                    "ON" => Token::On,
                    "CROSS" => Token::Cross,
                    _ => Token::Identifier(ident),
                }
            }
            // Unknown character: emitted as a one-char identifier rather
            // than an error token.
            Some(ch) => {
                self.advance();
                Token::Identifier(ch.to_string())
            }
        }
    }
893
894    fn peek_keyword(&mut self, keyword: &str) -> bool {
895        let saved_pos = self.position;
896        let saved_char = self.current_char;
897
898        self.skip_whitespace_and_comments();
899        let next_word = self.read_identifier();
900        let matches = next_word.to_uppercase() == keyword;
901
902        // Restore position
903        self.position = saved_pos;
904        self.current_char = saved_char;
905
906        matches
907    }
908
    /// Returns the lexer's current offset into the input.
    ///
    /// Useful for callers that record token spans or save/restore scan
    /// progress. (Rust convention would name this `position`, but the
    /// `get_` prefix is kept to avoid breaking existing callers.)
    #[must_use]
    pub fn get_position(&self) -> usize {
        self.position
    }
913
914    pub fn tokenize_all(&mut self) -> Vec<Token> {
915        let mut tokens = Vec::new();
916        loop {
917            let token = self.next_token();
918            if matches!(token, Token::Eof) {
919                tokens.push(token);
920                break;
921            }
922            tokens.push(token);
923        }
924        tokens
925    }
926
927    pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
928        let mut tokens = Vec::new();
929        loop {
930            self.skip_whitespace_and_comments();
931            let start_pos = self.position;
932            let token = self.next_token();
933            let end_pos = self.position;
934
935            if matches!(token, Token::Eof) {
936                break;
937            }
938            tokens.push((start_pos, end_pos, token));
939        }
940        tokens
941    }
942
943    /// Tokenize all tokens including comments
944    /// This is useful for formatting tools that need to preserve comments
945    pub fn tokenize_all_with_comments(&mut self) -> Vec<Token> {
946        let mut tokens = Vec::new();
947        loop {
948            let token = self.next_token_with_comments();
949            if matches!(token, Token::Eof) {
950                tokens.push(token);
951                break;
952            }
953            tokens.push(token);
954        }
955        tokens
956    }
957}
958
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_line_comment_tokenization() {
        let sql = "SELECT col1, -- this is a comment\ncol2 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        // Pull out the text of the first line comment, if any.
        let comment_text = tokens.iter().find_map(|t| match t {
            Token::LineComment(text) => Some(text.clone()),
            _ => None,
        });

        let text = comment_text.expect("Should find line comment token");
        assert_eq!(text.trim(), "this is a comment");
    }

    #[test]
    fn test_block_comment_tokenization() {
        let sql = "SELECT /* block comment */ col1 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        // Pull out the text of the first block comment, if any.
        let comment_text = tokens.iter().find_map(|t| match t {
            Token::BlockComment(text) => Some(text.clone()),
            _ => None,
        });

        let text = comment_text.expect("Should find block comment token");
        assert_eq!(text.trim(), "block comment");
    }

    #[test]
    fn test_multiple_comments() {
        let sql = "-- First comment\nSELECT col1, /* inline */ col2\n-- Second comment\nFROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        let n_line = tokens
            .iter()
            .filter(|t| matches!(t, Token::LineComment(_)))
            .count();
        let n_block = tokens
            .iter()
            .filter(|t| matches!(t, Token::BlockComment(_)))
            .count();

        assert_eq!(n_line, 2, "Should find 2 line comments");
        assert_eq!(n_block, 1, "Should find 1 block comment");
    }

    #[test]
    fn test_backwards_compatibility() {
        // next_token() must keep skipping comments so existing callers
        // see no behavioral change.
        let sql = "SELECT -- comment\ncol1 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all();

        for tok in &tokens {
            assert!(
                !matches!(tok, Token::LineComment(_) | Token::BlockComment(_)),
                "next_token() should skip comments for backwards compatibility"
            );
        }

        // The significant tokens are still produced.
        assert!(tokens.contains(&Token::Select));
        assert!(tokens.contains(&Token::From));
    }

    // ===== Dual-Mode Lexer Tests (Phase 1) =====

    #[test]
    fn test_lexer_mode_skip_comments() {
        let sql = "SELECT id -- comment\nFROM table";

        // SkipComments mode (the default behavior): the comment vanishes.
        let mut lexer = Lexer::with_mode(sql, LexerMode::SkipComments);

        let expected = [
            Token::Select,
            Token::Identifier("id".into()),
            Token::From,
            Token::Identifier("table".into()),
            Token::Eof,
        ];
        for want in expected {
            assert_eq!(lexer.next_token(), want);
        }
    }

    #[test]
    fn test_lexer_mode_preserve_comments() {
        let sql = "SELECT id -- comment\nFROM table";

        // PreserveComments mode: the comment comes through as its own token.
        let mut lexer = Lexer::with_mode(sql, LexerMode::PreserveComments);

        assert_eq!(lexer.next_token(), Token::Select);
        assert_eq!(lexer.next_token(), Token::Identifier("id".into()));

        match lexer.next_token() {
            Token::LineComment(text) => assert_eq!(text.trim(), "comment"),
            other => panic!("expected line comment token, got {:?}", other),
        }

        assert_eq!(lexer.next_token(), Token::From);
        assert_eq!(lexer.next_token(), Token::Identifier("table".into()));
        assert_eq!(lexer.next_token(), Token::Eof);
    }

    #[test]
    fn test_lexer_mode_default_is_skip() {
        let sql = "SELECT id -- comment\nFROM table";

        // Lexer::new() must behave like SkipComments mode.
        let mut lexer = Lexer::new(sql);

        let mut seen = 0;
        loop {
            match lexer.next_token() {
                Token::Eof => break,
                Token::LineComment(_) | Token::BlockComment(_) => {
                    panic!("default mode must not emit comment tokens")
                }
                _ => seen += 1,
            }
        }

        // SELECT, id, FROM, table = 4 tokens (no comment)
        assert_eq!(seen, 4);
    }

    #[test]
    fn test_lexer_mode_block_comments() {
        let sql = "SELECT /* block */ id FROM table";

        // Skip mode: the block comment disappears between SELECT and id.
        let mut skipping = Lexer::with_mode(sql, LexerMode::SkipComments);
        assert_eq!(skipping.next_token(), Token::Select);
        assert_eq!(skipping.next_token(), Token::Identifier("id".into()));
        assert_eq!(skipping.next_token(), Token::From);

        // Preserve mode: the block comment is a token of its own.
        let mut preserving = Lexer::with_mode(sql, LexerMode::PreserveComments);
        assert_eq!(preserving.next_token(), Token::Select);

        match preserving.next_token() {
            Token::BlockComment(text) => assert_eq!(text.trim(), "block"),
            other => panic!("expected block comment token, got {:?}", other),
        }

        assert_eq!(preserving.next_token(), Token::Identifier("id".into()));
    }

    #[test]
    fn test_lexer_mode_mixed_comments() {
        let sql = "-- leading\nSELECT /* inline */ id -- trailing\nFROM table";

        let mut lexer = Lexer::with_mode(sql, LexerMode::PreserveComments);

        // Leading line comment, then SELECT.
        assert!(matches!(lexer.next_token(), Token::LineComment(_)));
        assert_eq!(lexer.next_token(), Token::Select);

        // Inline block comment, then the identifier.
        assert!(matches!(lexer.next_token(), Token::BlockComment(_)));
        assert_eq!(lexer.next_token(), Token::Identifier("id".into()));

        // Trailing line comment, then the remainder of the query.
        assert!(matches!(lexer.next_token(), Token::LineComment(_)));
        assert_eq!(lexer.next_token(), Token::From);
        assert_eq!(lexer.next_token(), Token::Identifier("table".into()));
        assert_eq!(lexer.next_token(), Token::Eof);
    }
}