// sql_cli/sql/parser/lexer.rs

//! SQL Lexer - Tokenization of SQL queries
//!
//! This module handles the conversion of raw SQL text into tokens
//! that can be consumed by the parser.

/// Lexer mode - controls whether comments are preserved or skipped.
//
// `Eq` is derived alongside `PartialEq`: the comparison is total for a
// fieldless enum, and clippy's `derive_partial_eq_without_eq` flags the
// omission on `Copy + PartialEq` types.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexerMode {
    /// Standard mode - skip comments (current default behavior)
    SkipComments,
    /// Preserve mode - tokenize comments as tokens
    PreserveComments,
}
14
15impl Default for LexerMode {
16    fn default() -> Self {
17        LexerMode::SkipComments
18    }
19}
20
/// A single SQL token produced by the lexer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    With, // WITH clause for CTEs
    And,
    Or,
    In,
    Not,
    Between,
    Like,
    Is,
    Null,
    OrderBy,
    GroupBy,
    Having,
    As,
    Asc,
    Desc,
    Limit,
    Offset,
    Into,      // INTO keyword for temporary tables
    DateTime,  // DateTime constructor
    Case,      // CASE expression
    When,      // WHEN clause
    Then,      // THEN clause
    Else,      // ELSE clause
    End,       // END keyword
    Distinct,  // DISTINCT keyword for aggregate functions
    Over,      // OVER keyword for window functions
    Partition, // PARTITION keyword for window functions
    By,        // BY keyword (used with PARTITION BY, ORDER BY)

    // Window frame keywords
    Rows,      // ROWS frame type
    Range,     // RANGE frame type
    Unbounded, // UNBOUNDED for frame bounds
    Preceding, // PRECEDING for frame bounds
    Following, // FOLLOWING for frame bounds
    Current,   // CURRENT for CURRENT ROW
    Row,       // ROW for CURRENT ROW

    // Set operation keywords
    Union,     // UNION
    Intersect, // INTERSECT
    Except,    // EXCEPT

    // Special CTE keyword
    Web, // WEB (for WEB CTEs)

    // Row expansion functions
    Unnest, // UNNEST (for expanding delimited strings into rows)

    // JOIN keywords
    Join,  // JOIN keyword
    Inner, // INNER JOIN
    Left,  // LEFT JOIN
    Right, // RIGHT JOIN
    Full,  // FULL JOIN
    Outer, // OUTER keyword (LEFT OUTER, RIGHT OUTER, FULL OUTER)
    On,    // ON keyword for join conditions
    Cross, // CROSS JOIN

    // Literals
    Identifier(String),
    QuotedIdentifier(String), // For "Customer Id" style identifiers
    StringLiteral(String),
    JsonBlock(String), // For $JSON$...$JSON$ delimited blocks
    NumberLiteral(String),
    Star,

    // Operators
    Dot,
    Comma,
    Colon,
    LeftParen,
    RightParen,
    Equal,
    NotEqual,
    LessThan,
    GreaterThan,
    LessThanOrEqual,
    GreaterThanOrEqual,

    // Arithmetic operators
    Plus,
    Minus,
    Divide,
    Modulo,

    // String operators
    Concat, // || for string concatenation

    // Comments (preserved for formatting)
    LineComment(String),  // -- comment text (without the -- prefix)
    BlockComment(String), // /* comment text */ (without delimiters)

    // Special
    Eof,
}

impl Token {
    /// Check if a string is a SQL keyword and return the corresponding token.
    ///
    /// Matching is case-insensitive. Returns `None` for non-keywords so the
    /// caller can fall back to `Token::Identifier`.
    pub fn from_keyword(s: &str) -> Option<Token> {
        match s.to_uppercase().as_str() {
            "SELECT" => Some(Token::Select),
            "FROM" => Some(Token::From),
            "WHERE" => Some(Token::Where),
            "WITH" => Some(Token::With),
            "AND" => Some(Token::And),
            "OR" => Some(Token::Or),
            "IN" => Some(Token::In),
            "NOT" => Some(Token::Not),
            "BETWEEN" => Some(Token::Between),
            "LIKE" => Some(Token::Like),
            "IS" => Some(Token::Is),
            "NULL" => Some(Token::Null),
            "ORDER" => Some(Token::OrderBy),
            "GROUP" => Some(Token::GroupBy),
            "HAVING" => Some(Token::Having),
            "AS" => Some(Token::As),
            "ASC" => Some(Token::Asc),
            "DESC" => Some(Token::Desc),
            "LIMIT" => Some(Token::Limit),
            "OFFSET" => Some(Token::Offset),
            "INTO" => Some(Token::Into),
            // Added for consistency with the skip-comments tokenizer, which
            // maps DATETIME; without this, the two lexer modes disagreed.
            "DATETIME" => Some(Token::DateTime),
            "DISTINCT" => Some(Token::Distinct),
            "CASE" => Some(Token::Case),
            "WHEN" => Some(Token::When),
            "THEN" => Some(Token::Then),
            "ELSE" => Some(Token::Else),
            "END" => Some(Token::End),
            "OVER" => Some(Token::Over),
            "PARTITION" => Some(Token::Partition),
            "BY" => Some(Token::By),
            "ROWS" => Some(Token::Rows),
            // NOTE(review): the skip-comments tokenizer deliberately does NOT
            // map RANGE (context-sensitive: frame keyword vs table function),
            // but this function does — confirm which behavior is intended.
            "RANGE" => Some(Token::Range),
            "UNBOUNDED" => Some(Token::Unbounded),
            "PRECEDING" => Some(Token::Preceding),
            "FOLLOWING" => Some(Token::Following),
            "CURRENT" => Some(Token::Current),
            "ROW" => Some(Token::Row),
            "UNION" => Some(Token::Union),
            "INTERSECT" => Some(Token::Intersect),
            "EXCEPT" => Some(Token::Except),
            "WEB" => Some(Token::Web),
            "UNNEST" => Some(Token::Unnest),
            "JOIN" => Some(Token::Join),
            "INNER" => Some(Token::Inner),
            "LEFT" => Some(Token::Left),
            "RIGHT" => Some(Token::Right),
            "FULL" => Some(Token::Full),
            "OUTER" => Some(Token::Outer),
            "ON" => Some(Token::On),
            "CROSS" => Some(Token::Cross),
            _ => None,
        }
    }

    /// Check if token is a logical operator (AND / OR).
    pub fn is_logical_operator(&self) -> bool {
        matches!(self, Token::And | Token::Or)
    }

    /// Check if token is a join type (the qualifier before JOIN).
    pub fn is_join_type(&self) -> bool {
        matches!(
            self,
            Token::Inner | Token::Left | Token::Right | Token::Full | Token::Cross
        )
    }

    /// Check if token ends a clause (starts ORDER BY / GROUP BY / HAVING /
    /// LIMIT / OFFSET or a set operation).
    pub fn is_clause_terminator(&self) -> bool {
        matches!(
            self,
            Token::OrderBy
                | Token::GroupBy
                | Token::Having
                | Token::Limit
                | Token::Offset
                | Token::Union
                | Token::Intersect
                | Token::Except
        )
    }

    /// Get the string representation of a keyword token.
    /// Returns the keyword as it would appear in SQL (uppercase);
    /// `None` for literals, operators, comments, and Eof.
    pub fn as_keyword_str(&self) -> Option<&'static str> {
        match self {
            Token::Select => Some("SELECT"),
            Token::From => Some("FROM"),
            Token::Where => Some("WHERE"),
            Token::With => Some("WITH"),
            Token::And => Some("AND"),
            Token::Or => Some("OR"),
            Token::In => Some("IN"),
            Token::Not => Some("NOT"),
            Token::Between => Some("BETWEEN"),
            Token::Like => Some("LIKE"),
            Token::Is => Some("IS"),
            Token::Null => Some("NULL"),
            Token::OrderBy => Some("ORDER BY"),
            Token::GroupBy => Some("GROUP BY"),
            Token::Having => Some("HAVING"),
            Token::As => Some("AS"),
            Token::Asc => Some("ASC"),
            Token::Desc => Some("DESC"),
            Token::Limit => Some("LIMIT"),
            Token::Offset => Some("OFFSET"),
            Token::Into => Some("INTO"),
            Token::Distinct => Some("DISTINCT"),
            Token::Case => Some("CASE"),
            Token::When => Some("WHEN"),
            Token::Then => Some("THEN"),
            Token::Else => Some("ELSE"),
            Token::End => Some("END"),
            Token::Join => Some("JOIN"),
            Token::Inner => Some("INNER"),
            Token::Left => Some("LEFT"),
            Token::Right => Some("RIGHT"),
            Token::Full => Some("FULL"),
            // OUTER was missing although every sibling JOIN keyword is listed.
            Token::Outer => Some("OUTER"),
            Token::Cross => Some("CROSS"),
            Token::On => Some("ON"),
            Token::Union => Some("UNION"),
            Token::Intersect => Some("INTERSECT"),
            Token::Except => Some("EXCEPT"),
            // WEB and UNNEST were missing although from_keyword produces them.
            Token::Web => Some("WEB"),
            Token::Unnest => Some("UNNEST"),
            Token::Over => Some("OVER"),
            Token::Partition => Some("PARTITION"),
            Token::By => Some("BY"),
            Token::Rows => Some("ROWS"),
            Token::Range => Some("RANGE"),
            Token::Preceding => Some("PRECEDING"),
            Token::Following => Some("FOLLOWING"),
            Token::Current => Some("CURRENT"),
            Token::Row => Some("ROW"),
            Token::Unbounded => Some("UNBOUNDED"),
            Token::DateTime => Some("DATETIME"),
            _ => None,
        }
    }
}
266
/// Streaming SQL lexer over a pre-collected character buffer.
#[derive(Debug, Clone)]
pub struct Lexer {
    // Input stored as chars so positions index characters, not bytes.
    input: Vec<char>,
    // Current character index into `input`.
    position: usize,
    // Cached input[position]; None once the end of input is reached.
    current_char: Option<char>,
    // Whether comments are skipped or emitted as tokens.
    mode: LexerMode,
}
274
275impl Lexer {
276    #[must_use]
277    pub fn new(input: &str) -> Self {
278        Self::with_mode(input, LexerMode::default())
279    }
280
281    /// Create a new lexer with specified mode
282    #[must_use]
283    pub fn with_mode(input: &str, mode: LexerMode) -> Self {
284        let chars: Vec<char> = input.chars().collect();
285        let current = chars.first().copied();
286        Self {
287            input: chars,
288            position: 0,
289            current_char: current,
290            mode,
291        }
292    }
293
294    fn advance(&mut self) {
295        self.position += 1;
296        self.current_char = self.input.get(self.position).copied();
297    }
298
299    fn peek(&self, offset: usize) -> Option<char> {
300        self.input.get(self.position + offset).copied()
301    }
302
303    /// Peek ahead n characters and return as a string
304    fn peek_string(&self, n: usize) -> String {
305        let mut result = String::new();
306        for i in 0..n {
307            if let Some(ch) = self.input.get(self.position + i) {
308                result.push(*ch);
309            } else {
310                break;
311            }
312        }
313        result
314    }
315
316    /// Read a JSON block delimited by $JSON$...$JSON$
317    /// Consumes the opening delimiter and reads until closing $JSON$
318    fn read_json_block(&mut self) -> String {
319        let mut result = String::new();
320
321        // Skip opening $JSON$
322        for _ in 0..6 {
323            self.advance();
324        }
325
326        // Read until we find closing $JSON$
327        while let Some(ch) = self.current_char {
328            // Check if we're at the closing delimiter
329            if ch == '$' && self.peek_string(6) == "$JSON$" {
330                // Skip closing $JSON$
331                for _ in 0..6 {
332                    self.advance();
333                }
334                break;
335            }
336            result.push(ch);
337            self.advance();
338        }
339
340        result
341    }
342
343    fn skip_whitespace(&mut self) {
344        while let Some(ch) = self.current_char {
345            if ch.is_whitespace() {
346                self.advance();
347            } else {
348                break;
349            }
350        }
351    }
352
353    /// Read a line comment and return its content (without the -- prefix)
354    fn read_line_comment(&mut self) -> String {
355        let mut result = String::new();
356
357        // Skip '--'
358        self.advance();
359        self.advance();
360
361        // Read until end of line or EOF
362        while let Some(ch) = self.current_char {
363            if ch == '\n' {
364                self.advance(); // consume the newline
365                break;
366            }
367            result.push(ch);
368            self.advance();
369        }
370
371        result
372    }
373
374    /// Read a block comment and return its content (without /* */ delimiters)
375    fn read_block_comment(&mut self) -> String {
376        let mut result = String::new();
377
378        // Skip '/*'
379        self.advance();
380        self.advance();
381
382        // Read until we find '*/'
383        while let Some(ch) = self.current_char {
384            if ch == '*' && self.peek(1) == Some('/') {
385                self.advance(); // skip '*'
386                self.advance(); // skip '/'
387                break;
388            }
389            result.push(ch);
390            self.advance();
391        }
392
393        result
394    }
395
396    /// Skip whitespace and comments (for backwards compatibility with parser)
397    /// This is the old behavior that discards comments
398    fn skip_whitespace_and_comments(&mut self) {
399        loop {
400            // Skip whitespace
401            while let Some(ch) = self.current_char {
402                if ch.is_whitespace() {
403                    self.advance();
404                } else {
405                    break;
406                }
407            }
408
409            // Check for comments
410            match self.current_char {
411                Some('-') if self.peek(1) == Some('-') => {
412                    // Single-line comment: skip until end of line
413                    self.advance(); // skip first '-'
414                    self.advance(); // skip second '-'
415                    while let Some(ch) = self.current_char {
416                        self.advance();
417                        if ch == '\n' {
418                            break;
419                        }
420                    }
421                }
422                Some('/') if self.peek(1) == Some('*') => {
423                    // Multi-line comment: skip until */
424                    self.advance(); // skip '/'
425                    self.advance(); // skip '*'
426                    while let Some(ch) = self.current_char {
427                        if ch == '*' && self.peek(1) == Some('/') {
428                            self.advance(); // skip '*'
429                            self.advance(); // skip '/'
430                            break;
431                        }
432                        self.advance();
433                    }
434                }
435                _ => {
436                    // No more comments or whitespace
437                    break;
438                }
439            }
440        }
441    }
442
443    fn read_identifier(&mut self) -> String {
444        let mut result = String::new();
445        while let Some(ch) = self.current_char {
446            if ch.is_alphanumeric() || ch == '_' {
447                result.push(ch);
448                self.advance();
449            } else {
450                break;
451            }
452        }
453        result
454    }
455
456    fn read_string(&mut self) -> String {
457        let mut result = String::new();
458        let quote_char = self.current_char.unwrap(); // ' or "
459        self.advance(); // skip opening quote
460
461        while let Some(ch) = self.current_char {
462            if ch == quote_char {
463                self.advance(); // skip closing quote
464                break;
465            }
466            result.push(ch);
467            self.advance();
468        }
469        result
470    }
471
472    fn read_number(&mut self) -> String {
473        let mut result = String::new();
474        let has_e = false;
475
476        // Read the main number part (including decimal point)
477        while let Some(ch) = self.current_char {
478            if !has_e && (ch.is_numeric() || ch == '.') {
479                result.push(ch);
480                self.advance();
481            } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
482                // Handle scientific notation
483                result.push(ch);
484                self.advance();
485                let _ = has_e; // We don't allow multiple 'e' characters, so break after this
486
487                // Check for optional sign after 'e'
488                if let Some(sign) = self.current_char {
489                    if sign == '+' || sign == '-' {
490                        result.push(sign);
491                        self.advance();
492                    }
493                }
494
495                // Read exponent digits
496                while let Some(digit) = self.current_char {
497                    if digit.is_numeric() {
498                        result.push(digit);
499                        self.advance();
500                    } else {
501                        break;
502                    }
503                }
504                break; // Done reading the number
505            } else {
506                break;
507            }
508        }
509        result
510    }
511
    /// Get next token while preserving comments as tokens
    /// This is the new behavior for comment-aware formatting
    ///
    /// Arm order is load-bearing: the comment starters ("--", "/*") must be
    /// matched before the bare '-' and '/' operator arms, and "-<digit>"
    /// (negative literal) before the bare '-' (minus) arm.
    pub fn next_token_with_comments(&mut self) -> Token {
        // Only skip whitespace, NOT comments
        self.skip_whitespace();

        match self.current_char {
            None => Token::Eof,
            // Handle comments as tokens
            Some('-') if self.peek(1) == Some('-') => {
                let comment_text = self.read_line_comment();
                Token::LineComment(comment_text)
            }
            Some('/') if self.peek(1) == Some('*') => {
                let comment_text = self.read_block_comment();
                Token::BlockComment(comment_text)
            }
            // '*' is always Star; the parser disambiguates SELECT * vs multiply
            Some('*') => {
                self.advance();
                Token::Star
            }
            Some('+') => {
                self.advance();
                Token::Plus
            }
            Some('/') => {
                // Regular division (comment case handled above)
                self.advance();
                Token::Divide
            }
            Some('%') => {
                self.advance();
                Token::Modulo
            }
            Some('.') => {
                self.advance();
                Token::Dot
            }
            Some(',') => {
                self.advance();
                Token::Comma
            }
            Some(':') => {
                self.advance();
                Token::Colon
            }
            Some('(') => {
                self.advance();
                Token::LeftParen
            }
            Some(')') => {
                self.advance();
                Token::RightParen
            }
            Some('=') => {
                self.advance();
                Token::Equal
            }
            // '<' may begin "<=", "<>" (not-equal), or plain less-than
            Some('<') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::LessThanOrEqual
                } else if self.current_char == Some('>') {
                    self.advance();
                    Token::NotEqual
                } else {
                    Token::LessThan
                }
            }
            Some('>') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::GreaterThanOrEqual
                } else {
                    Token::GreaterThan
                }
            }
            Some('!') if self.peek(1) == Some('=') => {
                self.advance();
                self.advance();
                Token::NotEqual
            }
            Some('|') if self.peek(1) == Some('|') => {
                self.advance();
                self.advance();
                Token::Concat
            }
            // Double quotes produce a quoted identifier, not a string literal
            Some('"') => {
                let ident_val = self.read_string();
                Token::QuotedIdentifier(ident_val)
            }
            Some('$') => {
                if self.peek_string(6) == "$JSON$" {
                    let json_content = self.read_json_block();
                    Token::JsonBlock(json_content)
                } else {
                    // Bare '$': treated as the start of an identifier
                    let ident = self.read_identifier();
                    Token::Identifier(ident)
                }
            }
            Some('\'') => {
                let string_val = self.read_string();
                Token::StringLiteral(string_val)
            }
            // NOTE(review): '-' directly followed by a digit is always lexed
            // as a negative literal, so "5-3" yields 5 and -3 — presumably
            // the parser compensates; confirm.
            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
                self.advance();
                let num = self.read_number();
                Token::NumberLiteral(format!("-{num}"))
            }
            Some('-') => {
                self.advance();
                Token::Minus
            }
            Some(ch) if ch.is_numeric() => {
                let num = self.read_number();
                Token::NumberLiteral(num)
            }
            // '#' introduces a temporary-table identifier (#name)
            Some('#') => {
                self.advance();
                let table_name = self.read_identifier();
                if table_name.is_empty() {
                    Token::Identifier("#".to_string())
                } else {
                    Token::Identifier(format!("#{}", table_name))
                }
            }
            Some(ch) if ch.is_alphabetic() || ch == '_' => {
                let ident = self.read_identifier();
                // Handle multi-word keywords like GROUP BY and ORDER BY
                // NOTE(review): peek_keyword skips comments, but the
                // consumption below only skips whitespace — a comment
                // between ORDER/GROUP and BY would leave stray tokens in
                // preserve mode; confirm intended.
                match ident.to_uppercase().as_str() {
                    "ORDER" if self.peek_keyword("BY") => {
                        self.skip_whitespace();
                        self.read_identifier(); // consume "BY"
                        Token::OrderBy
                    }
                    "GROUP" if self.peek_keyword("BY") => {
                        self.skip_whitespace();
                        self.read_identifier(); // consume "BY"
                        Token::GroupBy
                    }
                    _ => Token::from_keyword(&ident).unwrap_or_else(|| Token::Identifier(ident)),
                }
            }
            // Fallback: any other single character becomes an identifier
            Some(ch) => {
                self.advance();
                Token::Identifier(ch.to_string())
            }
        }
    }
663
664    /// Get next token - dispatches based on lexer mode
665    pub fn next_token(&mut self) -> Token {
666        match self.mode {
667            LexerMode::SkipComments => self.next_token_skip_comments(),
668            LexerMode::PreserveComments => self.next_token_with_comments(),
669        }
670    }
671
    /// Get next token skipping comments (original behavior)
    ///
    /// Arm order is load-bearing: "!=" / "||" guards before their single-char
    /// fallbacks, and the '-' arms ordered comment / negative-number / minus.
    fn next_token_skip_comments(&mut self) -> Token {
        self.skip_whitespace_and_comments();

        match self.current_char {
            None => Token::Eof,
            Some('*') => {
                self.advance();
                // Context-sensitive: could be SELECT * or multiplication
                // The parser will distinguish based on context
                Token::Star // We'll handle multiplication in parser
            }
            Some('+') => {
                self.advance();
                Token::Plus
            }
            Some('/') => {
                // Check if this is a comment start
                // (defensive: skip_whitespace_and_comments above already
                // removed comments, so this branch should be unreachable)
                if self.peek(1) == Some('*') {
                    // This shouldn't happen as comments are skipped above,
                    // but handle it just in case
                    self.skip_whitespace_and_comments();
                    return self.next_token();
                }
                self.advance();
                Token::Divide
            }
            Some('%') => {
                self.advance();
                Token::Modulo
            }
            Some('.') => {
                self.advance();
                Token::Dot
            }
            Some(',') => {
                self.advance();
                Token::Comma
            }
            Some(':') => {
                self.advance();
                Token::Colon
            }
            Some('(') => {
                self.advance();
                Token::LeftParen
            }
            Some(')') => {
                self.advance();
                Token::RightParen
            }
            Some('=') => {
                self.advance();
                Token::Equal
            }
            // '<' may begin "<=", "<>" (not-equal), or plain less-than
            Some('<') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::LessThanOrEqual
                } else if self.current_char == Some('>') {
                    self.advance();
                    Token::NotEqual
                } else {
                    Token::LessThan
                }
            }
            Some('>') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::GreaterThanOrEqual
                } else {
                    Token::GreaterThan
                }
            }
            Some('!') if self.peek(1) == Some('=') => {
                self.advance();
                self.advance();
                Token::NotEqual
            }
            Some('|') if self.peek(1) == Some('|') => {
                self.advance();
                self.advance();
                Token::Concat
            }
            Some('"') => {
                // Double quotes = identifier
                let ident_val = self.read_string();
                Token::QuotedIdentifier(ident_val)
            }
            Some('$') => {
                // Check if this is $JSON$ delimiter
                if self.peek_string(6) == "$JSON$" {
                    let json_content = self.read_json_block();
                    Token::JsonBlock(json_content)
                } else {
                    // Not a JSON block, could be part of identifier or parameter
                    // For now, treat as identifier start
                    let ident = self.read_identifier();
                    Token::Identifier(ident)
                }
            }
            Some('\'') => {
                // Single quotes = string literal
                let string_val = self.read_string();
                Token::StringLiteral(string_val)
            }
            Some('-') if self.peek(1) == Some('-') => {
                // This is a comment, skip it and get next token
                // (defensive: should be unreachable after the skip at the top)
                self.skip_whitespace_and_comments();
                self.next_token()
            }
            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
                // Handle negative numbers
                // NOTE(review): "5-3" therefore lexes as 5 and -3 — presumably
                // the parser compensates; confirm.
                self.advance(); // skip '-'
                let num = self.read_number();
                Token::NumberLiteral(format!("-{num}"))
            }
            Some('-') => {
                // Handle subtraction operator
                self.advance();
                Token::Minus
            }
            Some(ch) if ch.is_numeric() => {
                let num = self.read_number();
                Token::NumberLiteral(num)
            }
            Some('#') => {
                // Temporary table identifier: #tablename
                self.advance(); // consume #
                let table_name = self.read_identifier();
                if table_name.is_empty() {
                    // Just # by itself
                    Token::Identifier("#".to_string())
                } else {
                    // #tablename
                    Token::Identifier(format!("#{}", table_name))
                }
            }
            // Keywords and identifiers. This keyword table largely mirrors
            // Token::from_keyword but is maintained separately; it includes
            // DATETIME and deliberately omits RANGE (see comment below).
            Some(ch) if ch.is_alphabetic() || ch == '_' => {
                let ident = self.read_identifier();
                match ident.to_uppercase().as_str() {
                    "SELECT" => Token::Select,
                    "FROM" => Token::From,
                    "WHERE" => Token::Where,
                    "WITH" => Token::With,
                    "AND" => Token::And,
                    "OR" => Token::Or,
                    "IN" => Token::In,
                    "NOT" => Token::Not,
                    "BETWEEN" => Token::Between,
                    "LIKE" => Token::Like,
                    "IS" => Token::Is,
                    "NULL" => Token::Null,
                    "ORDER" if self.peek_keyword("BY") => {
                        self.skip_whitespace();
                        self.read_identifier(); // consume "BY"
                        Token::OrderBy
                    }
                    "GROUP" if self.peek_keyword("BY") => {
                        self.skip_whitespace();
                        self.read_identifier(); // consume "BY"
                        Token::GroupBy
                    }
                    "HAVING" => Token::Having,
                    "AS" => Token::As,
                    "ASC" => Token::Asc,
                    "DESC" => Token::Desc,
                    "LIMIT" => Token::Limit,
                    "OFFSET" => Token::Offset,
                    "INTO" => Token::Into,
                    "DATETIME" => Token::DateTime,
                    "CASE" => Token::Case,
                    "WHEN" => Token::When,
                    "THEN" => Token::Then,
                    "ELSE" => Token::Else,
                    "END" => Token::End,
                    "DISTINCT" => Token::Distinct,
                    "OVER" => Token::Over,
                    "PARTITION" => Token::Partition,
                    "BY" => Token::By,
                    // Window frame keywords
                    "ROWS" => Token::Rows,
                    // Note: RANGE is context-sensitive - it's both a window frame keyword and a table function
                    // We'll handle this in the parser based on context
                    "UNBOUNDED" => Token::Unbounded,
                    "PRECEDING" => Token::Preceding,
                    "FOLLOWING" => Token::Following,
                    "CURRENT" => Token::Current,
                    "ROW" => Token::Row,
                    // Set operation keywords
                    "UNION" => Token::Union,
                    "INTERSECT" => Token::Intersect,
                    "EXCEPT" => Token::Except,
                    // Special CTE keyword
                    "WEB" => Token::Web,
                    // Row expansion functions
                    "UNNEST" => Token::Unnest,
                    // JOIN keywords
                    "JOIN" => Token::Join,
                    "INNER" => Token::Inner,
                    "LEFT" => Token::Left,
                    "RIGHT" => Token::Right,
                    "FULL" => Token::Full,
                    "OUTER" => Token::Outer,
                    "ON" => Token::On,
                    "CROSS" => Token::Cross,
                    _ => Token::Identifier(ident),
                }
            }
            // Fallback: any other single character becomes an identifier
            Some(ch) => {
                self.advance();
                Token::Identifier(ch.to_string())
            }
        }
    }
889
890    fn peek_keyword(&mut self, keyword: &str) -> bool {
891        let saved_pos = self.position;
892        let saved_char = self.current_char;
893
894        self.skip_whitespace_and_comments();
895        let next_word = self.read_identifier();
896        let matches = next_word.to_uppercase() == keyword;
897
898        // Restore position
899        self.position = saved_pos;
900        self.current_char = saved_char;
901
902        matches
903    }
904
    /// Returns the lexer's current position index within the input.
    #[must_use]
    pub fn get_position(&self) -> usize {
        self.position
    }
909
910    pub fn tokenize_all(&mut self) -> Vec<Token> {
911        let mut tokens = Vec::new();
912        loop {
913            let token = self.next_token();
914            if matches!(token, Token::Eof) {
915                tokens.push(token);
916                break;
917            }
918            tokens.push(token);
919        }
920        tokens
921    }
922
923    pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
924        let mut tokens = Vec::new();
925        loop {
926            self.skip_whitespace_and_comments();
927            let start_pos = self.position;
928            let token = self.next_token();
929            let end_pos = self.position;
930
931            if matches!(token, Token::Eof) {
932                break;
933            }
934            tokens.push((start_pos, end_pos, token));
935        }
936        tokens
937    }
938
939    /// Tokenize all tokens including comments
940    /// This is useful for formatting tools that need to preserve comments
941    pub fn tokenize_all_with_comments(&mut self) -> Vec<Token> {
942        let mut tokens = Vec::new();
943        loop {
944            let token = self.next_token_with_comments();
945            if matches!(token, Token::Eof) {
946                tokens.push(token);
947                break;
948            }
949            tokens.push(token);
950        }
951        tokens
952    }
953}
954
// Unit tests for comment handling and the dual-mode (skip vs. preserve
// comments) behavior of the lexer.
#[cfg(test)]
mod tests {
    use super::*;

    // `--` line comments are emitted as LineComment tokens in
    // comment-preserving tokenization; the stored text excludes the `--`
    // marker (modulo surrounding whitespace).
    #[test]
    fn test_line_comment_tokenization() {
        let sql = "SELECT col1, -- this is a comment\ncol2 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        // Find the comment token
        let comment_token = tokens.iter().find(|t| matches!(t, Token::LineComment(_)));
        assert!(comment_token.is_some(), "Should find line comment token");

        if let Some(Token::LineComment(text)) = comment_token {
            assert_eq!(text.trim(), "this is a comment");
        }
    }

    // `/* ... */` block comments are emitted as BlockComment tokens; the
    // stored text excludes the delimiters (modulo surrounding whitespace).
    #[test]
    fn test_block_comment_tokenization() {
        let sql = "SELECT /* block comment */ col1 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        // Find the comment token
        let comment_token = tokens.iter().find(|t| matches!(t, Token::BlockComment(_)));
        assert!(comment_token.is_some(), "Should find block comment token");

        if let Some(Token::BlockComment(text)) = comment_token {
            assert_eq!(text.trim(), "block comment");
        }
    }

    // A mix of line and block comments in one query: each comment yields
    // exactly one token of the corresponding kind.
    #[test]
    fn test_multiple_comments() {
        let sql = "-- First comment\nSELECT col1, /* inline */ col2\n-- Second comment\nFROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        let line_comments: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t, Token::LineComment(_)))
            .collect();
        let block_comments: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t, Token::BlockComment(_)))
            .collect();

        assert_eq!(line_comments.len(), 2, "Should find 2 line comments");
        assert_eq!(block_comments.len(), 1, "Should find 1 block comment");
    }

    // The default tokenization path (tokenize_all / next_token) must keep
    // skipping comments so existing parser callers are unaffected.
    #[test]
    fn test_backwards_compatibility() {
        // Test that next_token() still skips comments
        let sql = "SELECT -- comment\ncol1 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all();

        // Should NOT contain any comment tokens
        let has_comments = tokens
            .iter()
            .any(|t| matches!(t, Token::LineComment(_) | Token::BlockComment(_)));
        assert!(
            !has_comments,
            "next_token() should skip comments for backwards compatibility"
        );

        // Should still parse correctly
        assert!(tokens.iter().any(|t| matches!(t, Token::Select)));
        assert!(tokens.iter().any(|t| matches!(t, Token::From)));
    }

    // ===== Dual-Mode Lexer Tests (Phase 1) =====

    // In SkipComments mode, next_token() jumps straight from the token
    // before a comment to the token after it.
    #[test]
    fn test_lexer_mode_skip_comments() {
        let sql = "SELECT id -- comment\nFROM table";

        // SkipComments mode (default)
        let mut lexer = Lexer::with_mode(sql, LexerMode::SkipComments);

        assert_eq!(lexer.next_token(), Token::Select);
        assert_eq!(lexer.next_token(), Token::Identifier("id".into()));
        // Comment should be skipped
        assert_eq!(lexer.next_token(), Token::From);
        assert_eq!(lexer.next_token(), Token::Identifier("table".into()));
        assert_eq!(lexer.next_token(), Token::Eof);
    }

    // In PreserveComments mode, next_token() yields the comment as a token
    // in stream order between its neighbors.
    #[test]
    fn test_lexer_mode_preserve_comments() {
        let sql = "SELECT id -- comment\nFROM table";

        // PreserveComments mode
        let mut lexer = Lexer::with_mode(sql, LexerMode::PreserveComments);

        assert_eq!(lexer.next_token(), Token::Select);
        assert_eq!(lexer.next_token(), Token::Identifier("id".into()));

        // Comment should be preserved as a token
        let comment_tok = lexer.next_token();
        assert!(matches!(comment_tok, Token::LineComment(_)));
        if let Token::LineComment(text) = comment_tok {
            assert_eq!(text.trim(), "comment");
        }

        assert_eq!(lexer.next_token(), Token::From);
        assert_eq!(lexer.next_token(), Token::Identifier("table".into()));
        assert_eq!(lexer.next_token(), Token::Eof);
    }

    // Lexer::new() must behave like SkipComments mode (LexerMode's Default).
    #[test]
    fn test_lexer_mode_default_is_skip() {
        let sql = "SELECT id -- comment\nFROM table";

        // Default (using new()) should skip comments
        let mut lexer = Lexer::new(sql);

        let mut tok_count = 0;
        loop {
            let tok = lexer.next_token();
            if matches!(tok, Token::Eof) {
                break;
            }
            // Should never see a comment token
            assert!(!matches!(
                tok,
                Token::LineComment(_) | Token::BlockComment(_)
            ));
            tok_count += 1;
        }

        // SELECT, id, FROM, table = 4 tokens (no comment)
        assert_eq!(tok_count, 4);
    }

    // Block comments follow the same mode rules as line comments.
    #[test]
    fn test_lexer_mode_block_comments() {
        let sql = "SELECT /* block */ id FROM table";

        // Skip mode
        let mut lexer_skip = Lexer::with_mode(sql, LexerMode::SkipComments);
        assert_eq!(lexer_skip.next_token(), Token::Select);
        assert_eq!(lexer_skip.next_token(), Token::Identifier("id".into()));
        assert_eq!(lexer_skip.next_token(), Token::From);

        // Preserve mode
        let mut lexer_preserve = Lexer::with_mode(sql, LexerMode::PreserveComments);
        assert_eq!(lexer_preserve.next_token(), Token::Select);

        let comment_tok = lexer_preserve.next_token();
        assert!(matches!(comment_tok, Token::BlockComment(_)));
        if let Token::BlockComment(text) = comment_tok {
            assert_eq!(text.trim(), "block");
        }

        assert_eq!(lexer_preserve.next_token(), Token::Identifier("id".into()));
    }

    // Leading, inline, and trailing comments are all preserved, each in its
    // correct position relative to the surrounding tokens.
    #[test]
    fn test_lexer_mode_mixed_comments() {
        let sql = "-- leading\nSELECT /* inline */ id -- trailing\nFROM table";

        let mut lexer = Lexer::with_mode(sql, LexerMode::PreserveComments);

        // leading comment
        assert!(matches!(lexer.next_token(), Token::LineComment(_)));

        // SELECT
        assert_eq!(lexer.next_token(), Token::Select);

        // inline block comment
        assert!(matches!(lexer.next_token(), Token::BlockComment(_)));

        // id
        assert_eq!(lexer.next_token(), Token::Identifier("id".into()));

        // trailing comment
        assert!(matches!(lexer.next_token(), Token::LineComment(_)));

        // FROM table
        assert_eq!(lexer.next_token(), Token::From);
        assert_eq!(lexer.next_token(), Token::Identifier("table".into()));
        assert_eq!(lexer.next_token(), Token::Eof);
    }
}