// sql_cli/sql/parser/lexer.rs

1//! SQL Lexer - Tokenization of SQL queries
2//!
3//! This module handles the conversion of raw SQL text into tokens
4//! that can be consumed by the parser.
5
/// Lexer mode - controls whether comments are preserved or skipped.
///
/// `SkipComments` is the default, preserving the lexer's historical behavior;
/// `PreserveComments` is used by formatting tools that need comment tokens.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LexerMode {
    /// Standard mode - skip comments (current default behavior)
    #[default]
    SkipComments,
    /// Preserve mode - tokenize comments as tokens
    PreserveComments,
}
20
/// A single lexical token produced by [`Lexer`].
///
/// Derives `Eq` in addition to `PartialEq` (all payloads are `String`, which
/// is `Eq`), so tokens can be used in hash-based collections and compared
/// with full equality semantics.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    With, // WITH clause for CTEs
    And,
    Or,
    In,
    Not,
    Between,
    Like,
    Is,
    Null,
    OrderBy,
    GroupBy,
    Having,
    As,
    Asc,
    Desc,
    Limit,
    Offset,
    Into,      // INTO keyword for temporary tables
    DateTime,  // DateTime constructor
    Case,      // CASE expression
    When,      // WHEN clause
    Then,      // THEN clause
    Else,      // ELSE clause
    End,       // END keyword
    Distinct,  // DISTINCT keyword for aggregate functions
    Over,      // OVER keyword for window functions
    Partition, // PARTITION keyword for window functions
    By,        // BY keyword (used with PARTITION BY, ORDER BY)

    // Window frame keywords
    Rows,      // ROWS frame type
    Range,     // RANGE frame type
    Unbounded, // UNBOUNDED for frame bounds
    Preceding, // PRECEDING for frame bounds
    Following, // FOLLOWING for frame bounds
    Current,   // CURRENT for CURRENT ROW
    Row,       // ROW for CURRENT ROW

    // Set operation keywords
    Union,     // UNION
    Intersect, // INTERSECT
    Except,    // EXCEPT

    // Special CTE keyword
    Web, // WEB (for WEB CTEs)

    // Row expansion functions
    Unnest, // UNNEST (for expanding delimited strings into rows)

    // JOIN keywords
    Join,  // JOIN keyword
    Inner, // INNER JOIN
    Left,  // LEFT JOIN
    Right, // RIGHT JOIN
    Full,  // FULL JOIN
    Outer, // OUTER keyword (LEFT OUTER, RIGHT OUTER, FULL OUTER)
    On,    // ON keyword for join conditions
    Cross, // CROSS JOIN

    // Literals
    Identifier(String),
    QuotedIdentifier(String), // For "Customer Id" style identifiers
    StringLiteral(String),
    JsonBlock(String), // For $JSON$...$JSON$ delimited blocks
    NumberLiteral(String),
    Star,

    // Operators
    Dot,
    Comma,
    Colon,
    LeftParen,
    RightParen,
    Equal,
    NotEqual,
    LessThan,
    GreaterThan,
    LessThanOrEqual,
    GreaterThanOrEqual,

    // Arithmetic operators
    Plus,
    Minus,
    Divide,
    Modulo,

    // String operators
    Concat, // || for string concatenation

    // Comments (preserved for formatting)
    LineComment(String),  // -- comment text (without the -- prefix)
    BlockComment(String), // /* comment text */ (without delimiters)

    // Special
    Eof,
}
123
124impl Token {
125    /// Check if a string is a SQL keyword and return corresponding token
126    pub fn from_keyword(s: &str) -> Option<Token> {
127        match s.to_uppercase().as_str() {
128            "SELECT" => Some(Token::Select),
129            "FROM" => Some(Token::From),
130            "WHERE" => Some(Token::Where),
131            "WITH" => Some(Token::With),
132            "AND" => Some(Token::And),
133            "OR" => Some(Token::Or),
134            "IN" => Some(Token::In),
135            "NOT" => Some(Token::Not),
136            "BETWEEN" => Some(Token::Between),
137            "LIKE" => Some(Token::Like),
138            "IS" => Some(Token::Is),
139            "NULL" => Some(Token::Null),
140            "ORDER" => Some(Token::OrderBy),
141            "GROUP" => Some(Token::GroupBy),
142            "HAVING" => Some(Token::Having),
143            "AS" => Some(Token::As),
144            "ASC" => Some(Token::Asc),
145            "DESC" => Some(Token::Desc),
146            "LIMIT" => Some(Token::Limit),
147            "OFFSET" => Some(Token::Offset),
148            "INTO" => Some(Token::Into),
149            "DISTINCT" => Some(Token::Distinct),
150            "CASE" => Some(Token::Case),
151            "WHEN" => Some(Token::When),
152            "THEN" => Some(Token::Then),
153            "ELSE" => Some(Token::Else),
154            "END" => Some(Token::End),
155            "OVER" => Some(Token::Over),
156            "PARTITION" => Some(Token::Partition),
157            "BY" => Some(Token::By),
158            "ROWS" => Some(Token::Rows),
159            "RANGE" => Some(Token::Range),
160            "UNBOUNDED" => Some(Token::Unbounded),
161            "PRECEDING" => Some(Token::Preceding),
162            "FOLLOWING" => Some(Token::Following),
163            "CURRENT" => Some(Token::Current),
164            "ROW" => Some(Token::Row),
165            "UNION" => Some(Token::Union),
166            "INTERSECT" => Some(Token::Intersect),
167            "EXCEPT" => Some(Token::Except),
168            "WEB" => Some(Token::Web),
169            "UNNEST" => Some(Token::Unnest),
170            "JOIN" => Some(Token::Join),
171            "INNER" => Some(Token::Inner),
172            "LEFT" => Some(Token::Left),
173            "RIGHT" => Some(Token::Right),
174            "FULL" => Some(Token::Full),
175            "OUTER" => Some(Token::Outer),
176            "ON" => Some(Token::On),
177            "CROSS" => Some(Token::Cross),
178            _ => None,
179        }
180    }
181
182    /// Check if token is a logical operator
183    pub fn is_logical_operator(&self) -> bool {
184        matches!(self, Token::And | Token::Or)
185    }
186
187    /// Check if token is a join type
188    pub fn is_join_type(&self) -> bool {
189        matches!(
190            self,
191            Token::Inner | Token::Left | Token::Right | Token::Full | Token::Cross
192        )
193    }
194
195    /// Check if token ends a clause
196    pub fn is_clause_terminator(&self) -> bool {
197        matches!(
198            self,
199            Token::OrderBy
200                | Token::GroupBy
201                | Token::Having
202                | Token::Limit
203                | Token::Offset
204                | Token::Union
205                | Token::Intersect
206                | Token::Except
207        )
208    }
209
210    /// Get the string representation of a keyword token
211    pub fn as_keyword_str(&self) -> Option<&'static str> {
212        match self {
213            Token::Select => Some("SELECT"),
214            Token::From => Some("FROM"),
215            Token::Where => Some("WHERE"),
216            Token::With => Some("WITH"),
217            Token::And => Some("AND"),
218            Token::Or => Some("OR"),
219            Token::OrderBy => Some("ORDER BY"),
220            Token::GroupBy => Some("GROUP BY"),
221            Token::Having => Some("HAVING"),
222            // Add more as needed
223            _ => None,
224        }
225    }
226}
227
/// Streaming tokenizer over a SQL string.
///
/// Holds the full input decoded into a `Vec<char>` for O(1) indexed lookahead.
#[derive(Debug, Clone)]
pub struct Lexer {
    input: Vec<char>,           // entire input as chars (char indices, not bytes)
    position: usize,            // index of `current_char` within `input`
    current_char: Option<char>, // cached input[position]; None at end of input
    mode: LexerMode,            // whether next_token() skips or preserves comments
}
235
236impl Lexer {
    /// Create a lexer in the default mode ([`LexerMode::SkipComments`]).
    #[must_use]
    pub fn new(input: &str) -> Self {
        Self::with_mode(input, LexerMode::default())
    }
241
242    /// Create a new lexer with specified mode
243    #[must_use]
244    pub fn with_mode(input: &str, mode: LexerMode) -> Self {
245        let chars: Vec<char> = input.chars().collect();
246        let current = chars.first().copied();
247        Self {
248            input: chars,
249            position: 0,
250            current_char: current,
251            mode,
252        }
253    }
254
    /// Move to the next character, refreshing the cached `current_char`
    /// (which becomes `None` once the input is exhausted).
    fn advance(&mut self) {
        self.position += 1;
        self.current_char = self.input.get(self.position).copied();
    }
259
    /// Look ahead `offset` characters from the current position without
    /// consuming anything; `peek(0)` is the current character.
    fn peek(&self, offset: usize) -> Option<char> {
        self.input.get(self.position + offset).copied()
    }
263
264    /// Peek ahead n characters and return as a string
265    fn peek_string(&self, n: usize) -> String {
266        let mut result = String::new();
267        for i in 0..n {
268            if let Some(ch) = self.input.get(self.position + i) {
269                result.push(*ch);
270            } else {
271                break;
272            }
273        }
274        result
275    }
276
277    /// Read a JSON block delimited by $JSON$...$JSON$
278    /// Consumes the opening delimiter and reads until closing $JSON$
279    fn read_json_block(&mut self) -> String {
280        let mut result = String::new();
281
282        // Skip opening $JSON$
283        for _ in 0..6 {
284            self.advance();
285        }
286
287        // Read until we find closing $JSON$
288        while let Some(ch) = self.current_char {
289            // Check if we're at the closing delimiter
290            if ch == '$' && self.peek_string(6) == "$JSON$" {
291                // Skip closing $JSON$
292                for _ in 0..6 {
293                    self.advance();
294                }
295                break;
296            }
297            result.push(ch);
298            self.advance();
299        }
300
301        result
302    }
303
304    fn skip_whitespace(&mut self) {
305        while let Some(ch) = self.current_char {
306            if ch.is_whitespace() {
307                self.advance();
308            } else {
309                break;
310            }
311        }
312    }
313
314    /// Read a line comment and return its content (without the -- prefix)
315    fn read_line_comment(&mut self) -> String {
316        let mut result = String::new();
317
318        // Skip '--'
319        self.advance();
320        self.advance();
321
322        // Read until end of line or EOF
323        while let Some(ch) = self.current_char {
324            if ch == '\n' {
325                self.advance(); // consume the newline
326                break;
327            }
328            result.push(ch);
329            self.advance();
330        }
331
332        result
333    }
334
335    /// Read a block comment and return its content (without /* */ delimiters)
336    fn read_block_comment(&mut self) -> String {
337        let mut result = String::new();
338
339        // Skip '/*'
340        self.advance();
341        self.advance();
342
343        // Read until we find '*/'
344        while let Some(ch) = self.current_char {
345            if ch == '*' && self.peek(1) == Some('/') {
346                self.advance(); // skip '*'
347                self.advance(); // skip '/'
348                break;
349            }
350            result.push(ch);
351            self.advance();
352        }
353
354        result
355    }
356
    /// Skip whitespace and comments (for backwards compatibility with parser)
    /// This is the old behavior that discards comments
    fn skip_whitespace_and_comments(&mut self) {
        // Whitespace and comments can alternate (e.g. "  -- c\n  /* c */ x"),
        // so keep looping until neither is found at the cursor.
        loop {
            // Skip whitespace
            while let Some(ch) = self.current_char {
                if ch.is_whitespace() {
                    self.advance();
                } else {
                    break;
                }
            }

            // Check for comments
            match self.current_char {
                Some('-') if self.peek(1) == Some('-') => {
                    // Single-line comment: skip until end of line
                    self.advance(); // skip first '-'
                    self.advance(); // skip second '-'
                    // Advance-then-test so the terminating '\n' is consumed too.
                    while let Some(ch) = self.current_char {
                        self.advance();
                        if ch == '\n' {
                            break;
                        }
                    }
                }
                Some('/') if self.peek(1) == Some('*') => {
                    // Multi-line comment: skip until */
                    // (an unterminated comment silently consumes to EOF)
                    self.advance(); // skip '/'
                    self.advance(); // skip '*'
                    while let Some(ch) = self.current_char {
                        if ch == '*' && self.peek(1) == Some('/') {
                            self.advance(); // skip '*'
                            self.advance(); // skip '/'
                            break;
                        }
                        self.advance();
                    }
                }
                _ => {
                    // No more comments or whitespace
                    break;
                }
            }
        }
    }
403
404    fn read_identifier(&mut self) -> String {
405        let mut result = String::new();
406        while let Some(ch) = self.current_char {
407            if ch.is_alphanumeric() || ch == '_' {
408                result.push(ch);
409                self.advance();
410            } else {
411                break;
412            }
413        }
414        result
415    }
416
417    fn read_string(&mut self) -> String {
418        let mut result = String::new();
419        let quote_char = self.current_char.unwrap(); // ' or "
420        self.advance(); // skip opening quote
421
422        while let Some(ch) = self.current_char {
423            if ch == quote_char {
424                self.advance(); // skip closing quote
425                break;
426            }
427            result.push(ch);
428            self.advance();
429        }
430        result
431    }
432
433    fn read_number(&mut self) -> String {
434        let mut result = String::new();
435        let has_e = false;
436
437        // Read the main number part (including decimal point)
438        while let Some(ch) = self.current_char {
439            if !has_e && (ch.is_numeric() || ch == '.') {
440                result.push(ch);
441                self.advance();
442            } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
443                // Handle scientific notation
444                result.push(ch);
445                self.advance();
446                let _ = has_e; // We don't allow multiple 'e' characters, so break after this
447
448                // Check for optional sign after 'e'
449                if let Some(sign) = self.current_char {
450                    if sign == '+' || sign == '-' {
451                        result.push(sign);
452                        self.advance();
453                    }
454                }
455
456                // Read exponent digits
457                while let Some(digit) = self.current_char {
458                    if digit.is_numeric() {
459                        result.push(digit);
460                        self.advance();
461                    } else {
462                        break;
463                    }
464                }
465                break; // Done reading the number
466            } else {
467                break;
468            }
469        }
470        result
471    }
472
    /// Get next token while preserving comments as tokens
    /// This is the new behavior for comment-aware formatting
    ///
    /// NOTE: match-arm order is significant — the `--` comment arm must come
    /// before the negative-number and minus arms, and `/*` before plain `/`.
    pub fn next_token_with_comments(&mut self) -> Token {
        // Only skip whitespace, NOT comments
        self.skip_whitespace();

        match self.current_char {
            None => Token::Eof,
            // Handle comments as tokens
            Some('-') if self.peek(1) == Some('-') => {
                let comment_text = self.read_line_comment();
                Token::LineComment(comment_text)
            }
            Some('/') if self.peek(1) == Some('*') => {
                let comment_text = self.read_block_comment();
                Token::BlockComment(comment_text)
            }
            Some('*') => {
                self.advance();
                Token::Star
            }
            Some('+') => {
                self.advance();
                Token::Plus
            }
            Some('/') => {
                // Regular division (comment case handled above)
                self.advance();
                Token::Divide
            }
            Some('%') => {
                self.advance();
                Token::Modulo
            }
            Some('.') => {
                self.advance();
                Token::Dot
            }
            Some(',') => {
                self.advance();
                Token::Comma
            }
            Some(':') => {
                self.advance();
                Token::Colon
            }
            Some('(') => {
                self.advance();
                Token::LeftParen
            }
            Some(')') => {
                self.advance();
                Token::RightParen
            }
            Some('=') => {
                self.advance();
                Token::Equal
            }
            // '<' may start '<=', '<>' (SQL not-equal), or plain less-than
            Some('<') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::LessThanOrEqual
                } else if self.current_char == Some('>') {
                    self.advance();
                    Token::NotEqual
                } else {
                    Token::LessThan
                }
            }
            Some('>') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::GreaterThanOrEqual
                } else {
                    Token::GreaterThan
                }
            }
            Some('!') if self.peek(1) == Some('=') => {
                self.advance();
                self.advance();
                Token::NotEqual
            }
            Some('|') if self.peek(1) == Some('|') => {
                self.advance();
                self.advance();
                Token::Concat
            }
            // Double quotes delimit identifiers ("Customer Id")
            Some('"') => {
                let ident_val = self.read_string();
                Token::QuotedIdentifier(ident_val)
            }
            // $JSON$ ... $JSON$ block; any other '$' falls back to identifier
            Some('$') => {
                if self.peek_string(6) == "$JSON$" {
                    let json_content = self.read_json_block();
                    Token::JsonBlock(json_content)
                } else {
                    let ident = self.read_identifier();
                    Token::Identifier(ident)
                }
            }
            // Single quotes delimit string literals
            Some('\'') => {
                let string_val = self.read_string();
                Token::StringLiteral(string_val)
            }
            // '-' immediately followed by a digit: negative number literal
            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
                self.advance();
                let num = self.read_number();
                Token::NumberLiteral(format!("-{num}"))
            }
            // Bare '-' is the subtraction operator
            Some('-') => {
                self.advance();
                Token::Minus
            }
            Some(ch) if ch.is_numeric() => {
                let num = self.read_number();
                Token::NumberLiteral(num)
            }
            // '#name' is a temporary-table identifier; a lone '#' is kept as-is
            Some('#') => {
                self.advance();
                let table_name = self.read_identifier();
                if table_name.is_empty() {
                    Token::Identifier("#".to_string())
                } else {
                    Token::Identifier(format!("#{}", table_name))
                }
            }
            // Keywords and identifiers; this path delegates to Token::from_keyword
            Some(ch) if ch.is_alphabetic() || ch == '_' => {
                let ident = self.read_identifier();
                Token::from_keyword(&ident).unwrap_or_else(|| Token::Identifier(ident))
            }
            // Unknown character: consume it as a one-char identifier rather than erroring
            Some(ch) => {
                self.advance();
                Token::Identifier(ch.to_string())
            }
        }
    }
611
612    /// Get next token - dispatches based on lexer mode
613    pub fn next_token(&mut self) -> Token {
614        match self.mode {
615            LexerMode::SkipComments => self.next_token_skip_comments(),
616            LexerMode::PreserveComments => self.next_token_with_comments(),
617        }
618    }
619
    /// Get next token skipping comments (original behavior)
    ///
    /// NOTE: match-arm order is significant — multi-char operators (`!=`,
    /// `||`) and the `--`/negative-number/minus arms are tried in order.
    fn next_token_skip_comments(&mut self) -> Token {
        self.skip_whitespace_and_comments();

        match self.current_char {
            None => Token::Eof,
            Some('*') => {
                self.advance();
                // Context-sensitive: could be SELECT * or multiplication
                // The parser will distinguish based on context
                Token::Star // We'll handle multiplication in parser
            }
            Some('+') => {
                self.advance();
                Token::Plus
            }
            Some('/') => {
                // Check if this is a comment start
                if self.peek(1) == Some('*') {
                    // This shouldn't happen as comments are skipped above,
                    // but handle it just in case
                    self.skip_whitespace_and_comments();
                    return self.next_token();
                }
                self.advance();
                Token::Divide
            }
            Some('%') => {
                self.advance();
                Token::Modulo
            }
            Some('.') => {
                self.advance();
                Token::Dot
            }
            Some(',') => {
                self.advance();
                Token::Comma
            }
            Some(':') => {
                self.advance();
                Token::Colon
            }
            Some('(') => {
                self.advance();
                Token::LeftParen
            }
            Some(')') => {
                self.advance();
                Token::RightParen
            }
            Some('=') => {
                self.advance();
                Token::Equal
            }
            // '<' may start '<=', '<>' (SQL not-equal), or plain less-than
            Some('<') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::LessThanOrEqual
                } else if self.current_char == Some('>') {
                    self.advance();
                    Token::NotEqual
                } else {
                    Token::LessThan
                }
            }
            Some('>') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::GreaterThanOrEqual
                } else {
                    Token::GreaterThan
                }
            }
            Some('!') if self.peek(1) == Some('=') => {
                self.advance();
                self.advance();
                Token::NotEqual
            }
            Some('|') if self.peek(1) == Some('|') => {
                self.advance();
                self.advance();
                Token::Concat
            }
            Some('"') => {
                // Double quotes = identifier
                let ident_val = self.read_string();
                Token::QuotedIdentifier(ident_val)
            }
            Some('$') => {
                // Check if this is $JSON$ delimiter
                if self.peek_string(6) == "$JSON$" {
                    let json_content = self.read_json_block();
                    Token::JsonBlock(json_content)
                } else {
                    // Not a JSON block, could be part of identifier or parameter
                    // For now, treat as identifier start
                    let ident = self.read_identifier();
                    Token::Identifier(ident)
                }
            }
            Some('\'') => {
                // Single quotes = string literal
                let string_val = self.read_string();
                Token::StringLiteral(string_val)
            }
            Some('-') if self.peek(1) == Some('-') => {
                // This is a comment, skip it and get next token
                // (defensive: appears unreachable, because the call to
                // skip_whitespace_and_comments at the top of this function
                // already consumed any '--' comment at the cursor)
                self.skip_whitespace_and_comments();
                self.next_token()
            }
            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
                // Handle negative numbers
                self.advance(); // skip '-'
                let num = self.read_number();
                Token::NumberLiteral(format!("-{num}"))
            }
            Some('-') => {
                // Handle subtraction operator
                self.advance();
                Token::Minus
            }
            Some(ch) if ch.is_numeric() => {
                let num = self.read_number();
                Token::NumberLiteral(num)
            }
            Some('#') => {
                // Temporary table identifier: #tablename
                self.advance(); // consume #
                let table_name = self.read_identifier();
                if table_name.is_empty() {
                    // Just # by itself
                    Token::Identifier("#".to_string())
                } else {
                    // #tablename
                    Token::Identifier(format!("#{}", table_name))
                }
            }
            Some(ch) if ch.is_alphabetic() || ch == '_' => {
                let ident = self.read_identifier();
                // NOTE(review): this keyword table overlaps Token::from_keyword
                // but is not identical (DATETIME is only here; RANGE maps only
                // there; ORDER/GROUP consume the trailing BY here) — consider
                // unifying so both lexer modes agree.
                match ident.to_uppercase().as_str() {
                    "SELECT" => Token::Select,
                    "FROM" => Token::From,
                    "WHERE" => Token::Where,
                    "WITH" => Token::With,
                    "AND" => Token::And,
                    "OR" => Token::Or,
                    "IN" => Token::In,
                    "NOT" => Token::Not,
                    "BETWEEN" => Token::Between,
                    "LIKE" => Token::Like,
                    "IS" => Token::Is,
                    "NULL" => Token::Null,
                    // NOTE(review): peek_keyword skips comments when looking
                    // for BY, but the consumption below only skips whitespace —
                    // a comment between ORDER and BY would leave BY unconsumed.
                    "ORDER" if self.peek_keyword("BY") => {
                        self.skip_whitespace();
                        self.read_identifier(); // consume "BY"
                        Token::OrderBy
                    }
                    "GROUP" if self.peek_keyword("BY") => {
                        self.skip_whitespace();
                        self.read_identifier(); // consume "BY"
                        Token::GroupBy
                    }
                    "HAVING" => Token::Having,
                    "AS" => Token::As,
                    "ASC" => Token::Asc,
                    "DESC" => Token::Desc,
                    "LIMIT" => Token::Limit,
                    "OFFSET" => Token::Offset,
                    "INTO" => Token::Into,
                    "DATETIME" => Token::DateTime,
                    "CASE" => Token::Case,
                    "WHEN" => Token::When,
                    "THEN" => Token::Then,
                    "ELSE" => Token::Else,
                    "END" => Token::End,
                    "DISTINCT" => Token::Distinct,
                    "OVER" => Token::Over,
                    "PARTITION" => Token::Partition,
                    "BY" => Token::By,
                    // Window frame keywords
                    "ROWS" => Token::Rows,
                    // Note: RANGE is context-sensitive - it's both a window frame keyword and a table function
                    // We'll handle this in the parser based on context
                    "UNBOUNDED" => Token::Unbounded,
                    "PRECEDING" => Token::Preceding,
                    "FOLLOWING" => Token::Following,
                    "CURRENT" => Token::Current,
                    "ROW" => Token::Row,
                    // Set operation keywords
                    "UNION" => Token::Union,
                    "INTERSECT" => Token::Intersect,
                    "EXCEPT" => Token::Except,
                    // Special CTE keyword
                    "WEB" => Token::Web,
                    // Row expansion functions
                    "UNNEST" => Token::Unnest,
                    // JOIN keywords
                    "JOIN" => Token::Join,
                    "INNER" => Token::Inner,
                    "LEFT" => Token::Left,
                    "RIGHT" => Token::Right,
                    "FULL" => Token::Full,
                    "OUTER" => Token::Outer,
                    "ON" => Token::On,
                    "CROSS" => Token::Cross,
                    _ => Token::Identifier(ident),
                }
            }
            // Unknown character: consume it as a one-char identifier rather than erroring
            Some(ch) => {
                self.advance();
                Token::Identifier(ch.to_string())
            }
        }
    }
837
838    fn peek_keyword(&mut self, keyword: &str) -> bool {
839        let saved_pos = self.position;
840        let saved_char = self.current_char;
841
842        self.skip_whitespace_and_comments();
843        let next_word = self.read_identifier();
844        let matches = next_word.to_uppercase() == keyword;
845
846        // Restore position
847        self.position = saved_pos;
848        self.current_char = saved_char;
849
850        matches
851    }
852
    /// Current cursor position as a char index into the input (not bytes).
    #[must_use]
    pub fn get_position(&self) -> usize {
        self.position
    }
857
858    pub fn tokenize_all(&mut self) -> Vec<Token> {
859        let mut tokens = Vec::new();
860        loop {
861            let token = self.next_token();
862            if matches!(token, Token::Eof) {
863                tokens.push(token);
864                break;
865            }
866            tokens.push(token);
867        }
868        tokens
869    }
870
    /// Tokenize the whole input, returning `(start, end, token)` triples where
    /// positions are char indices into the input.
    ///
    /// NOTE(review): unlike `tokenize_all`, the trailing `Token::Eof` is NOT
    /// pushed — confirm callers rely on this asymmetry.
    /// NOTE(review): comments are skipped up front here regardless of
    /// `LexerMode`, so comment tokens never receive positions — verify that is
    /// intended for PreserveComments lexers.
    pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
        let mut tokens = Vec::new();
        loop {
            // Skip leading trivia first so start_pos points at the token itself.
            self.skip_whitespace_and_comments();
            let start_pos = self.position;
            let token = self.next_token();
            let end_pos = self.position;

            if matches!(token, Token::Eof) {
                break;
            }
            tokens.push((start_pos, end_pos, token));
        }
        tokens
    }
886
887    /// Tokenize all tokens including comments
888    /// This is useful for formatting tools that need to preserve comments
889    pub fn tokenize_all_with_comments(&mut self) -> Vec<Token> {
890        let mut tokens = Vec::new();
891        loop {
892            let token = self.next_token_with_comments();
893            if matches!(token, Token::Eof) {
894                tokens.push(token);
895                break;
896            }
897            tokens.push(token);
898        }
899        tokens
900    }
901}
902
903#[cfg(test)]
904mod tests {
905    use super::*;
906
    // Verifies that comment-preserving tokenization surfaces a `--` comment
    // as a LineComment token with the "--" prefix stripped.
    #[test]
    fn test_line_comment_tokenization() {
        let sql = "SELECT col1, -- this is a comment\ncol2 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        // Find the comment token
        let comment_token = tokens.iter().find(|t| matches!(t, Token::LineComment(_)));
        assert!(comment_token.is_some(), "Should find line comment token");

        if let Some(Token::LineComment(text)) = comment_token {
            assert_eq!(text.trim(), "this is a comment");
        }
    }
921
922    #[test]
923    fn test_block_comment_tokenization() {
924        let sql = "SELECT /* block comment */ col1 FROM table";
925        let mut lexer = Lexer::new(sql);
926        let tokens = lexer.tokenize_all_with_comments();
927
928        // Find the comment token
929        let comment_token = tokens.iter().find(|t| matches!(t, Token::BlockComment(_)));
930        assert!(comment_token.is_some(), "Should find block comment token");
931
932        if let Some(Token::BlockComment(text)) = comment_token {
933            assert_eq!(text.trim(), "block comment");
934        }
935    }
936
937    #[test]
938    fn test_multiple_comments() {
939        let sql = "-- First comment\nSELECT col1, /* inline */ col2\n-- Second comment\nFROM table";
940        let mut lexer = Lexer::new(sql);
941        let tokens = lexer.tokenize_all_with_comments();
942
943        let line_comments: Vec<_> = tokens
944            .iter()
945            .filter(|t| matches!(t, Token::LineComment(_)))
946            .collect();
947        let block_comments: Vec<_> = tokens
948            .iter()
949            .filter(|t| matches!(t, Token::BlockComment(_)))
950            .collect();
951
952        assert_eq!(line_comments.len(), 2, "Should find 2 line comments");
953        assert_eq!(block_comments.len(), 1, "Should find 1 block comment");
954    }
955
956    #[test]
957    fn test_backwards_compatibility() {
958        // Test that next_token() still skips comments
959        let sql = "SELECT -- comment\ncol1 FROM table";
960        let mut lexer = Lexer::new(sql);
961        let tokens = lexer.tokenize_all();
962
963        // Should NOT contain any comment tokens
964        let has_comments = tokens
965            .iter()
966            .any(|t| matches!(t, Token::LineComment(_) | Token::BlockComment(_)));
967        assert!(
968            !has_comments,
969            "next_token() should skip comments for backwards compatibility"
970        );
971
972        // Should still parse correctly
973        assert!(tokens.iter().any(|t| matches!(t, Token::Select)));
974        assert!(tokens.iter().any(|t| matches!(t, Token::From)));
975    }
976
977    // ===== Dual-Mode Lexer Tests (Phase 1) =====
978
979    #[test]
980    fn test_lexer_mode_skip_comments() {
981        let sql = "SELECT id -- comment\nFROM table";
982
983        // SkipComments mode (default)
984        let mut lexer = Lexer::with_mode(sql, LexerMode::SkipComments);
985
986        assert_eq!(lexer.next_token(), Token::Select);
987        assert_eq!(lexer.next_token(), Token::Identifier("id".into()));
988        // Comment should be skipped
989        assert_eq!(lexer.next_token(), Token::From);
990        assert_eq!(lexer.next_token(), Token::Identifier("table".into()));
991        assert_eq!(lexer.next_token(), Token::Eof);
992    }
993
994    #[test]
995    fn test_lexer_mode_preserve_comments() {
996        let sql = "SELECT id -- comment\nFROM table";
997
998        // PreserveComments mode
999        let mut lexer = Lexer::with_mode(sql, LexerMode::PreserveComments);
1000
1001        assert_eq!(lexer.next_token(), Token::Select);
1002        assert_eq!(lexer.next_token(), Token::Identifier("id".into()));
1003
1004        // Comment should be preserved as a token
1005        let comment_tok = lexer.next_token();
1006        assert!(matches!(comment_tok, Token::LineComment(_)));
1007        if let Token::LineComment(text) = comment_tok {
1008            assert_eq!(text.trim(), "comment");
1009        }
1010
1011        assert_eq!(lexer.next_token(), Token::From);
1012        assert_eq!(lexer.next_token(), Token::Identifier("table".into()));
1013        assert_eq!(lexer.next_token(), Token::Eof);
1014    }
1015
1016    #[test]
1017    fn test_lexer_mode_default_is_skip() {
1018        let sql = "SELECT id -- comment\nFROM table";
1019
1020        // Default (using new()) should skip comments
1021        let mut lexer = Lexer::new(sql);
1022
1023        let mut tok_count = 0;
1024        loop {
1025            let tok = lexer.next_token();
1026            if matches!(tok, Token::Eof) {
1027                break;
1028            }
1029            // Should never see a comment token
1030            assert!(!matches!(
1031                tok,
1032                Token::LineComment(_) | Token::BlockComment(_)
1033            ));
1034            tok_count += 1;
1035        }
1036
1037        // SELECT, id, FROM, table = 4 tokens (no comment)
1038        assert_eq!(tok_count, 4);
1039    }
1040
1041    #[test]
1042    fn test_lexer_mode_block_comments() {
1043        let sql = "SELECT /* block */ id FROM table";
1044
1045        // Skip mode
1046        let mut lexer_skip = Lexer::with_mode(sql, LexerMode::SkipComments);
1047        assert_eq!(lexer_skip.next_token(), Token::Select);
1048        assert_eq!(lexer_skip.next_token(), Token::Identifier("id".into()));
1049        assert_eq!(lexer_skip.next_token(), Token::From);
1050
1051        // Preserve mode
1052        let mut lexer_preserve = Lexer::with_mode(sql, LexerMode::PreserveComments);
1053        assert_eq!(lexer_preserve.next_token(), Token::Select);
1054
1055        let comment_tok = lexer_preserve.next_token();
1056        assert!(matches!(comment_tok, Token::BlockComment(_)));
1057        if let Token::BlockComment(text) = comment_tok {
1058            assert_eq!(text.trim(), "block");
1059        }
1060
1061        assert_eq!(lexer_preserve.next_token(), Token::Identifier("id".into()));
1062    }
1063
1064    #[test]
1065    fn test_lexer_mode_mixed_comments() {
1066        let sql = "-- leading\nSELECT /* inline */ id -- trailing\nFROM table";
1067
1068        let mut lexer = Lexer::with_mode(sql, LexerMode::PreserveComments);
1069
1070        // leading comment
1071        assert!(matches!(lexer.next_token(), Token::LineComment(_)));
1072
1073        // SELECT
1074        assert_eq!(lexer.next_token(), Token::Select);
1075
1076        // inline block comment
1077        assert!(matches!(lexer.next_token(), Token::BlockComment(_)));
1078
1079        // id
1080        assert_eq!(lexer.next_token(), Token::Identifier("id".into()));
1081
1082        // trailing comment
1083        assert!(matches!(lexer.next_token(), Token::LineComment(_)));
1084
1085        // FROM table
1086        assert_eq!(lexer.next_token(), Token::From);
1087        assert_eq!(lexer.next_token(), Token::Identifier("table".into()));
1088        assert_eq!(lexer.next_token(), Token::Eof);
1089    }
1090}