/// Lexical tokens produced by the SQL lexer.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // --- Keywords ---
    Select,
    From,
    Where,
    With,
    And,
    Or,
    In,
    Not,
    Between,
    Like,
    Is,
    Null,
    OrderBy,
    GroupBy,
    Having,
    As,
    Asc,
    Desc,
    Limit,
    Offset,
    Into,
    DateTime,
    Case,
    When,
    Then,
    Else,
    End,
    Distinct,
    Over,
    Partition,
    By,
    Rows,
    Range,
    Unbounded,
    Preceding,
    Following,
    Current,
    Row,
    Union,
    Intersect,
    Except,
    Web,
    Unnest,
    Join,
    Inner,
    Left,
    Right,
    Full,
    Outer,
    On,
    Cross,
    // --- Tokens that carry source text ---
    Identifier(String),
    QuotedIdentifier(String),
    StringLiteral(String),
    JsonBlock(String),
    NumberLiteral(String),
    // --- Punctuation and operators ---
    Star,
    Dot,
    Comma,
    Colon,
    LeftParen,
    RightParen,
    Equal,
    NotEqual,
    LessThan,
    GreaterThan,
    LessThanOrEqual,
    GreaterThanOrEqual,
    Plus,
    Minus,
    Divide,
    Modulo,
    Concat,
    // --- Comments (emitted only by the comment-preserving tokenizer) ---
    LineComment(String),
    BlockComment(String),
    Eof,
}
108
109impl Token {
110 pub fn from_keyword(s: &str) -> Option<Token> {
112 match s.to_uppercase().as_str() {
113 "SELECT" => Some(Token::Select),
114 "FROM" => Some(Token::From),
115 "WHERE" => Some(Token::Where),
116 "WITH" => Some(Token::With),
117 "AND" => Some(Token::And),
118 "OR" => Some(Token::Or),
119 "IN" => Some(Token::In),
120 "NOT" => Some(Token::Not),
121 "BETWEEN" => Some(Token::Between),
122 "LIKE" => Some(Token::Like),
123 "IS" => Some(Token::Is),
124 "NULL" => Some(Token::Null),
125 "ORDER" => Some(Token::OrderBy),
126 "GROUP" => Some(Token::GroupBy),
127 "HAVING" => Some(Token::Having),
128 "AS" => Some(Token::As),
129 "ASC" => Some(Token::Asc),
130 "DESC" => Some(Token::Desc),
131 "LIMIT" => Some(Token::Limit),
132 "OFFSET" => Some(Token::Offset),
133 "INTO" => Some(Token::Into),
134 "DISTINCT" => Some(Token::Distinct),
135 "CASE" => Some(Token::Case),
136 "WHEN" => Some(Token::When),
137 "THEN" => Some(Token::Then),
138 "ELSE" => Some(Token::Else),
139 "END" => Some(Token::End),
140 "OVER" => Some(Token::Over),
141 "PARTITION" => Some(Token::Partition),
142 "BY" => Some(Token::By),
143 "ROWS" => Some(Token::Rows),
144 "RANGE" => Some(Token::Range),
145 "UNBOUNDED" => Some(Token::Unbounded),
146 "PRECEDING" => Some(Token::Preceding),
147 "FOLLOWING" => Some(Token::Following),
148 "CURRENT" => Some(Token::Current),
149 "ROW" => Some(Token::Row),
150 "UNION" => Some(Token::Union),
151 "INTERSECT" => Some(Token::Intersect),
152 "EXCEPT" => Some(Token::Except),
153 "WEB" => Some(Token::Web),
154 "UNNEST" => Some(Token::Unnest),
155 "JOIN" => Some(Token::Join),
156 "INNER" => Some(Token::Inner),
157 "LEFT" => Some(Token::Left),
158 "RIGHT" => Some(Token::Right),
159 "FULL" => Some(Token::Full),
160 "OUTER" => Some(Token::Outer),
161 "ON" => Some(Token::On),
162 "CROSS" => Some(Token::Cross),
163 _ => None,
164 }
165 }
166
167 pub fn is_logical_operator(&self) -> bool {
169 matches!(self, Token::And | Token::Or)
170 }
171
172 pub fn is_join_type(&self) -> bool {
174 matches!(
175 self,
176 Token::Inner | Token::Left | Token::Right | Token::Full | Token::Cross
177 )
178 }
179
180 pub fn is_clause_terminator(&self) -> bool {
182 matches!(
183 self,
184 Token::OrderBy
185 | Token::GroupBy
186 | Token::Having
187 | Token::Limit
188 | Token::Offset
189 | Token::Union
190 | Token::Intersect
191 | Token::Except
192 )
193 }
194
195 pub fn as_keyword_str(&self) -> Option<&'static str> {
197 match self {
198 Token::Select => Some("SELECT"),
199 Token::From => Some("FROM"),
200 Token::Where => Some("WHERE"),
201 Token::With => Some("WITH"),
202 Token::And => Some("AND"),
203 Token::Or => Some("OR"),
204 Token::OrderBy => Some("ORDER BY"),
205 Token::GroupBy => Some("GROUP BY"),
206 Token::Having => Some("HAVING"),
207 _ => None,
209 }
210 }
211}
212
/// Character-based lexer over a SQL string.
///
/// All positions are character indices into `input`, not byte offsets.
#[derive(Debug, Clone)]
pub struct Lexer {
    input: Vec<char>,           // source text, materialized as chars
    position: usize,            // index of `current_char` within `input`
    current_char: Option<char>, // char at `position`; `None` once past the end
}
219
220impl Lexer {
221 #[must_use]
222 pub fn new(input: &str) -> Self {
223 let chars: Vec<char> = input.chars().collect();
224 let current = chars.first().copied();
225 Self {
226 input: chars,
227 position: 0,
228 current_char: current,
229 }
230 }
231
232 fn advance(&mut self) {
233 self.position += 1;
234 self.current_char = self.input.get(self.position).copied();
235 }
236
237 fn peek(&self, offset: usize) -> Option<char> {
238 self.input.get(self.position + offset).copied()
239 }
240
241 fn peek_string(&self, n: usize) -> String {
243 let mut result = String::new();
244 for i in 0..n {
245 if let Some(ch) = self.input.get(self.position + i) {
246 result.push(*ch);
247 } else {
248 break;
249 }
250 }
251 result
252 }
253
254 fn read_json_block(&mut self) -> String {
257 let mut result = String::new();
258
259 for _ in 0..6 {
261 self.advance();
262 }
263
264 while let Some(ch) = self.current_char {
266 if ch == '$' && self.peek_string(6) == "$JSON$" {
268 for _ in 0..6 {
270 self.advance();
271 }
272 break;
273 }
274 result.push(ch);
275 self.advance();
276 }
277
278 result
279 }
280
281 fn skip_whitespace(&mut self) {
282 while let Some(ch) = self.current_char {
283 if ch.is_whitespace() {
284 self.advance();
285 } else {
286 break;
287 }
288 }
289 }
290
291 fn read_line_comment(&mut self) -> String {
293 let mut result = String::new();
294
295 self.advance();
297 self.advance();
298
299 while let Some(ch) = self.current_char {
301 if ch == '\n' {
302 self.advance(); break;
304 }
305 result.push(ch);
306 self.advance();
307 }
308
309 result
310 }
311
312 fn read_block_comment(&mut self) -> String {
314 let mut result = String::new();
315
316 self.advance();
318 self.advance();
319
320 while let Some(ch) = self.current_char {
322 if ch == '*' && self.peek(1) == Some('/') {
323 self.advance(); self.advance(); break;
326 }
327 result.push(ch);
328 self.advance();
329 }
330
331 result
332 }
333
334 fn skip_whitespace_and_comments(&mut self) {
337 loop {
338 while let Some(ch) = self.current_char {
340 if ch.is_whitespace() {
341 self.advance();
342 } else {
343 break;
344 }
345 }
346
347 match self.current_char {
349 Some('-') if self.peek(1) == Some('-') => {
350 self.advance(); self.advance(); while let Some(ch) = self.current_char {
354 self.advance();
355 if ch == '\n' {
356 break;
357 }
358 }
359 }
360 Some('/') if self.peek(1) == Some('*') => {
361 self.advance(); self.advance(); while let Some(ch) = self.current_char {
365 if ch == '*' && self.peek(1) == Some('/') {
366 self.advance(); self.advance(); break;
369 }
370 self.advance();
371 }
372 }
373 _ => {
374 break;
376 }
377 }
378 }
379 }
380
381 fn read_identifier(&mut self) -> String {
382 let mut result = String::new();
383 while let Some(ch) = self.current_char {
384 if ch.is_alphanumeric() || ch == '_' {
385 result.push(ch);
386 self.advance();
387 } else {
388 break;
389 }
390 }
391 result
392 }
393
394 fn read_string(&mut self) -> String {
395 let mut result = String::new();
396 let quote_char = self.current_char.unwrap(); self.advance(); while let Some(ch) = self.current_char {
400 if ch == quote_char {
401 self.advance(); break;
403 }
404 result.push(ch);
405 self.advance();
406 }
407 result
408 }
409
410 fn read_number(&mut self) -> String {
411 let mut result = String::new();
412 let has_e = false;
413
414 while let Some(ch) = self.current_char {
416 if !has_e && (ch.is_numeric() || ch == '.') {
417 result.push(ch);
418 self.advance();
419 } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
420 result.push(ch);
422 self.advance();
423 let _ = has_e; if let Some(sign) = self.current_char {
427 if sign == '+' || sign == '-' {
428 result.push(sign);
429 self.advance();
430 }
431 }
432
433 while let Some(digit) = self.current_char {
435 if digit.is_numeric() {
436 result.push(digit);
437 self.advance();
438 } else {
439 break;
440 }
441 }
442 break; } else {
444 break;
445 }
446 }
447 result
448 }
449
    /// Returns the next token, surfacing comments as `LineComment` /
    /// `BlockComment` tokens instead of discarding them.
    ///
    /// Intended for tooling (formatters, highlighters) that must round-trip
    /// comment text; `next_token` is the parser-facing variant that skips them.
    /// Returns `Token::Eof` at end of input.
    pub fn next_token_with_comments(&mut self) -> Token {
        // Only whitespace is skipped here — comments become tokens below.
        self.skip_whitespace();

        // NOTE: arm order matters, especially for '-' (comment vs negative
        // number vs minus) and '/' (comment vs divide).
        match self.current_char {
            None => Token::Eof,
            // "--" line comment: text up to (not including) the newline.
            Some('-') if self.peek(1) == Some('-') => {
                let comment_text = self.read_line_comment();
                Token::LineComment(comment_text)
            }
            // "/* ... */" block comment: inner text only.
            Some('/') if self.peek(1) == Some('*') => {
                let comment_text = self.read_block_comment();
                Token::BlockComment(comment_text)
            }
            Some('*') => {
                self.advance();
                Token::Star
            }
            Some('+') => {
                self.advance();
                Token::Plus
            }
            Some('/') => {
                self.advance();
                Token::Divide
            }
            Some('%') => {
                self.advance();
                Token::Modulo
            }
            Some('.') => {
                self.advance();
                Token::Dot
            }
            Some(',') => {
                self.advance();
                Token::Comma
            }
            Some(':') => {
                self.advance();
                Token::Colon
            }
            Some('(') => {
                self.advance();
                Token::LeftParen
            }
            Some(')') => {
                self.advance();
                Token::RightParen
            }
            Some('=') => {
                self.advance();
                Token::Equal
            }
            // '<', '<=', or the SQL not-equal spelling '<>'.
            Some('<') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::LessThanOrEqual
                } else if self.current_char == Some('>') {
                    self.advance();
                    Token::NotEqual
                } else {
                    Token::LessThan
                }
            }
            // '>' or '>='.
            Some('>') => {
                self.advance();
                if self.current_char == Some('=') {
                    self.advance();
                    Token::GreaterThanOrEqual
                } else {
                    Token::GreaterThan
                }
            }
            Some('!') if self.peek(1) == Some('=') => {
                self.advance();
                self.advance();
                Token::NotEqual
            }
            // "||" string concatenation.
            Some('|') if self.peek(1) == Some('|') => {
                self.advance();
                self.advance();
                Token::Concat
            }
            // Double quotes delimit identifiers, not string literals.
            Some('"') => {
                let ident_val = self.read_string();
                Token::QuotedIdentifier(ident_val)
            }
            Some('$') => {
                // "$JSON$ ... $JSON$" dollar-quoted JSON payload; any other
                // '$'-led run is treated as an identifier.
                if self.peek_string(6) == "$JSON$" {
                    let json_content = self.read_json_block();
                    Token::JsonBlock(json_content)
                } else {
                    let ident = self.read_identifier();
                    Token::Identifier(ident)
                }
            }
            Some('\'') => {
                let string_val = self.read_string();
                Token::StringLiteral(string_val)
            }
            // A digit directly after '-' makes a negative number literal.
            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
                self.advance();
                let num = self.read_number();
                Token::NumberLiteral(format!("-{num}"))
            }
            Some('-') => {
                self.advance();
                Token::Minus
            }
            Some(ch) if ch.is_numeric() => {
                let num = self.read_number();
                Token::NumberLiteral(num)
            }
            // '#'-prefixed (temp-table style) names keep the '#' in the
            // identifier; a lone '#' is passed through as-is.
            Some('#') => {
                self.advance();
                let table_name = self.read_identifier();
                if table_name.is_empty() {
                    Token::Identifier("#".to_string())
                } else {
                    Token::Identifier(format!("#{}", table_name))
                }
            }
            // Words: keyword via the shared table, otherwise an identifier.
            // Note from_keyword maps ORDER/GROUP straight to OrderBy/GroupBy
            // here, without looking for a following BY.
            Some(ch) if ch.is_alphabetic() || ch == '_' => {
                let ident = self.read_identifier();
                Token::from_keyword(&ident).unwrap_or_else(|| Token::Identifier(ident))
            }
            // Unknown characters pass through as one-char identifiers rather
            // than producing an error.
            Some(ch) => {
                self.advance();
                Token::Identifier(ch.to_string())
            }
        }
    }
588
589 pub fn next_token(&mut self) -> Token {
592 self.skip_whitespace_and_comments();
593
594 match self.current_char {
595 None => Token::Eof,
596 Some('*') => {
597 self.advance();
598 Token::Star }
602 Some('+') => {
603 self.advance();
604 Token::Plus
605 }
606 Some('/') => {
607 if self.peek(1) == Some('*') {
609 self.skip_whitespace_and_comments();
612 return self.next_token();
613 }
614 self.advance();
615 Token::Divide
616 }
617 Some('%') => {
618 self.advance();
619 Token::Modulo
620 }
621 Some('.') => {
622 self.advance();
623 Token::Dot
624 }
625 Some(',') => {
626 self.advance();
627 Token::Comma
628 }
629 Some(':') => {
630 self.advance();
631 Token::Colon
632 }
633 Some('(') => {
634 self.advance();
635 Token::LeftParen
636 }
637 Some(')') => {
638 self.advance();
639 Token::RightParen
640 }
641 Some('=') => {
642 self.advance();
643 Token::Equal
644 }
645 Some('<') => {
646 self.advance();
647 if self.current_char == Some('=') {
648 self.advance();
649 Token::LessThanOrEqual
650 } else if self.current_char == Some('>') {
651 self.advance();
652 Token::NotEqual
653 } else {
654 Token::LessThan
655 }
656 }
657 Some('>') => {
658 self.advance();
659 if self.current_char == Some('=') {
660 self.advance();
661 Token::GreaterThanOrEqual
662 } else {
663 Token::GreaterThan
664 }
665 }
666 Some('!') if self.peek(1) == Some('=') => {
667 self.advance();
668 self.advance();
669 Token::NotEqual
670 }
671 Some('|') if self.peek(1) == Some('|') => {
672 self.advance();
673 self.advance();
674 Token::Concat
675 }
676 Some('"') => {
677 let ident_val = self.read_string();
679 Token::QuotedIdentifier(ident_val)
680 }
681 Some('$') => {
682 if self.peek_string(6) == "$JSON$" {
684 let json_content = self.read_json_block();
685 Token::JsonBlock(json_content)
686 } else {
687 let ident = self.read_identifier();
690 Token::Identifier(ident)
691 }
692 }
693 Some('\'') => {
694 let string_val = self.read_string();
696 Token::StringLiteral(string_val)
697 }
698 Some('-') if self.peek(1) == Some('-') => {
699 self.skip_whitespace_and_comments();
701 self.next_token()
702 }
703 Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
704 self.advance(); let num = self.read_number();
707 Token::NumberLiteral(format!("-{num}"))
708 }
709 Some('-') => {
710 self.advance();
712 Token::Minus
713 }
714 Some(ch) if ch.is_numeric() => {
715 let num = self.read_number();
716 Token::NumberLiteral(num)
717 }
718 Some('#') => {
719 self.advance(); let table_name = self.read_identifier();
722 if table_name.is_empty() {
723 Token::Identifier("#".to_string())
725 } else {
726 Token::Identifier(format!("#{}", table_name))
728 }
729 }
730 Some(ch) if ch.is_alphabetic() || ch == '_' => {
731 let ident = self.read_identifier();
732 match ident.to_uppercase().as_str() {
733 "SELECT" => Token::Select,
734 "FROM" => Token::From,
735 "WHERE" => Token::Where,
736 "WITH" => Token::With,
737 "AND" => Token::And,
738 "OR" => Token::Or,
739 "IN" => Token::In,
740 "NOT" => Token::Not,
741 "BETWEEN" => Token::Between,
742 "LIKE" => Token::Like,
743 "IS" => Token::Is,
744 "NULL" => Token::Null,
745 "ORDER" if self.peek_keyword("BY") => {
746 self.skip_whitespace();
747 self.read_identifier(); Token::OrderBy
749 }
750 "GROUP" if self.peek_keyword("BY") => {
751 self.skip_whitespace();
752 self.read_identifier(); Token::GroupBy
754 }
755 "HAVING" => Token::Having,
756 "AS" => Token::As,
757 "ASC" => Token::Asc,
758 "DESC" => Token::Desc,
759 "LIMIT" => Token::Limit,
760 "OFFSET" => Token::Offset,
761 "INTO" => Token::Into,
762 "DATETIME" => Token::DateTime,
763 "CASE" => Token::Case,
764 "WHEN" => Token::When,
765 "THEN" => Token::Then,
766 "ELSE" => Token::Else,
767 "END" => Token::End,
768 "DISTINCT" => Token::Distinct,
769 "OVER" => Token::Over,
770 "PARTITION" => Token::Partition,
771 "BY" => Token::By,
772 "ROWS" => Token::Rows,
774 "UNBOUNDED" => Token::Unbounded,
777 "PRECEDING" => Token::Preceding,
778 "FOLLOWING" => Token::Following,
779 "CURRENT" => Token::Current,
780 "ROW" => Token::Row,
781 "UNION" => Token::Union,
783 "INTERSECT" => Token::Intersect,
784 "EXCEPT" => Token::Except,
785 "WEB" => Token::Web,
787 "UNNEST" => Token::Unnest,
789 "JOIN" => Token::Join,
791 "INNER" => Token::Inner,
792 "LEFT" => Token::Left,
793 "RIGHT" => Token::Right,
794 "FULL" => Token::Full,
795 "OUTER" => Token::Outer,
796 "ON" => Token::On,
797 "CROSS" => Token::Cross,
798 _ => Token::Identifier(ident),
799 }
800 }
801 Some(ch) => {
802 self.advance();
803 Token::Identifier(ch.to_string())
804 }
805 }
806 }
807
808 fn peek_keyword(&mut self, keyword: &str) -> bool {
809 let saved_pos = self.position;
810 let saved_char = self.current_char;
811
812 self.skip_whitespace_and_comments();
813 let next_word = self.read_identifier();
814 let matches = next_word.to_uppercase() == keyword;
815
816 self.position = saved_pos;
818 self.current_char = saved_char;
819
820 matches
821 }
822
    /// Current cursor position, as a character index into the input
    /// (not a byte offset).
    #[must_use]
    pub fn get_position(&self) -> usize {
        self.position
    }
827
828 pub fn tokenize_all(&mut self) -> Vec<Token> {
829 let mut tokens = Vec::new();
830 loop {
831 let token = self.next_token();
832 if matches!(token, Token::Eof) {
833 tokens.push(token);
834 break;
835 }
836 tokens.push(token);
837 }
838 tokens
839 }
840
841 pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
842 let mut tokens = Vec::new();
843 loop {
844 self.skip_whitespace_and_comments();
845 let start_pos = self.position;
846 let token = self.next_token();
847 let end_pos = self.position;
848
849 if matches!(token, Token::Eof) {
850 break;
851 }
852 tokens.push((start_pos, end_pos, token));
853 }
854 tokens
855 }
856
857 pub fn tokenize_all_with_comments(&mut self) -> Vec<Token> {
860 let mut tokens = Vec::new();
861 loop {
862 let token = self.next_token_with_comments();
863 if matches!(token, Token::Eof) {
864 tokens.push(token);
865 break;
866 }
867 tokens.push(token);
868 }
869 tokens
870 }
871}
872
#[cfg(test)]
mod tests {
    use super::*;

    // The comment-preserving tokenizer should surface "--" comments as
    // LineComment tokens with the dashes and newline stripped.
    #[test]
    fn test_line_comment_tokenization() {
        let sql = "SELECT col1, -- this is a comment\ncol2 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        let comment_token = tokens.iter().find(|t| matches!(t, Token::LineComment(_)));
        assert!(comment_token.is_some(), "Should find line comment token");

        if let Some(Token::LineComment(text)) = comment_token {
            assert_eq!(text.trim(), "this is a comment");
        }
    }

    // "/* ... */" comments should surface as BlockComment tokens holding the
    // inner text only (delimiters stripped).
    #[test]
    fn test_block_comment_tokenization() {
        let sql = "SELECT /* block comment */ col1 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        let comment_token = tokens.iter().find(|t| matches!(t, Token::BlockComment(_)));
        assert!(comment_token.is_some(), "Should find block comment token");

        if let Some(Token::BlockComment(text)) = comment_token {
            assert_eq!(text.trim(), "block comment");
        }
    }

    // Mixed input: every comment should be captured, with the correct kind.
    #[test]
    fn test_multiple_comments() {
        let sql = "-- First comment\nSELECT col1, /* inline */ col2\n-- Second comment\nFROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        let line_comments: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t, Token::LineComment(_)))
            .collect();
        let block_comments: Vec<_> = tokens
            .iter()
            .filter(|t| matches!(t, Token::BlockComment(_)))
            .collect();

        assert_eq!(line_comments.len(), 2, "Should find 2 line comments");
        assert_eq!(block_comments.len(), 1, "Should find 1 block comment");
    }

    // The parser-facing tokenize_all() must keep skipping comments entirely,
    // while still producing the surrounding keyword tokens.
    #[test]
    fn test_backwards_compatibility() {
        let sql = "SELECT -- comment\ncol1 FROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all();

        let has_comments = tokens
            .iter()
            .any(|t| matches!(t, Token::LineComment(_) | Token::BlockComment(_)));
        assert!(
            !has_comments,
            "next_token() should skip comments for backwards compatibility"
        );

        assert!(tokens.iter().any(|t| matches!(t, Token::Select)));
        assert!(tokens.iter().any(|t| matches!(t, Token::From)));
    }
}