sql_cli/sql/parser/
lexer.rs

1//! SQL Lexer - Tokenization of SQL queries
2//!
3//! This module handles the conversion of raw SQL text into tokens
4//! that can be consumed by the parser.
5
/// Lexer mode - controls whether comments are preserved or skipped
// Eq added alongside PartialEq: this is a fieldless Copy enum, so full
// equivalence is free and lets the mode be used as a map key / in Eq bounds.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexerMode {
    /// Standard mode - skip comments (current default behavior)
    SkipComments,
    /// Preserve mode - tokenize comments as tokens
    PreserveComments,
}

impl Default for LexerMode {
    /// `SkipComments` is the default so existing callers keep the original
    /// comment-discarding behavior.
    fn default() -> Self {
        LexerMode::SkipComments
    }
}
20
/// A single lexical token produced by [`Lexer`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    With, // WITH clause for CTEs
    And,
    Or,
    In,
    Not,
    Between,
    Like,
    ILike, // Case-insensitive LIKE (PostgreSQL)
    Is,
    Null,
    OrderBy,
    GroupBy,
    Having,
    Qualify,
    As,
    Asc,
    Desc,
    Limit,
    Offset,
    Into,      // INTO keyword for temporary tables
    DateTime,  // DateTime constructor
    Case,      // CASE expression
    When,      // WHEN clause
    Then,      // THEN clause
    Else,      // ELSE clause
    End,       // END keyword
    Distinct,  // DISTINCT keyword for aggregate functions
    Over,      // OVER keyword for window functions
    Partition, // PARTITION keyword for window functions
    By,        // BY keyword (used with PARTITION BY, ORDER BY)
    Exclude,   // EXCLUDE keyword (for SELECT * EXCLUDE)
    // Note: REPLACE is NOT a keyword - it's handled as a function name
    // to avoid conflicting with the REPLACE() string function

    // Window frame keywords
    Rows,      // ROWS frame type
    Range,     // RANGE frame type
    Unbounded, // UNBOUNDED for frame bounds
    Preceding, // PRECEDING for frame bounds
    Following, // FOLLOWING for frame bounds
    Current,   // CURRENT for CURRENT ROW
    Row,       // ROW for CURRENT ROW

    // Set operation keywords
    Union,     // UNION
    Intersect, // INTERSECT
    Except,    // EXCEPT

    // Special CTE keyword
    Web, // WEB (for WEB CTEs)

    // Row expansion functions
    Unnest, // UNNEST (for expanding delimited strings into rows)

    // JOIN keywords
    Join,  // JOIN keyword
    Inner, // INNER JOIN
    Left,  // LEFT JOIN
    Right, // RIGHT JOIN
    Full,  // FULL JOIN
    Outer, // OUTER keyword (LEFT OUTER, RIGHT OUTER, FULL OUTER)
    On,    // ON keyword for join conditions
    Cross, // CROSS JOIN

    // Literals
    Identifier(String),
    QuotedIdentifier(String), // For "Customer Id" style identifiers
    StringLiteral(String),
    JsonBlock(String), // For $JSON$...$JSON$ delimited blocks
    NumberLiteral(String),
    Star,

    // Operators
    Dot,
    Comma,
    Colon,
    LeftParen,
    RightParen,
    Equal,
    NotEqual,
    LessThan,
    GreaterThan,
    LessThanOrEqual,
    GreaterThanOrEqual,

    // Arithmetic operators
    Plus,
    Minus,
    Divide,
    Modulo,

    // String operators
    Concat, // || for string concatenation

    // Comments (preserved for formatting)
    LineComment(String),  // -- comment text (without the -- prefix)
    BlockComment(String), // /* comment text */ (without delimiters)

    // Special
    Eof,
}

impl Token {
    /// Check if a string is a SQL keyword and return corresponding token.
    ///
    /// Matching is case-insensitive; returns `None` for non-keywords so the
    /// caller can fall back to `Token::Identifier`.
    ///
    /// FIX: this table is kept in sync with the keyword handling in
    /// `next_token_skip_comments`, so both lexer modes tokenize identically:
    /// - "DATETIME" was missing here (the skip-comments path recognized it,
    ///   the preserve-comments path did not);
    /// - "RANGE" is deliberately NOT a keyword here - it is context-sensitive
    ///   (window frame keyword vs. table function) and is resolved in the
    ///   parser, exactly as the skip-comments path already does.
    pub fn from_keyword(s: &str) -> Option<Token> {
        match s.to_uppercase().as_str() {
            "SELECT" => Some(Token::Select),
            "FROM" => Some(Token::From),
            "WHERE" => Some(Token::Where),
            "WITH" => Some(Token::With),
            "AND" => Some(Token::And),
            "OR" => Some(Token::Or),
            "IN" => Some(Token::In),
            "NOT" => Some(Token::Not),
            "BETWEEN" => Some(Token::Between),
            "LIKE" => Some(Token::Like),
            "ILIKE" => Some(Token::ILike),
            "IS" => Some(Token::Is),
            "NULL" => Some(Token::Null),
            "ORDER" => Some(Token::OrderBy),
            "GROUP" => Some(Token::GroupBy),
            "HAVING" => Some(Token::Having),
            "QUALIFY" => Some(Token::Qualify),
            "AS" => Some(Token::As),
            "ASC" => Some(Token::Asc),
            "DESC" => Some(Token::Desc),
            "LIMIT" => Some(Token::Limit),
            "OFFSET" => Some(Token::Offset),
            "INTO" => Some(Token::Into),
            "DATETIME" => Some(Token::DateTime),
            "DISTINCT" => Some(Token::Distinct),
            "EXCLUDE" => Some(Token::Exclude),
            "CASE" => Some(Token::Case),
            "WHEN" => Some(Token::When),
            "THEN" => Some(Token::Then),
            "ELSE" => Some(Token::Else),
            "END" => Some(Token::End),
            "OVER" => Some(Token::Over),
            "PARTITION" => Some(Token::Partition),
            "BY" => Some(Token::By),
            "ROWS" => Some(Token::Rows),
            "UNBOUNDED" => Some(Token::Unbounded),
            "PRECEDING" => Some(Token::Preceding),
            "FOLLOWING" => Some(Token::Following),
            "CURRENT" => Some(Token::Current),
            "ROW" => Some(Token::Row),
            "UNION" => Some(Token::Union),
            "INTERSECT" => Some(Token::Intersect),
            "EXCEPT" => Some(Token::Except),
            "WEB" => Some(Token::Web),
            "UNNEST" => Some(Token::Unnest),
            "JOIN" => Some(Token::Join),
            "INNER" => Some(Token::Inner),
            "LEFT" => Some(Token::Left),
            "RIGHT" => Some(Token::Right),
            "FULL" => Some(Token::Full),
            "OUTER" => Some(Token::Outer),
            "ON" => Some(Token::On),
            "CROSS" => Some(Token::Cross),
            _ => None,
        }
    }

    /// Check if token is a logical operator (AND / OR).
    pub fn is_logical_operator(&self) -> bool {
        matches!(self, Token::And | Token::Or)
    }

    /// Check if token is a join type qualifier (INNER/LEFT/RIGHT/FULL/CROSS).
    /// Note: bare `JOIN` is not included - it is the join keyword itself.
    pub fn is_join_type(&self) -> bool {
        matches!(
            self,
            Token::Inner | Token::Left | Token::Right | Token::Full | Token::Cross
        )
    }

    /// Check if token ends a clause (starts the next clause or a set operation).
    pub fn is_clause_terminator(&self) -> bool {
        matches!(
            self,
            Token::OrderBy
                | Token::GroupBy
                | Token::Having
                | Token::Limit
                | Token::Offset
                | Token::Union
                | Token::Intersect
                | Token::Except
        )
    }

    /// Get the string representation of a keyword token.
    /// Returns the keyword as it would appear in SQL (uppercase);
    /// `None` for non-keyword tokens (literals, operators, comments, Eof).
    pub fn as_keyword_str(&self) -> Option<&'static str> {
        match self {
            Token::Select => Some("SELECT"),
            Token::From => Some("FROM"),
            Token::Where => Some("WHERE"),
            Token::With => Some("WITH"),
            Token::And => Some("AND"),
            Token::Or => Some("OR"),
            Token::In => Some("IN"),
            Token::Not => Some("NOT"),
            Token::Between => Some("BETWEEN"),
            Token::Like => Some("LIKE"),
            Token::ILike => Some("ILIKE"),
            Token::Is => Some("IS"),
            Token::Null => Some("NULL"),
            Token::OrderBy => Some("ORDER BY"),
            Token::GroupBy => Some("GROUP BY"),
            Token::Having => Some("HAVING"),
            Token::Qualify => Some("QUALIFY"),
            Token::As => Some("AS"),
            Token::Asc => Some("ASC"),
            Token::Desc => Some("DESC"),
            Token::Limit => Some("LIMIT"),
            Token::Offset => Some("OFFSET"),
            Token::Into => Some("INTO"),
            Token::Distinct => Some("DISTINCT"),
            Token::Exclude => Some("EXCLUDE"),
            Token::Case => Some("CASE"),
            Token::When => Some("WHEN"),
            Token::Then => Some("THEN"),
            Token::Else => Some("ELSE"),
            Token::End => Some("END"),
            Token::Join => Some("JOIN"),
            Token::Inner => Some("INNER"),
            Token::Left => Some("LEFT"),
            Token::Right => Some("RIGHT"),
            Token::Full => Some("FULL"),
            // FIX: OUTER, WEB, and UNNEST were missing from this table even
            // though they are keyword tokens produced by the lexer.
            Token::Outer => Some("OUTER"),
            Token::Cross => Some("CROSS"),
            Token::On => Some("ON"),
            Token::Union => Some("UNION"),
            Token::Intersect => Some("INTERSECT"),
            Token::Except => Some("EXCEPT"),
            Token::Web => Some("WEB"),
            Token::Unnest => Some("UNNEST"),
            Token::Over => Some("OVER"),
            Token::Partition => Some("PARTITION"),
            Token::By => Some("BY"),
            Token::Rows => Some("ROWS"),
            Token::Range => Some("RANGE"),
            Token::Preceding => Some("PRECEDING"),
            Token::Following => Some("FOLLOWING"),
            Token::Current => Some("CURRENT"),
            Token::Row => Some("ROW"),
            Token::Unbounded => Some("UNBOUNDED"),
            Token::DateTime => Some("DATETIME"),
            _ => None,
        }
    }
}
277
/// Hand-written SQL lexer; see [`LexerMode`] for comment handling.
#[derive(Debug, Clone)]
pub struct Lexer {
    // Input pre-decoded to chars so indexing is per character, not per byte.
    input: Vec<char>,
    // Index of `current_char` within `input` (a char index, not a byte offset).
    position: usize,
    // Cached `input[position]`; `None` once the end of input is reached.
    current_char: Option<char>,
    // Whether `next_token` discards comments or emits them as tokens.
    mode: LexerMode,
}
285
286impl Lexer {
287    #[must_use]
288    pub fn new(input: &str) -> Self {
289        Self::with_mode(input, LexerMode::default())
290    }
291
292    /// Create a new lexer with specified mode
293    #[must_use]
294    pub fn with_mode(input: &str, mode: LexerMode) -> Self {
295        let chars: Vec<char> = input.chars().collect();
296        let current = chars.first().copied();
297        Self {
298            input: chars,
299            position: 0,
300            current_char: current,
301            mode,
302        }
303    }
304
305    fn advance(&mut self) {
306        self.position += 1;
307        self.current_char = self.input.get(self.position).copied();
308    }
309
310    fn peek(&self, offset: usize) -> Option<char> {
311        self.input.get(self.position + offset).copied()
312    }
313
314    /// Peek ahead n characters and return as a string
315    fn peek_string(&self, n: usize) -> String {
316        let mut result = String::new();
317        for i in 0..n {
318            if let Some(ch) = self.input.get(self.position + i) {
319                result.push(*ch);
320            } else {
321                break;
322            }
323        }
324        result
325    }
326
327    /// Read a JSON block delimited by $JSON$...$JSON$
328    /// Consumes the opening delimiter and reads until closing $JSON$
329    fn read_json_block(&mut self) -> String {
330        let mut result = String::new();
331
332        // Skip opening $JSON$
333        for _ in 0..6 {
334            self.advance();
335        }
336
337        // Read until we find closing $JSON$
338        while let Some(ch) = self.current_char {
339            // Check if we're at the closing delimiter
340            if ch == '$' && self.peek_string(6) == "$JSON$" {
341                // Skip closing $JSON$
342                for _ in 0..6 {
343                    self.advance();
344                }
345                break;
346            }
347            result.push(ch);
348            self.advance();
349        }
350
351        result
352    }
353
354    fn skip_whitespace(&mut self) {
355        while let Some(ch) = self.current_char {
356            if ch.is_whitespace() {
357                self.advance();
358            } else {
359                break;
360            }
361        }
362    }
363
364    /// Read a line comment and return its content (without the -- prefix)
365    fn read_line_comment(&mut self) -> String {
366        let mut result = String::new();
367
368        // Skip '--'
369        self.advance();
370        self.advance();
371
372        // Read until end of line or EOF
373        while let Some(ch) = self.current_char {
374            if ch == '\n' {
375                self.advance(); // consume the newline
376                break;
377            }
378            result.push(ch);
379            self.advance();
380        }
381
382        result
383    }
384
385    /// Read a block comment and return its content (without /* */ delimiters)
386    fn read_block_comment(&mut self) -> String {
387        let mut result = String::new();
388
389        // Skip '/*'
390        self.advance();
391        self.advance();
392
393        // Read until we find '*/'
394        while let Some(ch) = self.current_char {
395            if ch == '*' && self.peek(1) == Some('/') {
396                self.advance(); // skip '*'
397                self.advance(); // skip '/'
398                break;
399            }
400            result.push(ch);
401            self.advance();
402        }
403
404        result
405    }
406
    /// Skip whitespace and comments (for backwards compatibility with parser)
    /// This is the old behavior that discards comments
    ///
    /// Loops because whitespace and comments can alternate (`  -- c\n  /* d */ x`);
    /// it exits only when the cursor rests on neither. Unterminated `/* ...`
    /// comments are consumed through EOF without error.
    fn skip_whitespace_and_comments(&mut self) {
        loop {
            // Skip whitespace
            while let Some(ch) = self.current_char {
                if ch.is_whitespace() {
                    self.advance();
                } else {
                    break;
                }
            }

            // Check for comments
            match self.current_char {
                Some('-') if self.peek(1) == Some('-') => {
                    // Single-line comment: skip until end of line
                    self.advance(); // skip first '-'
                    self.advance(); // skip second '-'
                    while let Some(ch) = self.current_char {
                        // advance-then-check so the newline itself is consumed
                        self.advance();
                        if ch == '\n' {
                            break;
                        }
                    }
                }
                Some('/') if self.peek(1) == Some('*') => {
                    // Multi-line comment: skip until */
                    self.advance(); // skip '/'
                    self.advance(); // skip '*'
                    while let Some(ch) = self.current_char {
                        if ch == '*' && self.peek(1) == Some('/') {
                            self.advance(); // skip '*'
                            self.advance(); // skip '/'
                            break;
                        }
                        self.advance();
                    }
                }
                _ => {
                    // No more comments or whitespace
                    break;
                }
            }
        }
    }
453
454    fn read_identifier(&mut self) -> String {
455        let mut result = String::new();
456        while let Some(ch) = self.current_char {
457            if ch.is_alphanumeric() || ch == '_' {
458                result.push(ch);
459                self.advance();
460            } else {
461                break;
462            }
463        }
464        result
465    }
466
467    fn read_string(&mut self) -> String {
468        let mut result = String::new();
469        let quote_char = self.current_char.unwrap(); // ' or "
470        self.advance(); // skip opening quote
471
472        while let Some(ch) = self.current_char {
473            if ch == quote_char {
474                self.advance(); // skip closing quote
475                break;
476            }
477            result.push(ch);
478            self.advance();
479        }
480        result
481    }
482
483    fn read_number(&mut self) -> String {
484        let mut result = String::new();
485        let has_e = false;
486
487        // Read the main number part (including decimal point)
488        while let Some(ch) = self.current_char {
489            if !has_e && (ch.is_numeric() || ch == '.') {
490                result.push(ch);
491                self.advance();
492            } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
493                // Handle scientific notation
494                result.push(ch);
495                self.advance();
496                let _ = has_e; // We don't allow multiple 'e' characters, so break after this
497
498                // Check for optional sign after 'e'
499                if let Some(sign) = self.current_char {
500                    if sign == '+' || sign == '-' {
501                        result.push(sign);
502                        self.advance();
503                    }
504                }
505
506                // Read exponent digits
507                while let Some(digit) = self.current_char {
508                    if digit.is_numeric() {
509                        result.push(digit);
510                        self.advance();
511                    } else {
512                        break;
513                    }
514                }
515                break; // Done reading the number
516            } else {
517                break;
518            }
519        }
520        result
521    }
522
523    /// Get next token while preserving comments as tokens
524    /// This is the new behavior for comment-aware formatting
525    pub fn next_token_with_comments(&mut self) -> Token {
526        // Only skip whitespace, NOT comments
527        self.skip_whitespace();
528
529        match self.current_char {
530            None => Token::Eof,
531            // Handle comments as tokens
532            Some('-') if self.peek(1) == Some('-') => {
533                let comment_text = self.read_line_comment();
534                Token::LineComment(comment_text)
535            }
536            Some('/') if self.peek(1) == Some('*') => {
537                let comment_text = self.read_block_comment();
538                Token::BlockComment(comment_text)
539            }
540            Some('*') => {
541                self.advance();
542                Token::Star
543            }
544            Some('+') => {
545                self.advance();
546                Token::Plus
547            }
548            Some('/') => {
549                // Regular division (comment case handled above)
550                self.advance();
551                Token::Divide
552            }
553            Some('%') => {
554                self.advance();
555                Token::Modulo
556            }
557            Some('.') => {
558                self.advance();
559                Token::Dot
560            }
561            Some(',') => {
562                self.advance();
563                Token::Comma
564            }
565            Some(':') => {
566                self.advance();
567                Token::Colon
568            }
569            Some('(') => {
570                self.advance();
571                Token::LeftParen
572            }
573            Some(')') => {
574                self.advance();
575                Token::RightParen
576            }
577            Some('=') => {
578                self.advance();
579                Token::Equal
580            }
581            Some('<') => {
582                self.advance();
583                if self.current_char == Some('=') {
584                    self.advance();
585                    Token::LessThanOrEqual
586                } else if self.current_char == Some('>') {
587                    self.advance();
588                    Token::NotEqual
589                } else {
590                    Token::LessThan
591                }
592            }
593            Some('>') => {
594                self.advance();
595                if self.current_char == Some('=') {
596                    self.advance();
597                    Token::GreaterThanOrEqual
598                } else {
599                    Token::GreaterThan
600                }
601            }
602            Some('!') if self.peek(1) == Some('=') => {
603                self.advance();
604                self.advance();
605                Token::NotEqual
606            }
607            Some('|') if self.peek(1) == Some('|') => {
608                self.advance();
609                self.advance();
610                Token::Concat
611            }
612            Some('"') => {
613                let ident_val = self.read_string();
614                Token::QuotedIdentifier(ident_val)
615            }
616            Some('$') => {
617                if self.peek_string(6) == "$JSON$" {
618                    let json_content = self.read_json_block();
619                    Token::JsonBlock(json_content)
620                } else {
621                    let ident = self.read_identifier();
622                    Token::Identifier(ident)
623                }
624            }
625            Some('\'') => {
626                let string_val = self.read_string();
627                Token::StringLiteral(string_val)
628            }
629            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
630                self.advance();
631                let num = self.read_number();
632                Token::NumberLiteral(format!("-{num}"))
633            }
634            Some('-') => {
635                self.advance();
636                Token::Minus
637            }
638            Some(ch) if ch.is_numeric() => {
639                let num = self.read_number();
640                Token::NumberLiteral(num)
641            }
642            Some('#') => {
643                self.advance();
644                let table_name = self.read_identifier();
645                if table_name.is_empty() {
646                    Token::Identifier("#".to_string())
647                } else {
648                    Token::Identifier(format!("#{}", table_name))
649                }
650            }
651            Some(ch) if ch.is_alphabetic() || ch == '_' => {
652                let ident = self.read_identifier();
653                // Handle multi-word keywords like GROUP BY and ORDER BY
654                match ident.to_uppercase().as_str() {
655                    "ORDER" if self.peek_keyword("BY") => {
656                        self.skip_whitespace();
657                        self.read_identifier(); // consume "BY"
658                        Token::OrderBy
659                    }
660                    "GROUP" if self.peek_keyword("BY") => {
661                        self.skip_whitespace();
662                        self.read_identifier(); // consume "BY"
663                        Token::GroupBy
664                    }
665                    _ => Token::from_keyword(&ident).unwrap_or_else(|| Token::Identifier(ident)),
666                }
667            }
668            Some(ch) => {
669                self.advance();
670                Token::Identifier(ch.to_string())
671            }
672        }
673    }
674
675    /// Get next token - dispatches based on lexer mode
676    pub fn next_token(&mut self) -> Token {
677        match self.mode {
678            LexerMode::SkipComments => self.next_token_skip_comments(),
679            LexerMode::PreserveComments => self.next_token_with_comments(),
680        }
681    }
682
683    /// Get next token skipping comments (original behavior)
684    fn next_token_skip_comments(&mut self) -> Token {
685        self.skip_whitespace_and_comments();
686
687        match self.current_char {
688            None => Token::Eof,
689            Some('*') => {
690                self.advance();
691                // Context-sensitive: could be SELECT * or multiplication
692                // The parser will distinguish based on context
693                Token::Star // We'll handle multiplication in parser
694            }
695            Some('+') => {
696                self.advance();
697                Token::Plus
698            }
699            Some('/') => {
700                // Check if this is a comment start
701                if self.peek(1) == Some('*') {
702                    // This shouldn't happen as comments are skipped above,
703                    // but handle it just in case
704                    self.skip_whitespace_and_comments();
705                    return self.next_token();
706                }
707                self.advance();
708                Token::Divide
709            }
710            Some('%') => {
711                self.advance();
712                Token::Modulo
713            }
714            Some('.') => {
715                self.advance();
716                Token::Dot
717            }
718            Some(',') => {
719                self.advance();
720                Token::Comma
721            }
722            Some(':') => {
723                self.advance();
724                Token::Colon
725            }
726            Some('(') => {
727                self.advance();
728                Token::LeftParen
729            }
730            Some(')') => {
731                self.advance();
732                Token::RightParen
733            }
734            Some('=') => {
735                self.advance();
736                Token::Equal
737            }
738            Some('<') => {
739                self.advance();
740                if self.current_char == Some('=') {
741                    self.advance();
742                    Token::LessThanOrEqual
743                } else if self.current_char == Some('>') {
744                    self.advance();
745                    Token::NotEqual
746                } else {
747                    Token::LessThan
748                }
749            }
750            Some('>') => {
751                self.advance();
752                if self.current_char == Some('=') {
753                    self.advance();
754                    Token::GreaterThanOrEqual
755                } else {
756                    Token::GreaterThan
757                }
758            }
759            Some('!') if self.peek(1) == Some('=') => {
760                self.advance();
761                self.advance();
762                Token::NotEqual
763            }
764            Some('|') if self.peek(1) == Some('|') => {
765                self.advance();
766                self.advance();
767                Token::Concat
768            }
769            Some('"') => {
770                // Double quotes = identifier
771                let ident_val = self.read_string();
772                Token::QuotedIdentifier(ident_val)
773            }
774            Some('$') => {
775                // Check if this is $JSON$ delimiter
776                if self.peek_string(6) == "$JSON$" {
777                    let json_content = self.read_json_block();
778                    Token::JsonBlock(json_content)
779                } else {
780                    // Not a JSON block, could be part of identifier or parameter
781                    // For now, treat as identifier start
782                    let ident = self.read_identifier();
783                    Token::Identifier(ident)
784                }
785            }
786            Some('\'') => {
787                // Single quotes = string literal
788                let string_val = self.read_string();
789                Token::StringLiteral(string_val)
790            }
791            Some('-') if self.peek(1) == Some('-') => {
792                // This is a comment, skip it and get next token
793                self.skip_whitespace_and_comments();
794                self.next_token()
795            }
796            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
797                // Handle negative numbers
798                self.advance(); // skip '-'
799                let num = self.read_number();
800                Token::NumberLiteral(format!("-{num}"))
801            }
802            Some('-') => {
803                // Handle subtraction operator
804                self.advance();
805                Token::Minus
806            }
807            Some(ch) if ch.is_numeric() => {
808                let num = self.read_number();
809                Token::NumberLiteral(num)
810            }
811            Some('#') => {
812                // Temporary table identifier: #tablename
813                self.advance(); // consume #
814                let table_name = self.read_identifier();
815                if table_name.is_empty() {
816                    // Just # by itself
817                    Token::Identifier("#".to_string())
818                } else {
819                    // #tablename
820                    Token::Identifier(format!("#{}", table_name))
821                }
822            }
823            Some(ch) if ch.is_alphabetic() || ch == '_' => {
824                let ident = self.read_identifier();
825                match ident.to_uppercase().as_str() {
826                    "SELECT" => Token::Select,
827                    "FROM" => Token::From,
828                    "WHERE" => Token::Where,
829                    "WITH" => Token::With,
830                    "AND" => Token::And,
831                    "OR" => Token::Or,
832                    "IN" => Token::In,
833                    "NOT" => Token::Not,
834                    "BETWEEN" => Token::Between,
835                    "LIKE" => Token::Like,
836                    "ILIKE" => Token::ILike,
837                    "IS" => Token::Is,
838                    "NULL" => Token::Null,
839                    "ORDER" if self.peek_keyword("BY") => {
840                        self.skip_whitespace();
841                        self.read_identifier(); // consume "BY"
842                        Token::OrderBy
843                    }
844                    "GROUP" if self.peek_keyword("BY") => {
845                        self.skip_whitespace();
846                        self.read_identifier(); // consume "BY"
847                        Token::GroupBy
848                    }
849                    "HAVING" => Token::Having,
850                    "QUALIFY" => Token::Qualify,
851                    "AS" => Token::As,
852                    "ASC" => Token::Asc,
853                    "DESC" => Token::Desc,
854                    "LIMIT" => Token::Limit,
855                    "OFFSET" => Token::Offset,
856                    "INTO" => Token::Into,
857                    "DATETIME" => Token::DateTime,
858                    "CASE" => Token::Case,
859                    "WHEN" => Token::When,
860                    "THEN" => Token::Then,
861                    "ELSE" => Token::Else,
862                    "END" => Token::End,
863                    "DISTINCT" => Token::Distinct,
864                    "EXCLUDE" => Token::Exclude,
865                    "OVER" => Token::Over,
866                    "PARTITION" => Token::Partition,
867                    "BY" => Token::By,
868                    // Window frame keywords
869                    "ROWS" => Token::Rows,
870                    // Note: RANGE is context-sensitive - it's both a window frame keyword and a table function
871                    // We'll handle this in the parser based on context
872                    "UNBOUNDED" => Token::Unbounded,
873                    "PRECEDING" => Token::Preceding,
874                    "FOLLOWING" => Token::Following,
875                    "CURRENT" => Token::Current,
876                    "ROW" => Token::Row,
877                    // Set operation keywords
878                    "UNION" => Token::Union,
879                    "INTERSECT" => Token::Intersect,
880                    "EXCEPT" => Token::Except,
881                    // Special CTE keyword
882                    "WEB" => Token::Web,
883                    // Row expansion functions
884                    "UNNEST" => Token::Unnest,
885                    // JOIN keywords
886                    "JOIN" => Token::Join,
887                    "INNER" => Token::Inner,
888                    "LEFT" => Token::Left,
889                    "RIGHT" => Token::Right,
890                    "FULL" => Token::Full,
891                    "OUTER" => Token::Outer,
892                    "ON" => Token::On,
893                    "CROSS" => Token::Cross,
894                    _ => Token::Identifier(ident),
895                }
896            }
897            Some(ch) => {
898                self.advance();
899                Token::Identifier(ch.to_string())
900            }
901        }
902    }
903
904    fn peek_keyword(&mut self, keyword: &str) -> bool {
905        let saved_pos = self.position;
906        let saved_char = self.current_char;
907
908        self.skip_whitespace_and_comments();
909        let next_word = self.read_identifier();
910        let matches = next_word.to_uppercase() == keyword;
911
912        // Restore position
913        self.position = saved_pos;
914        self.current_char = saved_char;
915
916        matches
917    }
918
    /// Returns the lexer's current cursor position (offset into the input).
    #[must_use]
    pub fn get_position(&self) -> usize {
        self.position
    }
923
924    pub fn tokenize_all(&mut self) -> Vec<Token> {
925        let mut tokens = Vec::new();
926        loop {
927            let token = self.next_token();
928            if matches!(token, Token::Eof) {
929                tokens.push(token);
930                break;
931            }
932            tokens.push(token);
933        }
934        tokens
935    }
936
937    pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
938        let mut tokens = Vec::new();
939        loop {
940            self.skip_whitespace_and_comments();
941            let start_pos = self.position;
942            let token = self.next_token();
943            let end_pos = self.position;
944
945            if matches!(token, Token::Eof) {
946                break;
947            }
948            tokens.push((start_pos, end_pos, token));
949        }
950        tokens
951    }
952
953    /// Tokenize all tokens including comments
954    /// This is useful for formatting tools that need to preserve comments
955    pub fn tokenize_all_with_comments(&mut self) -> Vec<Token> {
956        let mut tokens = Vec::new();
957        loop {
958            let token = self.next_token_with_comments();
959            if matches!(token, Token::Eof) {
960                tokens.push(token);
961                break;
962            }
963            tokens.push(token);
964        }
965        tokens
966    }
967}
968
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_line_comment_tokenization() {
        let mut lexer = Lexer::new("SELECT col1, -- this is a comment\ncol2 FROM table");
        let tokens = lexer.tokenize_all_with_comments();

        // The comment must survive tokenization as a LineComment token.
        let text = tokens
            .iter()
            .find_map(|t| match t {
                Token::LineComment(s) => Some(s),
                _ => None,
            })
            .expect("Should find line comment token");
        assert_eq!(text.trim(), "this is a comment");
    }

    #[test]
    fn test_block_comment_tokenization() {
        let mut lexer = Lexer::new("SELECT /* block comment */ col1 FROM table");
        let tokens = lexer.tokenize_all_with_comments();

        // The comment must survive tokenization as a BlockComment token.
        let text = tokens
            .iter()
            .find_map(|t| match t {
                Token::BlockComment(s) => Some(s),
                _ => None,
            })
            .expect("Should find block comment token");
        assert_eq!(text.trim(), "block comment");
    }

    #[test]
    fn test_multiple_comments() {
        let sql = "-- First comment\nSELECT col1, /* inline */ col2\n-- Second comment\nFROM table";
        let mut lexer = Lexer::new(sql);
        let tokens = lexer.tokenize_all_with_comments();

        let line_count = tokens
            .iter()
            .filter(|t| matches!(t, Token::LineComment(_)))
            .count();
        let block_count = tokens
            .iter()
            .filter(|t| matches!(t, Token::BlockComment(_)))
            .count();

        assert_eq!(line_count, 2, "Should find 2 line comments");
        assert_eq!(block_count, 1, "Should find 1 block comment");
    }

    #[test]
    fn test_backwards_compatibility() {
        // next_token() must keep skipping comments so existing callers
        // never see comment tokens.
        let mut lexer = Lexer::new("SELECT -- comment\ncol1 FROM table");
        let tokens = lexer.tokenize_all();

        assert!(
            tokens
                .iter()
                .all(|t| !matches!(t, Token::LineComment(_) | Token::BlockComment(_))),
            "next_token() should skip comments for backwards compatibility"
        );

        // The real tokens still come through.
        assert!(tokens.contains(&Token::Select));
        assert!(tokens.contains(&Token::From));
    }

    // ===== Dual-Mode Lexer Tests (Phase 1) =====

    #[test]
    fn test_lexer_mode_skip_comments() {
        // SkipComments mode (the default behavior) drops the comment.
        let mut lx = Lexer::with_mode("SELECT id -- comment\nFROM table", LexerMode::SkipComments);

        let expected = [
            Token::Select,
            Token::Identifier("id".to_string()),
            // The comment between `id` and FROM never surfaces.
            Token::From,
            Token::Identifier("table".to_string()),
            Token::Eof,
        ];
        for want in expected {
            assert_eq!(lx.next_token(), want);
        }
    }

    #[test]
    fn test_lexer_mode_preserve_comments() {
        let mut lx =
            Lexer::with_mode("SELECT id -- comment\nFROM table", LexerMode::PreserveComments);

        assert_eq!(lx.next_token(), Token::Select);
        assert_eq!(lx.next_token(), Token::Identifier("id".to_string()));

        // In preserve mode the comment surfaces as its own token.
        match lx.next_token() {
            Token::LineComment(text) => assert_eq!(text.trim(), "comment"),
            other => panic!("expected line comment, got {:?}", other),
        }

        assert_eq!(lx.next_token(), Token::From);
        assert_eq!(lx.next_token(), Token::Identifier("table".to_string()));
        assert_eq!(lx.next_token(), Token::Eof);
    }

    #[test]
    fn test_lexer_mode_default_is_skip() {
        // Lexer::new() must behave exactly like SkipComments mode.
        let mut lx = Lexer::new("SELECT id -- comment\nFROM table");

        let mut seen = 0;
        loop {
            match lx.next_token() {
                Token::Eof => break,
                Token::LineComment(_) | Token::BlockComment(_) => {
                    panic!("default mode must never yield comment tokens")
                }
                _ => seen += 1,
            }
        }

        // SELECT, id, FROM, table = 4 tokens (no comment)
        assert_eq!(seen, 4);
    }

    #[test]
    fn test_lexer_mode_block_comments() {
        let sql = "SELECT /* block */ id FROM table";

        // Skip mode drops the block comment entirely.
        let mut skipping = Lexer::with_mode(sql, LexerMode::SkipComments);
        assert_eq!(skipping.next_token(), Token::Select);
        assert_eq!(skipping.next_token(), Token::Identifier("id".to_string()));
        assert_eq!(skipping.next_token(), Token::From);

        // Preserve mode yields it between SELECT and the column name.
        let mut preserving = Lexer::with_mode(sql, LexerMode::PreserveComments);
        assert_eq!(preserving.next_token(), Token::Select);
        match preserving.next_token() {
            Token::BlockComment(text) => assert_eq!(text.trim(), "block"),
            other => panic!("expected block comment, got {:?}", other),
        }
        assert_eq!(preserving.next_token(), Token::Identifier("id".to_string()));
    }

    #[test]
    fn test_lexer_mode_mixed_comments() {
        let sql = "-- leading\nSELECT /* inline */ id -- trailing\nFROM table";
        let mut lx = Lexer::with_mode(sql, LexerMode::PreserveComments);

        // Leading line comment, then SELECT.
        assert!(matches!(lx.next_token(), Token::LineComment(_)));
        assert_eq!(lx.next_token(), Token::Select);

        // Inline block comment, then the column.
        assert!(matches!(lx.next_token(), Token::BlockComment(_)));
        assert_eq!(lx.next_token(), Token::Identifier("id".to_string()));

        // Trailing line comment, then FROM table and EOF.
        assert!(matches!(lx.next_token(), Token::LineComment(_)));
        assert_eq!(lx.next_token(), Token::From);
        assert_eq!(lx.next_token(), Token::Identifier("table".to_string()));
        assert_eq!(lx.next_token(), Token::Eof);
    }
}