sql_cli/sql/parser/
lexer.rs

1//! SQL Lexer - Tokenization of SQL queries
2//!
3//! This module handles the conversion of raw SQL text into tokens
4//! that can be consumed by the parser.
5
/// A single lexical token produced by the SQL [`Lexer`].
///
/// Keyword variants carry no payload; literal variants carry the text that
/// was read from the input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    // ---- Keywords ----
    Select,
    From,
    Where,
    /// `WITH` clause for CTEs.
    With,
    And,
    Or,
    In,
    Not,
    Between,
    Like,
    Is,
    Null,
    /// `ORDER BY` (the lexer emits this as a single token).
    OrderBy,
    /// `GROUP BY` (the lexer emits this as a single token).
    GroupBy,
    Having,
    As,
    Asc,
    Desc,
    Limit,
    Offset,
    /// `INTO` keyword for temporary tables.
    Into,
    /// `DateTime` constructor.
    DateTime,
    /// `CASE` expression.
    Case,
    /// `WHEN` clause.
    When,
    /// `THEN` clause.
    Then,
    /// `ELSE` clause.
    Else,
    /// `END` keyword.
    End,
    /// `DISTINCT` keyword for aggregate functions.
    Distinct,
    /// `OVER` keyword for window functions.
    Over,
    /// `PARTITION` keyword for window functions.
    Partition,
    /// `BY` keyword (used with `PARTITION BY`, `ORDER BY`).
    By,

    // ---- Window frame keywords ----
    /// `ROWS` frame type.
    Rows,
    /// `RANGE` frame type.
    ///
    /// `RANGE` is context-sensitive (it is also a table function), so
    /// `Lexer::next_token` does not emit this variant; only
    /// [`Token::from_keyword`] returns it.
    Range,
    /// `UNBOUNDED` for frame bounds.
    Unbounded,
    /// `PRECEDING` for frame bounds.
    Preceding,
    /// `FOLLOWING` for frame bounds.
    Following,
    /// `CURRENT` for `CURRENT ROW`.
    Current,
    /// `ROW` for `CURRENT ROW`.
    Row,

    // ---- Set operation keywords ----
    Union,
    Intersect,
    Except,

    // ---- Special CTE keyword ----
    /// `WEB` (for WEB CTEs).
    Web,

    // ---- Row expansion functions ----
    /// `UNNEST` (for expanding delimited strings into rows).
    Unnest,

    // ---- JOIN keywords ----
    Join,
    /// `INNER JOIN`.
    Inner,
    /// `LEFT JOIN`.
    Left,
    /// `RIGHT JOIN`.
    Right,
    /// `FULL JOIN`.
    Full,
    /// `OUTER` keyword (`LEFT OUTER`, `RIGHT OUTER`, `FULL OUTER`).
    Outer,
    /// `ON` keyword for join conditions.
    On,
    /// `CROSS JOIN`.
    Cross,

    // ---- Literals ----
    Identifier(String),
    /// For `"Customer Id"` style identifiers.
    QuotedIdentifier(String),
    StringLiteral(String),
    /// Content of a `$JSON$...$JSON$` delimited block (delimiters excluded).
    JsonBlock(String),
    NumberLiteral(String),
    /// `*` - either `SELECT *` or multiplication; the parser decides.
    Star,

    // ---- Operators ----
    Dot,
    Comma,
    Colon,
    LeftParen,
    RightParen,
    Equal,
    NotEqual,
    LessThan,
    GreaterThan,
    LessThanOrEqual,
    GreaterThanOrEqual,

    // ---- Arithmetic operators ----
    Plus,
    Minus,
    Divide,
    Modulo,

    // ---- String operators ----
    /// `||` for string concatenation.
    Concat,

    // ---- Special ----
    /// End of input.
    Eof,
}

impl Token {
    /// Check if a string is a SQL keyword and return the corresponding token.
    ///
    /// Matching is case-insensitive over ASCII only. SQL keywords are plain
    /// ASCII, and full Unicode upper-casing would turn non-keywords such as
    /// `"croß"` (upper-cases to `"CROSS"`) into false matches.
    ///
    /// This table is not identical to the one used by `Lexer::next_token`:
    /// `ORDER` and `GROUP` map straight to [`Token::OrderBy`] /
    /// [`Token::GroupBy`] without looking for a following `BY`, `RANGE` maps
    /// to [`Token::Range`], and `DATETIME` is not mapped here.
    ///
    /// Returns `None` when `s` is not one of the recognised keywords.
    pub fn from_keyword(s: &str) -> Option<Token> {
        match s.to_ascii_uppercase().as_str() {
            "SELECT" => Some(Token::Select),
            "FROM" => Some(Token::From),
            "WHERE" => Some(Token::Where),
            "WITH" => Some(Token::With),
            "AND" => Some(Token::And),
            "OR" => Some(Token::Or),
            "IN" => Some(Token::In),
            "NOT" => Some(Token::Not),
            "BETWEEN" => Some(Token::Between),
            "LIKE" => Some(Token::Like),
            "IS" => Some(Token::Is),
            "NULL" => Some(Token::Null),
            "ORDER" => Some(Token::OrderBy),
            "GROUP" => Some(Token::GroupBy),
            "HAVING" => Some(Token::Having),
            "AS" => Some(Token::As),
            "ASC" => Some(Token::Asc),
            "DESC" => Some(Token::Desc),
            "LIMIT" => Some(Token::Limit),
            "OFFSET" => Some(Token::Offset),
            "INTO" => Some(Token::Into),
            "DISTINCT" => Some(Token::Distinct),
            "CASE" => Some(Token::Case),
            "WHEN" => Some(Token::When),
            "THEN" => Some(Token::Then),
            "ELSE" => Some(Token::Else),
            "END" => Some(Token::End),
            "OVER" => Some(Token::Over),
            "PARTITION" => Some(Token::Partition),
            "BY" => Some(Token::By),
            "ROWS" => Some(Token::Rows),
            "RANGE" => Some(Token::Range),
            "UNBOUNDED" => Some(Token::Unbounded),
            "PRECEDING" => Some(Token::Preceding),
            "FOLLOWING" => Some(Token::Following),
            "CURRENT" => Some(Token::Current),
            "ROW" => Some(Token::Row),
            "UNION" => Some(Token::Union),
            "INTERSECT" => Some(Token::Intersect),
            "EXCEPT" => Some(Token::Except),
            "WEB" => Some(Token::Web),
            "UNNEST" => Some(Token::Unnest),
            "JOIN" => Some(Token::Join),
            "INNER" => Some(Token::Inner),
            "LEFT" => Some(Token::Left),
            "RIGHT" => Some(Token::Right),
            "FULL" => Some(Token::Full),
            "OUTER" => Some(Token::Outer),
            "ON" => Some(Token::On),
            "CROSS" => Some(Token::Cross),
            _ => None,
        }
    }

    /// Check if token is a logical operator (`AND` / `OR`).
    pub fn is_logical_operator(&self) -> bool {
        matches!(self, Token::And | Token::Or)
    }

    /// Check if token is a join type (`INNER`, `LEFT`, `RIGHT`, `FULL`, `CROSS`).
    ///
    /// `JOIN` and `OUTER` themselves are not join types.
    pub fn is_join_type(&self) -> bool {
        matches!(
            self,
            Token::Inner | Token::Left | Token::Right | Token::Full | Token::Cross
        )
    }

    /// Check if token ends a clause.
    ///
    /// True for `ORDER BY`, `GROUP BY`, `HAVING`, `LIMIT`, `OFFSET` and the
    /// set operations `UNION`, `INTERSECT`, `EXCEPT`.
    pub fn is_clause_terminator(&self) -> bool {
        matches!(
            self,
            Token::OrderBy
                | Token::GroupBy
                | Token::Having
                | Token::Limit
                | Token::Offset
                | Token::Union
                | Token::Intersect
                | Token::Except
        )
    }

    /// Get the string representation of a keyword token.
    ///
    /// Only a subset of keywords is mapped so far (`SELECT`, `FROM`, `WHERE`,
    /// `WITH`, `AND`, `OR`, `ORDER BY`, `GROUP BY`, `HAVING`); every other
    /// token, keyword or not, yields `None`.
    pub fn as_keyword_str(&self) -> Option<&'static str> {
        match self {
            Token::Select => Some("SELECT"),
            Token::From => Some("FROM"),
            Token::Where => Some("WHERE"),
            Token::With => Some("WITH"),
            Token::And => Some("AND"),
            Token::Or => Some("OR"),
            Token::OrderBy => Some("ORDER BY"),
            Token::GroupBy => Some("GROUP BY"),
            Token::Having => Some("HAVING"),
            // Add more as needed
            _ => None,
        }
    }
}
208
209#[derive(Debug, Clone)]
210pub struct Lexer {
211    input: Vec<char>,
212    position: usize,
213    current_char: Option<char>,
214}
215
216impl Lexer {
217    #[must_use]
218    pub fn new(input: &str) -> Self {
219        let chars: Vec<char> = input.chars().collect();
220        let current = chars.first().copied();
221        Self {
222            input: chars,
223            position: 0,
224            current_char: current,
225        }
226    }
227
228    fn advance(&mut self) {
229        self.position += 1;
230        self.current_char = self.input.get(self.position).copied();
231    }
232
233    fn peek(&self, offset: usize) -> Option<char> {
234        self.input.get(self.position + offset).copied()
235    }
236
237    /// Peek ahead n characters and return as a string
238    fn peek_string(&self, n: usize) -> String {
239        let mut result = String::new();
240        for i in 0..n {
241            if let Some(ch) = self.input.get(self.position + i) {
242                result.push(*ch);
243            } else {
244                break;
245            }
246        }
247        result
248    }
249
250    /// Read a JSON block delimited by $JSON$...$JSON$
251    /// Consumes the opening delimiter and reads until closing $JSON$
252    fn read_json_block(&mut self) -> String {
253        let mut result = String::new();
254
255        // Skip opening $JSON$
256        for _ in 0..6 {
257            self.advance();
258        }
259
260        // Read until we find closing $JSON$
261        while let Some(ch) = self.current_char {
262            // Check if we're at the closing delimiter
263            if ch == '$' && self.peek_string(6) == "$JSON$" {
264                // Skip closing $JSON$
265                for _ in 0..6 {
266                    self.advance();
267                }
268                break;
269            }
270            result.push(ch);
271            self.advance();
272        }
273
274        result
275    }
276
277    fn skip_whitespace(&mut self) {
278        while let Some(ch) = self.current_char {
279            if ch.is_whitespace() {
280                self.advance();
281            } else {
282                break;
283            }
284        }
285    }
286
287    fn skip_whitespace_and_comments(&mut self) {
288        loop {
289            // Skip whitespace
290            while let Some(ch) = self.current_char {
291                if ch.is_whitespace() {
292                    self.advance();
293                } else {
294                    break;
295                }
296            }
297
298            // Check for comments
299            match self.current_char {
300                Some('-') if self.peek(1) == Some('-') => {
301                    // Single-line comment: skip until end of line
302                    self.advance(); // skip first '-'
303                    self.advance(); // skip second '-'
304                    while let Some(ch) = self.current_char {
305                        self.advance();
306                        if ch == '\n' {
307                            break;
308                        }
309                    }
310                }
311                Some('/') if self.peek(1) == Some('*') => {
312                    // Multi-line comment: skip until */
313                    self.advance(); // skip '/'
314                    self.advance(); // skip '*'
315                    while let Some(ch) = self.current_char {
316                        if ch == '*' && self.peek(1) == Some('/') {
317                            self.advance(); // skip '*'
318                            self.advance(); // skip '/'
319                            break;
320                        }
321                        self.advance();
322                    }
323                }
324                _ => {
325                    // No more comments or whitespace
326                    break;
327                }
328            }
329        }
330    }
331
332    fn read_identifier(&mut self) -> String {
333        let mut result = String::new();
334        while let Some(ch) = self.current_char {
335            if ch.is_alphanumeric() || ch == '_' {
336                result.push(ch);
337                self.advance();
338            } else {
339                break;
340            }
341        }
342        result
343    }
344
345    fn read_string(&mut self) -> String {
346        let mut result = String::new();
347        let quote_char = self.current_char.unwrap(); // ' or "
348        self.advance(); // skip opening quote
349
350        while let Some(ch) = self.current_char {
351            if ch == quote_char {
352                self.advance(); // skip closing quote
353                break;
354            }
355            result.push(ch);
356            self.advance();
357        }
358        result
359    }
360
361    fn read_number(&mut self) -> String {
362        let mut result = String::new();
363        let has_e = false;
364
365        // Read the main number part (including decimal point)
366        while let Some(ch) = self.current_char {
367            if !has_e && (ch.is_numeric() || ch == '.') {
368                result.push(ch);
369                self.advance();
370            } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
371                // Handle scientific notation
372                result.push(ch);
373                self.advance();
374                let _ = has_e; // We don't allow multiple 'e' characters, so break after this
375
376                // Check for optional sign after 'e'
377                if let Some(sign) = self.current_char {
378                    if sign == '+' || sign == '-' {
379                        result.push(sign);
380                        self.advance();
381                    }
382                }
383
384                // Read exponent digits
385                while let Some(digit) = self.current_char {
386                    if digit.is_numeric() {
387                        result.push(digit);
388                        self.advance();
389                    } else {
390                        break;
391                    }
392                }
393                break; // Done reading the number
394            } else {
395                break;
396            }
397        }
398        result
399    }
400
401    pub fn next_token(&mut self) -> Token {
402        self.skip_whitespace_and_comments();
403
404        match self.current_char {
405            None => Token::Eof,
406            Some('*') => {
407                self.advance();
408                // Context-sensitive: could be SELECT * or multiplication
409                // The parser will distinguish based on context
410                Token::Star // We'll handle multiplication in parser
411            }
412            Some('+') => {
413                self.advance();
414                Token::Plus
415            }
416            Some('/') => {
417                // Check if this is a comment start
418                if self.peek(1) == Some('*') {
419                    // This shouldn't happen as comments are skipped above,
420                    // but handle it just in case
421                    self.skip_whitespace_and_comments();
422                    return self.next_token();
423                }
424                self.advance();
425                Token::Divide
426            }
427            Some('%') => {
428                self.advance();
429                Token::Modulo
430            }
431            Some('.') => {
432                self.advance();
433                Token::Dot
434            }
435            Some(',') => {
436                self.advance();
437                Token::Comma
438            }
439            Some(':') => {
440                self.advance();
441                Token::Colon
442            }
443            Some('(') => {
444                self.advance();
445                Token::LeftParen
446            }
447            Some(')') => {
448                self.advance();
449                Token::RightParen
450            }
451            Some('=') => {
452                self.advance();
453                Token::Equal
454            }
455            Some('<') => {
456                self.advance();
457                if self.current_char == Some('=') {
458                    self.advance();
459                    Token::LessThanOrEqual
460                } else if self.current_char == Some('>') {
461                    self.advance();
462                    Token::NotEqual
463                } else {
464                    Token::LessThan
465                }
466            }
467            Some('>') => {
468                self.advance();
469                if self.current_char == Some('=') {
470                    self.advance();
471                    Token::GreaterThanOrEqual
472                } else {
473                    Token::GreaterThan
474                }
475            }
476            Some('!') if self.peek(1) == Some('=') => {
477                self.advance();
478                self.advance();
479                Token::NotEqual
480            }
481            Some('|') if self.peek(1) == Some('|') => {
482                self.advance();
483                self.advance();
484                Token::Concat
485            }
486            Some('"') => {
487                // Double quotes = identifier
488                let ident_val = self.read_string();
489                Token::QuotedIdentifier(ident_val)
490            }
491            Some('$') => {
492                // Check if this is $JSON$ delimiter
493                if self.peek_string(6) == "$JSON$" {
494                    let json_content = self.read_json_block();
495                    Token::JsonBlock(json_content)
496                } else {
497                    // Not a JSON block, could be part of identifier or parameter
498                    // For now, treat as identifier start
499                    let ident = self.read_identifier();
500                    Token::Identifier(ident)
501                }
502            }
503            Some('\'') => {
504                // Single quotes = string literal
505                let string_val = self.read_string();
506                Token::StringLiteral(string_val)
507            }
508            Some('-') if self.peek(1) == Some('-') => {
509                // This is a comment, skip it and get next token
510                self.skip_whitespace_and_comments();
511                self.next_token()
512            }
513            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
514                // Handle negative numbers
515                self.advance(); // skip '-'
516                let num = self.read_number();
517                Token::NumberLiteral(format!("-{num}"))
518            }
519            Some('-') => {
520                // Handle subtraction operator
521                self.advance();
522                Token::Minus
523            }
524            Some(ch) if ch.is_numeric() => {
525                let num = self.read_number();
526                Token::NumberLiteral(num)
527            }
528            Some('#') => {
529                // Temporary table identifier: #tablename
530                self.advance(); // consume #
531                let table_name = self.read_identifier();
532                if table_name.is_empty() {
533                    // Just # by itself
534                    Token::Identifier("#".to_string())
535                } else {
536                    // #tablename
537                    Token::Identifier(format!("#{}", table_name))
538                }
539            }
540            Some(ch) if ch.is_alphabetic() || ch == '_' => {
541                let ident = self.read_identifier();
542                match ident.to_uppercase().as_str() {
543                    "SELECT" => Token::Select,
544                    "FROM" => Token::From,
545                    "WHERE" => Token::Where,
546                    "WITH" => Token::With,
547                    "AND" => Token::And,
548                    "OR" => Token::Or,
549                    "IN" => Token::In,
550                    "NOT" => Token::Not,
551                    "BETWEEN" => Token::Between,
552                    "LIKE" => Token::Like,
553                    "IS" => Token::Is,
554                    "NULL" => Token::Null,
555                    "ORDER" if self.peek_keyword("BY") => {
556                        self.skip_whitespace();
557                        self.read_identifier(); // consume "BY"
558                        Token::OrderBy
559                    }
560                    "GROUP" if self.peek_keyword("BY") => {
561                        self.skip_whitespace();
562                        self.read_identifier(); // consume "BY"
563                        Token::GroupBy
564                    }
565                    "HAVING" => Token::Having,
566                    "AS" => Token::As,
567                    "ASC" => Token::Asc,
568                    "DESC" => Token::Desc,
569                    "LIMIT" => Token::Limit,
570                    "OFFSET" => Token::Offset,
571                    "INTO" => Token::Into,
572                    "DATETIME" => Token::DateTime,
573                    "CASE" => Token::Case,
574                    "WHEN" => Token::When,
575                    "THEN" => Token::Then,
576                    "ELSE" => Token::Else,
577                    "END" => Token::End,
578                    "DISTINCT" => Token::Distinct,
579                    "OVER" => Token::Over,
580                    "PARTITION" => Token::Partition,
581                    "BY" => Token::By,
582                    // Window frame keywords
583                    "ROWS" => Token::Rows,
584                    // Note: RANGE is context-sensitive - it's both a window frame keyword and a table function
585                    // We'll handle this in the parser based on context
586                    "UNBOUNDED" => Token::Unbounded,
587                    "PRECEDING" => Token::Preceding,
588                    "FOLLOWING" => Token::Following,
589                    "CURRENT" => Token::Current,
590                    "ROW" => Token::Row,
591                    // Set operation keywords
592                    "UNION" => Token::Union,
593                    "INTERSECT" => Token::Intersect,
594                    "EXCEPT" => Token::Except,
595                    // Special CTE keyword
596                    "WEB" => Token::Web,
597                    // Row expansion functions
598                    "UNNEST" => Token::Unnest,
599                    // JOIN keywords
600                    "JOIN" => Token::Join,
601                    "INNER" => Token::Inner,
602                    "LEFT" => Token::Left,
603                    "RIGHT" => Token::Right,
604                    "FULL" => Token::Full,
605                    "OUTER" => Token::Outer,
606                    "ON" => Token::On,
607                    "CROSS" => Token::Cross,
608                    _ => Token::Identifier(ident),
609                }
610            }
611            Some(ch) => {
612                self.advance();
613                Token::Identifier(ch.to_string())
614            }
615        }
616    }
617
618    fn peek_keyword(&mut self, keyword: &str) -> bool {
619        let saved_pos = self.position;
620        let saved_char = self.current_char;
621
622        self.skip_whitespace_and_comments();
623        let next_word = self.read_identifier();
624        let matches = next_word.to_uppercase() == keyword;
625
626        // Restore position
627        self.position = saved_pos;
628        self.current_char = saved_char;
629
630        matches
631    }
632
633    #[must_use]
634    pub fn get_position(&self) -> usize {
635        self.position
636    }
637
638    pub fn tokenize_all(&mut self) -> Vec<Token> {
639        let mut tokens = Vec::new();
640        loop {
641            let token = self.next_token();
642            if matches!(token, Token::Eof) {
643                tokens.push(token);
644                break;
645            }
646            tokens.push(token);
647        }
648        tokens
649    }
650
651    pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
652        let mut tokens = Vec::new();
653        loop {
654            self.skip_whitespace_and_comments();
655            let start_pos = self.position;
656            let token = self.next_token();
657            let end_pos = self.position;
658
659            if matches!(token, Token::Eof) {
660                break;
661            }
662            tokens.push((start_pos, end_pos, token));
663        }
664        tokens
665    }
666}