sql_cli/sql/parser/
lexer.rs

1//! SQL Lexer - Tokenization of SQL queries
2//!
3//! This module handles the conversion of raw SQL text into tokens
4//! that can be consumed by the parser.
5
/// A single lexical token produced by the SQL lexer.
///
/// Keyword variants carry no payload; literal and identifier variants carry
/// the text that was read from the input. Every payload is a `String`, so the
/// type supports full equivalence (`Eq`) in addition to `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    // Keywords
    /// `SELECT`
    Select,
    /// `FROM`
    From,
    /// `WHERE`
    Where,
    /// `WITH` clause for CTEs
    With,
    /// `AND`
    And,
    /// `OR`
    Or,
    /// `IN`
    In,
    /// `NOT`
    Not,
    /// `BETWEEN`
    Between,
    /// `LIKE`
    Like,
    /// `IS`
    Is,
    /// `NULL`
    Null,
    /// `ORDER BY` (the lexer emits this for `ORDER` followed by `BY`)
    OrderBy,
    /// `GROUP BY` (the lexer emits this for `GROUP` followed by `BY`)
    GroupBy,
    /// `HAVING`
    Having,
    /// `AS`
    As,
    /// `ASC`
    Asc,
    /// `DESC`
    Desc,
    /// `LIMIT`
    Limit,
    /// `OFFSET`
    Offset,
    /// `DATETIME` constructor
    DateTime,
    /// `CASE` expression
    Case,
    /// `WHEN` clause
    When,
    /// `THEN` clause
    Then,
    /// `ELSE` clause
    Else,
    /// `END` keyword
    End,
    /// `DISTINCT` keyword for aggregate functions
    Distinct,
    /// `OVER` keyword for window functions
    Over,
    /// `PARTITION` keyword for window functions
    Partition,
    /// `BY` keyword (used with `PARTITION BY`, `ORDER BY`)
    By,

    // Window frame keywords
    /// `ROWS` frame type
    Rows,
    /// `RANGE` frame type. `Lexer::next_token` deliberately does not emit
    /// this variant because `RANGE` is also a table function name; the word
    /// is lexed as an identifier and left to the parser.
    Range,
    /// `UNBOUNDED` for frame bounds
    Unbounded,
    /// `PRECEDING` for frame bounds
    Preceding,
    /// `FOLLOWING` for frame bounds
    Following,
    /// `CURRENT` for `CURRENT ROW`
    Current,
    /// `ROW` for `CURRENT ROW`
    Row,

    // Set operation keywords
    /// `UNION`
    Union,
    /// `INTERSECT`
    Intersect,
    /// `EXCEPT`
    Except,

    // Special CTE keyword
    /// `WEB` (for WEB CTEs)
    Web,

    // JOIN keywords
    /// `JOIN` keyword
    Join,
    /// `INNER JOIN`
    Inner,
    /// `LEFT JOIN`
    Left,
    /// `RIGHT JOIN`
    Right,
    /// `FULL JOIN`
    Full,
    /// `OUTER` keyword (`LEFT OUTER`, `RIGHT OUTER`, `FULL OUTER`)
    Outer,
    /// `ON` keyword for join conditions
    On,
    /// `CROSS JOIN`
    Cross,

    // Literals
    /// Unquoted identifier; also used by the lexer for characters it does
    /// not otherwise recognise.
    Identifier(String),
    /// Double-quoted identifier such as `"Customer Id"` (quotes removed)
    QuotedIdentifier(String),
    /// Single-quoted string literal (quotes removed)
    StringLiteral(String),
    /// Contents of a `$JSON$...$JSON$` delimited block (delimiters removed)
    JsonBlock(String),
    /// Numeric literal, kept as the text that was read
    NumberLiteral(String),
    /// `*` — either `SELECT *` or multiplication; the parser decides
    Star,

    // Operators
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `=`
    Equal,
    /// `<>` or `!=`
    NotEqual,
    /// `<`
    LessThan,
    /// `>`
    GreaterThan,
    /// `<=`
    LessThanOrEqual,
    /// `>=`
    GreaterThanOrEqual,

    // Arithmetic operators
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `/`
    Divide,
    /// `%`
    Modulo,

    // String operators
    /// `||` for string concatenation
    Concat,

    // Special
    /// End of input
    Eof,
}
100
101impl Token {
102    /// Check if a string is a SQL keyword and return corresponding token
103    pub fn from_keyword(s: &str) -> Option<Token> {
104        match s.to_uppercase().as_str() {
105            "SELECT" => Some(Token::Select),
106            "FROM" => Some(Token::From),
107            "WHERE" => Some(Token::Where),
108            "WITH" => Some(Token::With),
109            "AND" => Some(Token::And),
110            "OR" => Some(Token::Or),
111            "IN" => Some(Token::In),
112            "NOT" => Some(Token::Not),
113            "BETWEEN" => Some(Token::Between),
114            "LIKE" => Some(Token::Like),
115            "IS" => Some(Token::Is),
116            "NULL" => Some(Token::Null),
117            "ORDER" => Some(Token::OrderBy),
118            "GROUP" => Some(Token::GroupBy),
119            "HAVING" => Some(Token::Having),
120            "AS" => Some(Token::As),
121            "ASC" => Some(Token::Asc),
122            "DESC" => Some(Token::Desc),
123            "LIMIT" => Some(Token::Limit),
124            "OFFSET" => Some(Token::Offset),
125            "DISTINCT" => Some(Token::Distinct),
126            "CASE" => Some(Token::Case),
127            "WHEN" => Some(Token::When),
128            "THEN" => Some(Token::Then),
129            "ELSE" => Some(Token::Else),
130            "END" => Some(Token::End),
131            "OVER" => Some(Token::Over),
132            "PARTITION" => Some(Token::Partition),
133            "BY" => Some(Token::By),
134            "ROWS" => Some(Token::Rows),
135            "RANGE" => Some(Token::Range),
136            "UNBOUNDED" => Some(Token::Unbounded),
137            "PRECEDING" => Some(Token::Preceding),
138            "FOLLOWING" => Some(Token::Following),
139            "CURRENT" => Some(Token::Current),
140            "ROW" => Some(Token::Row),
141            "UNION" => Some(Token::Union),
142            "INTERSECT" => Some(Token::Intersect),
143            "EXCEPT" => Some(Token::Except),
144            "WEB" => Some(Token::Web),
145            "JOIN" => Some(Token::Join),
146            "INNER" => Some(Token::Inner),
147            "LEFT" => Some(Token::Left),
148            "RIGHT" => Some(Token::Right),
149            "FULL" => Some(Token::Full),
150            "OUTER" => Some(Token::Outer),
151            "ON" => Some(Token::On),
152            "CROSS" => Some(Token::Cross),
153            _ => None,
154        }
155    }
156
157    /// Check if token is a logical operator
158    pub fn is_logical_operator(&self) -> bool {
159        matches!(self, Token::And | Token::Or)
160    }
161
162    /// Check if token is a join type
163    pub fn is_join_type(&self) -> bool {
164        matches!(
165            self,
166            Token::Inner | Token::Left | Token::Right | Token::Full | Token::Cross
167        )
168    }
169
170    /// Check if token ends a clause
171    pub fn is_clause_terminator(&self) -> bool {
172        matches!(
173            self,
174            Token::OrderBy
175                | Token::GroupBy
176                | Token::Having
177                | Token::Limit
178                | Token::Offset
179                | Token::Union
180                | Token::Intersect
181                | Token::Except
182        )
183    }
184
185    /// Get the string representation of a keyword token
186    pub fn as_keyword_str(&self) -> Option<&'static str> {
187        match self {
188            Token::Select => Some("SELECT"),
189            Token::From => Some("FROM"),
190            Token::Where => Some("WHERE"),
191            Token::With => Some("WITH"),
192            Token::And => Some("AND"),
193            Token::Or => Some("OR"),
194            Token::OrderBy => Some("ORDER BY"),
195            Token::GroupBy => Some("GROUP BY"),
196            Token::Having => Some("HAVING"),
197            // Add more as needed
198            _ => None,
199        }
200    }
201}
202
/// Character-by-character cursor over a SQL string that produces [`Token`]s.
#[derive(Clone, Debug)]
pub struct Lexer {
    /// The whole input as `char`s, so `position` is a character index.
    input: Vec<char>,
    /// Index into `input` of the character under the cursor.
    position: usize,
    /// The character at `position`, or `None` once the cursor is past the end.
    current_char: Option<char>,
}
209
210impl Lexer {
211    #[must_use]
212    pub fn new(input: &str) -> Self {
213        let chars: Vec<char> = input.chars().collect();
214        let current = chars.first().copied();
215        Self {
216            input: chars,
217            position: 0,
218            current_char: current,
219        }
220    }
221
222    fn advance(&mut self) {
223        self.position += 1;
224        self.current_char = self.input.get(self.position).copied();
225    }
226
227    fn peek(&self, offset: usize) -> Option<char> {
228        self.input.get(self.position + offset).copied()
229    }
230
231    /// Peek ahead n characters and return as a string
232    fn peek_string(&self, n: usize) -> String {
233        let mut result = String::new();
234        for i in 0..n {
235            if let Some(ch) = self.input.get(self.position + i) {
236                result.push(*ch);
237            } else {
238                break;
239            }
240        }
241        result
242    }
243
244    /// Read a JSON block delimited by $JSON$...$JSON$
245    /// Consumes the opening delimiter and reads until closing $JSON$
246    fn read_json_block(&mut self) -> String {
247        let mut result = String::new();
248
249        // Skip opening $JSON$
250        for _ in 0..6 {
251            self.advance();
252        }
253
254        // Read until we find closing $JSON$
255        while let Some(ch) = self.current_char {
256            // Check if we're at the closing delimiter
257            if ch == '$' && self.peek_string(6) == "$JSON$" {
258                // Skip closing $JSON$
259                for _ in 0..6 {
260                    self.advance();
261                }
262                break;
263            }
264            result.push(ch);
265            self.advance();
266        }
267
268        result
269    }
270
271    fn skip_whitespace(&mut self) {
272        while let Some(ch) = self.current_char {
273            if ch.is_whitespace() {
274                self.advance();
275            } else {
276                break;
277            }
278        }
279    }
280
281    fn skip_whitespace_and_comments(&mut self) {
282        loop {
283            // Skip whitespace
284            while let Some(ch) = self.current_char {
285                if ch.is_whitespace() {
286                    self.advance();
287                } else {
288                    break;
289                }
290            }
291
292            // Check for comments
293            match self.current_char {
294                Some('-') if self.peek(1) == Some('-') => {
295                    // Single-line comment: skip until end of line
296                    self.advance(); // skip first '-'
297                    self.advance(); // skip second '-'
298                    while let Some(ch) = self.current_char {
299                        self.advance();
300                        if ch == '\n' {
301                            break;
302                        }
303                    }
304                }
305                Some('/') if self.peek(1) == Some('*') => {
306                    // Multi-line comment: skip until */
307                    self.advance(); // skip '/'
308                    self.advance(); // skip '*'
309                    while let Some(ch) = self.current_char {
310                        if ch == '*' && self.peek(1) == Some('/') {
311                            self.advance(); // skip '*'
312                            self.advance(); // skip '/'
313                            break;
314                        }
315                        self.advance();
316                    }
317                }
318                _ => {
319                    // No more comments or whitespace
320                    break;
321                }
322            }
323        }
324    }
325
326    fn read_identifier(&mut self) -> String {
327        let mut result = String::new();
328        while let Some(ch) = self.current_char {
329            if ch.is_alphanumeric() || ch == '_' {
330                result.push(ch);
331                self.advance();
332            } else {
333                break;
334            }
335        }
336        result
337    }
338
339    fn read_string(&mut self) -> String {
340        let mut result = String::new();
341        let quote_char = self.current_char.unwrap(); // ' or "
342        self.advance(); // skip opening quote
343
344        while let Some(ch) = self.current_char {
345            if ch == quote_char {
346                self.advance(); // skip closing quote
347                break;
348            }
349            result.push(ch);
350            self.advance();
351        }
352        result
353    }
354
355    fn read_number(&mut self) -> String {
356        let mut result = String::new();
357        let has_e = false;
358
359        // Read the main number part (including decimal point)
360        while let Some(ch) = self.current_char {
361            if !has_e && (ch.is_numeric() || ch == '.') {
362                result.push(ch);
363                self.advance();
364            } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
365                // Handle scientific notation
366                result.push(ch);
367                self.advance();
368                let _ = has_e; // We don't allow multiple 'e' characters, so break after this
369
370                // Check for optional sign after 'e'
371                if let Some(sign) = self.current_char {
372                    if sign == '+' || sign == '-' {
373                        result.push(sign);
374                        self.advance();
375                    }
376                }
377
378                // Read exponent digits
379                while let Some(digit) = self.current_char {
380                    if digit.is_numeric() {
381                        result.push(digit);
382                        self.advance();
383                    } else {
384                        break;
385                    }
386                }
387                break; // Done reading the number
388            } else {
389                break;
390            }
391        }
392        result
393    }
394
395    pub fn next_token(&mut self) -> Token {
396        self.skip_whitespace_and_comments();
397
398        match self.current_char {
399            None => Token::Eof,
400            Some('*') => {
401                self.advance();
402                // Context-sensitive: could be SELECT * or multiplication
403                // The parser will distinguish based on context
404                Token::Star // We'll handle multiplication in parser
405            }
406            Some('+') => {
407                self.advance();
408                Token::Plus
409            }
410            Some('/') => {
411                // Check if this is a comment start
412                if self.peek(1) == Some('*') {
413                    // This shouldn't happen as comments are skipped above,
414                    // but handle it just in case
415                    self.skip_whitespace_and_comments();
416                    return self.next_token();
417                }
418                self.advance();
419                Token::Divide
420            }
421            Some('%') => {
422                self.advance();
423                Token::Modulo
424            }
425            Some('.') => {
426                self.advance();
427                Token::Dot
428            }
429            Some(',') => {
430                self.advance();
431                Token::Comma
432            }
433            Some(':') => {
434                self.advance();
435                Token::Colon
436            }
437            Some('(') => {
438                self.advance();
439                Token::LeftParen
440            }
441            Some(')') => {
442                self.advance();
443                Token::RightParen
444            }
445            Some('=') => {
446                self.advance();
447                Token::Equal
448            }
449            Some('<') => {
450                self.advance();
451                if self.current_char == Some('=') {
452                    self.advance();
453                    Token::LessThanOrEqual
454                } else if self.current_char == Some('>') {
455                    self.advance();
456                    Token::NotEqual
457                } else {
458                    Token::LessThan
459                }
460            }
461            Some('>') => {
462                self.advance();
463                if self.current_char == Some('=') {
464                    self.advance();
465                    Token::GreaterThanOrEqual
466                } else {
467                    Token::GreaterThan
468                }
469            }
470            Some('!') if self.peek(1) == Some('=') => {
471                self.advance();
472                self.advance();
473                Token::NotEqual
474            }
475            Some('|') if self.peek(1) == Some('|') => {
476                self.advance();
477                self.advance();
478                Token::Concat
479            }
480            Some('"') => {
481                // Double quotes = identifier
482                let ident_val = self.read_string();
483                Token::QuotedIdentifier(ident_val)
484            }
485            Some('$') => {
486                // Check if this is $JSON$ delimiter
487                if self.peek_string(6) == "$JSON$" {
488                    let json_content = self.read_json_block();
489                    Token::JsonBlock(json_content)
490                } else {
491                    // Not a JSON block, could be part of identifier or parameter
492                    // For now, treat as identifier start
493                    let ident = self.read_identifier();
494                    Token::Identifier(ident)
495                }
496            }
497            Some('\'') => {
498                // Single quotes = string literal
499                let string_val = self.read_string();
500                Token::StringLiteral(string_val)
501            }
502            Some('-') if self.peek(1) == Some('-') => {
503                // This is a comment, skip it and get next token
504                self.skip_whitespace_and_comments();
505                self.next_token()
506            }
507            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
508                // Handle negative numbers
509                self.advance(); // skip '-'
510                let num = self.read_number();
511                Token::NumberLiteral(format!("-{num}"))
512            }
513            Some('-') => {
514                // Handle subtraction operator
515                self.advance();
516                Token::Minus
517            }
518            Some(ch) if ch.is_numeric() => {
519                let num = self.read_number();
520                Token::NumberLiteral(num)
521            }
522            Some(ch) if ch.is_alphabetic() || ch == '_' => {
523                let ident = self.read_identifier();
524                match ident.to_uppercase().as_str() {
525                    "SELECT" => Token::Select,
526                    "FROM" => Token::From,
527                    "WHERE" => Token::Where,
528                    "WITH" => Token::With,
529                    "AND" => Token::And,
530                    "OR" => Token::Or,
531                    "IN" => Token::In,
532                    "NOT" => Token::Not,
533                    "BETWEEN" => Token::Between,
534                    "LIKE" => Token::Like,
535                    "IS" => Token::Is,
536                    "NULL" => Token::Null,
537                    "ORDER" if self.peek_keyword("BY") => {
538                        self.skip_whitespace();
539                        self.read_identifier(); // consume "BY"
540                        Token::OrderBy
541                    }
542                    "GROUP" if self.peek_keyword("BY") => {
543                        self.skip_whitespace();
544                        self.read_identifier(); // consume "BY"
545                        Token::GroupBy
546                    }
547                    "HAVING" => Token::Having,
548                    "AS" => Token::As,
549                    "ASC" => Token::Asc,
550                    "DESC" => Token::Desc,
551                    "LIMIT" => Token::Limit,
552                    "OFFSET" => Token::Offset,
553                    "DATETIME" => Token::DateTime,
554                    "CASE" => Token::Case,
555                    "WHEN" => Token::When,
556                    "THEN" => Token::Then,
557                    "ELSE" => Token::Else,
558                    "END" => Token::End,
559                    "DISTINCT" => Token::Distinct,
560                    "OVER" => Token::Over,
561                    "PARTITION" => Token::Partition,
562                    "BY" => Token::By,
563                    // Window frame keywords
564                    "ROWS" => Token::Rows,
565                    // Note: RANGE is context-sensitive - it's both a window frame keyword and a table function
566                    // We'll handle this in the parser based on context
567                    "UNBOUNDED" => Token::Unbounded,
568                    "PRECEDING" => Token::Preceding,
569                    "FOLLOWING" => Token::Following,
570                    "CURRENT" => Token::Current,
571                    "ROW" => Token::Row,
572                    // Set operation keywords
573                    "UNION" => Token::Union,
574                    "INTERSECT" => Token::Intersect,
575                    "EXCEPT" => Token::Except,
576                    // Special CTE keyword
577                    "WEB" => Token::Web,
578                    // JOIN keywords
579                    "JOIN" => Token::Join,
580                    "INNER" => Token::Inner,
581                    "LEFT" => Token::Left,
582                    "RIGHT" => Token::Right,
583                    "FULL" => Token::Full,
584                    "OUTER" => Token::Outer,
585                    "ON" => Token::On,
586                    "CROSS" => Token::Cross,
587                    _ => Token::Identifier(ident),
588                }
589            }
590            Some(ch) => {
591                self.advance();
592                Token::Identifier(ch.to_string())
593            }
594        }
595    }
596
597    fn peek_keyword(&mut self, keyword: &str) -> bool {
598        let saved_pos = self.position;
599        let saved_char = self.current_char;
600
601        self.skip_whitespace_and_comments();
602        let next_word = self.read_identifier();
603        let matches = next_word.to_uppercase() == keyword;
604
605        // Restore position
606        self.position = saved_pos;
607        self.current_char = saved_char;
608
609        matches
610    }
611
612    #[must_use]
613    pub fn get_position(&self) -> usize {
614        self.position
615    }
616
617    pub fn tokenize_all(&mut self) -> Vec<Token> {
618        let mut tokens = Vec::new();
619        loop {
620            let token = self.next_token();
621            if matches!(token, Token::Eof) {
622                tokens.push(token);
623                break;
624            }
625            tokens.push(token);
626        }
627        tokens
628    }
629
630    pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
631        let mut tokens = Vec::new();
632        loop {
633            self.skip_whitespace_and_comments();
634            let start_pos = self.position;
635            let token = self.next_token();
636            let end_pos = self.position;
637
638            if matches!(token, Token::Eof) {
639                break;
640            }
641            tokens.push((start_pos, end_pos, token));
642        }
643        tokens
644    }
645}