// sql_cli/sql/parser/lexer.rs
1//! SQL Lexer - Tokenization of SQL queries
2//!
3//! This module handles the conversion of raw SQL text into tokens
4//! that can be consumed by the parser.
5
/// A single lexical token produced by [`Lexer`].
///
/// Keyword variants are recognized case-insensitively; literal variants
/// carry their raw text as a `String` (numbers are not parsed here —
/// validation is left to the parser).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    With, // WITH clause for CTEs
    And,
    Or,
    In,
    Not,
    Between,
    Like,
    Is,
    Null,
    OrderBy,
    GroupBy,
    Having,
    As,
    Asc,
    Desc,
    Limit,
    Offset,
    DateTime,  // DateTime constructor
    Case,      // CASE expression
    When,      // WHEN clause
    Then,      // THEN clause
    Else,      // ELSE clause
    End,       // END keyword
    Distinct,  // DISTINCT keyword for aggregate functions
    Over,      // OVER keyword for window functions
    Partition, // PARTITION keyword for window functions
    By,        // BY keyword (used with PARTITION BY, ORDER BY)

    // Window frame keywords
    Rows,      // ROWS frame type
    Range,     // RANGE frame type
    Unbounded, // UNBOUNDED for frame bounds
    Preceding, // PRECEDING for frame bounds
    Following, // FOLLOWING for frame bounds
    Current,   // CURRENT for CURRENT ROW
    Row,       // ROW for CURRENT ROW

    // Set operation keywords
    Union,     // UNION
    Intersect, // INTERSECT
    Except,    // EXCEPT

    // Special CTE keyword
    Web, // WEB (for WEB CTEs)

    // JOIN keywords
    Join,  // JOIN keyword
    Inner, // INNER JOIN
    Left,  // LEFT JOIN
    Right, // RIGHT JOIN
    Full,  // FULL JOIN
    Outer, // OUTER keyword (LEFT OUTER, RIGHT OUTER, FULL OUTER)
    On,    // ON keyword for join conditions
    Cross, // CROSS JOIN

    // Literals
    Identifier(String),
    QuotedIdentifier(String), // For "Customer Id" style identifiers
    StringLiteral(String),
    NumberLiteral(String),
    Star,

    // Operators
    Dot,
    Comma,
    Colon,
    LeftParen,
    RightParen,
    Equal,
    NotEqual,
    LessThan,
    GreaterThan,
    LessThanOrEqual,
    GreaterThanOrEqual,

    // Arithmetic operators
    Plus,
    Minus,
    Divide,
    Modulo,

    // String operators
    Concat, // || for string concatenation

    // Special
    Eof,
}

impl Token {
    /// Check if a string is a SQL keyword (case-insensitive) and return
    /// the corresponding token, or `None` for a plain identifier.
    ///
    /// Note: `"ORDER"` / `"GROUP"` map directly to `OrderBy` / `GroupBy`
    /// here (without requiring a following "BY"); the lexer itself only
    /// emits those tokens when "BY" actually follows.
    pub fn from_keyword(s: &str) -> Option<Token> {
        match s.to_uppercase().as_str() {
            "SELECT" => Some(Token::Select),
            "FROM" => Some(Token::From),
            "WHERE" => Some(Token::Where),
            "WITH" => Some(Token::With),
            "AND" => Some(Token::And),
            "OR" => Some(Token::Or),
            "IN" => Some(Token::In),
            "NOT" => Some(Token::Not),
            "BETWEEN" => Some(Token::Between),
            "LIKE" => Some(Token::Like),
            "IS" => Some(Token::Is),
            "NULL" => Some(Token::Null),
            "ORDER" => Some(Token::OrderBy),
            "GROUP" => Some(Token::GroupBy),
            "HAVING" => Some(Token::Having),
            "AS" => Some(Token::As),
            "ASC" => Some(Token::Asc),
            "DESC" => Some(Token::Desc),
            "LIMIT" => Some(Token::Limit),
            "OFFSET" => Some(Token::Offset),
            // Keep in sync with the lexer's keyword match in `next_token`.
            "DATETIME" => Some(Token::DateTime),
            "DISTINCT" => Some(Token::Distinct),
            "CASE" => Some(Token::Case),
            "WHEN" => Some(Token::When),
            "THEN" => Some(Token::Then),
            "ELSE" => Some(Token::Else),
            "END" => Some(Token::End),
            "OVER" => Some(Token::Over),
            "PARTITION" => Some(Token::Partition),
            "BY" => Some(Token::By),
            "ROWS" => Some(Token::Rows),
            "RANGE" => Some(Token::Range),
            "UNBOUNDED" => Some(Token::Unbounded),
            "PRECEDING" => Some(Token::Preceding),
            "FOLLOWING" => Some(Token::Following),
            "CURRENT" => Some(Token::Current),
            "ROW" => Some(Token::Row),
            "UNION" => Some(Token::Union),
            "INTERSECT" => Some(Token::Intersect),
            "EXCEPT" => Some(Token::Except),
            "WEB" => Some(Token::Web),
            "JOIN" => Some(Token::Join),
            "INNER" => Some(Token::Inner),
            "LEFT" => Some(Token::Left),
            "RIGHT" => Some(Token::Right),
            "FULL" => Some(Token::Full),
            "OUTER" => Some(Token::Outer),
            "ON" => Some(Token::On),
            "CROSS" => Some(Token::Cross),
            _ => None,
        }
    }

    /// Check if token is a logical operator (AND / OR).
    pub fn is_logical_operator(&self) -> bool {
        matches!(self, Token::And | Token::Or)
    }

    /// Check if token is a join type qualifier (INNER / LEFT / RIGHT / FULL / CROSS).
    pub fn is_join_type(&self) -> bool {
        matches!(
            self,
            Token::Inner | Token::Left | Token::Right | Token::Full | Token::Cross
        )
    }

    /// Check if token begins a clause that terminates the preceding one
    /// (ORDER BY, GROUP BY, HAVING, LIMIT, OFFSET, or a set operation).
    pub fn is_clause_terminator(&self) -> bool {
        matches!(
            self,
            Token::OrderBy
                | Token::GroupBy
                | Token::Having
                | Token::Limit
                | Token::Offset
                | Token::Union
                | Token::Intersect
                | Token::Except
        )
    }

    /// Get the canonical uppercase SQL text of a keyword token, or `None`
    /// for tokens without a fixed spelling (literals, operators, and
    /// keywords not yet listed here).
    pub fn as_keyword_str(&self) -> Option<&'static str> {
        match self {
            Token::Select => Some("SELECT"),
            Token::From => Some("FROM"),
            Token::Where => Some("WHERE"),
            Token::With => Some("WITH"),
            Token::And => Some("AND"),
            Token::Or => Some("OR"),
            Token::OrderBy => Some("ORDER BY"),
            Token::GroupBy => Some("GROUP BY"),
            Token::Having => Some("HAVING"),
            // Add more as needed
            _ => None,
        }
    }
}
201
/// Character-level cursor over a SQL query string.
///
/// The input is materialized as a `Vec<char>` so lookahead (`peek`) and
/// position save/restore are O(1) by index, independent of UTF-8 byte
/// widths.
#[derive(Debug, Clone)]
pub struct Lexer {
    // Entire input as individual chars, for indexed access.
    input: Vec<char>,
    // Index into `input` of the character currently being examined.
    position: usize,
    // Cached `input[position]`; `None` once the cursor is past the end.
    current_char: Option<char>,
}
208
209impl Lexer {
210    #[must_use]
211    pub fn new(input: &str) -> Self {
212        let chars: Vec<char> = input.chars().collect();
213        let current = chars.first().copied();
214        Self {
215            input: chars,
216            position: 0,
217            current_char: current,
218        }
219    }
220
221    fn advance(&mut self) {
222        self.position += 1;
223        self.current_char = self.input.get(self.position).copied();
224    }
225
226    fn peek(&self, offset: usize) -> Option<char> {
227        self.input.get(self.position + offset).copied()
228    }
229
230    fn skip_whitespace(&mut self) {
231        while let Some(ch) = self.current_char {
232            if ch.is_whitespace() {
233                self.advance();
234            } else {
235                break;
236            }
237        }
238    }
239
240    fn skip_whitespace_and_comments(&mut self) {
241        loop {
242            // Skip whitespace
243            while let Some(ch) = self.current_char {
244                if ch.is_whitespace() {
245                    self.advance();
246                } else {
247                    break;
248                }
249            }
250
251            // Check for comments
252            match self.current_char {
253                Some('-') if self.peek(1) == Some('-') => {
254                    // Single-line comment: skip until end of line
255                    self.advance(); // skip first '-'
256                    self.advance(); // skip second '-'
257                    while let Some(ch) = self.current_char {
258                        self.advance();
259                        if ch == '\n' {
260                            break;
261                        }
262                    }
263                }
264                Some('/') if self.peek(1) == Some('*') => {
265                    // Multi-line comment: skip until */
266                    self.advance(); // skip '/'
267                    self.advance(); // skip '*'
268                    while let Some(ch) = self.current_char {
269                        if ch == '*' && self.peek(1) == Some('/') {
270                            self.advance(); // skip '*'
271                            self.advance(); // skip '/'
272                            break;
273                        }
274                        self.advance();
275                    }
276                }
277                _ => {
278                    // No more comments or whitespace
279                    break;
280                }
281            }
282        }
283    }
284
285    fn read_identifier(&mut self) -> String {
286        let mut result = String::new();
287        while let Some(ch) = self.current_char {
288            if ch.is_alphanumeric() || ch == '_' {
289                result.push(ch);
290                self.advance();
291            } else {
292                break;
293            }
294        }
295        result
296    }
297
298    fn read_string(&mut self) -> String {
299        let mut result = String::new();
300        let quote_char = self.current_char.unwrap(); // ' or "
301        self.advance(); // skip opening quote
302
303        while let Some(ch) = self.current_char {
304            if ch == quote_char {
305                self.advance(); // skip closing quote
306                break;
307            }
308            result.push(ch);
309            self.advance();
310        }
311        result
312    }
313
314    fn read_number(&mut self) -> String {
315        let mut result = String::new();
316        let mut has_e = false;
317
318        // Read the main number part (including decimal point)
319        while let Some(ch) = self.current_char {
320            if !has_e && (ch.is_numeric() || ch == '.') {
321                result.push(ch);
322                self.advance();
323            } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
324                // Handle scientific notation
325                result.push(ch);
326                self.advance();
327                has_e = true;
328
329                // Check for optional sign after 'e'
330                if let Some(sign) = self.current_char {
331                    if sign == '+' || sign == '-' {
332                        result.push(sign);
333                        self.advance();
334                    }
335                }
336
337                // Read exponent digits
338                while let Some(digit) = self.current_char {
339                    if digit.is_numeric() {
340                        result.push(digit);
341                        self.advance();
342                    } else {
343                        break;
344                    }
345                }
346                break; // Done reading the number
347            } else {
348                break;
349            }
350        }
351        result
352    }
353
354    pub fn next_token(&mut self) -> Token {
355        self.skip_whitespace_and_comments();
356
357        match self.current_char {
358            None => Token::Eof,
359            Some('*') => {
360                self.advance();
361                // Context-sensitive: could be SELECT * or multiplication
362                // The parser will distinguish based on context
363                Token::Star // We'll handle multiplication in parser
364            }
365            Some('+') => {
366                self.advance();
367                Token::Plus
368            }
369            Some('/') => {
370                // Check if this is a comment start
371                if self.peek(1) == Some('*') {
372                    // This shouldn't happen as comments are skipped above,
373                    // but handle it just in case
374                    self.skip_whitespace_and_comments();
375                    return self.next_token();
376                }
377                self.advance();
378                Token::Divide
379            }
380            Some('%') => {
381                self.advance();
382                Token::Modulo
383            }
384            Some('.') => {
385                self.advance();
386                Token::Dot
387            }
388            Some(',') => {
389                self.advance();
390                Token::Comma
391            }
392            Some(':') => {
393                self.advance();
394                Token::Colon
395            }
396            Some('(') => {
397                self.advance();
398                Token::LeftParen
399            }
400            Some(')') => {
401                self.advance();
402                Token::RightParen
403            }
404            Some('=') => {
405                self.advance();
406                Token::Equal
407            }
408            Some('<') => {
409                self.advance();
410                if self.current_char == Some('=') {
411                    self.advance();
412                    Token::LessThanOrEqual
413                } else if self.current_char == Some('>') {
414                    self.advance();
415                    Token::NotEqual
416                } else {
417                    Token::LessThan
418                }
419            }
420            Some('>') => {
421                self.advance();
422                if self.current_char == Some('=') {
423                    self.advance();
424                    Token::GreaterThanOrEqual
425                } else {
426                    Token::GreaterThan
427                }
428            }
429            Some('!') if self.peek(1) == Some('=') => {
430                self.advance();
431                self.advance();
432                Token::NotEqual
433            }
434            Some('|') if self.peek(1) == Some('|') => {
435                self.advance();
436                self.advance();
437                Token::Concat
438            }
439            Some('"') => {
440                // Double quotes = identifier
441                let ident_val = self.read_string();
442                Token::QuotedIdentifier(ident_val)
443            }
444            Some('\'') => {
445                // Single quotes = string literal
446                let string_val = self.read_string();
447                Token::StringLiteral(string_val)
448            }
449            Some('-') if self.peek(1) == Some('-') => {
450                // This is a comment, skip it and get next token
451                self.skip_whitespace_and_comments();
452                self.next_token()
453            }
454            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
455                // Handle negative numbers
456                self.advance(); // skip '-'
457                let num = self.read_number();
458                Token::NumberLiteral(format!("-{num}"))
459            }
460            Some('-') => {
461                // Handle subtraction operator
462                self.advance();
463                Token::Minus
464            }
465            Some(ch) if ch.is_numeric() => {
466                let num = self.read_number();
467                Token::NumberLiteral(num)
468            }
469            Some(ch) if ch.is_alphabetic() || ch == '_' => {
470                let ident = self.read_identifier();
471                match ident.to_uppercase().as_str() {
472                    "SELECT" => Token::Select,
473                    "FROM" => Token::From,
474                    "WHERE" => Token::Where,
475                    "WITH" => Token::With,
476                    "AND" => Token::And,
477                    "OR" => Token::Or,
478                    "IN" => Token::In,
479                    "NOT" => Token::Not,
480                    "BETWEEN" => Token::Between,
481                    "LIKE" => Token::Like,
482                    "IS" => Token::Is,
483                    "NULL" => Token::Null,
484                    "ORDER" if self.peek_keyword("BY") => {
485                        self.skip_whitespace();
486                        self.read_identifier(); // consume "BY"
487                        Token::OrderBy
488                    }
489                    "GROUP" if self.peek_keyword("BY") => {
490                        self.skip_whitespace();
491                        self.read_identifier(); // consume "BY"
492                        Token::GroupBy
493                    }
494                    "HAVING" => Token::Having,
495                    "AS" => Token::As,
496                    "ASC" => Token::Asc,
497                    "DESC" => Token::Desc,
498                    "LIMIT" => Token::Limit,
499                    "OFFSET" => Token::Offset,
500                    "DATETIME" => Token::DateTime,
501                    "CASE" => Token::Case,
502                    "WHEN" => Token::When,
503                    "THEN" => Token::Then,
504                    "ELSE" => Token::Else,
505                    "END" => Token::End,
506                    "DISTINCT" => Token::Distinct,
507                    "OVER" => Token::Over,
508                    "PARTITION" => Token::Partition,
509                    "BY" => Token::By,
510                    // Window frame keywords
511                    "ROWS" => Token::Rows,
512                    // Note: RANGE is context-sensitive - it's both a window frame keyword and a table function
513                    // We'll handle this in the parser based on context
514                    "UNBOUNDED" => Token::Unbounded,
515                    "PRECEDING" => Token::Preceding,
516                    "FOLLOWING" => Token::Following,
517                    "CURRENT" => Token::Current,
518                    "ROW" => Token::Row,
519                    // Set operation keywords
520                    "UNION" => Token::Union,
521                    "INTERSECT" => Token::Intersect,
522                    "EXCEPT" => Token::Except,
523                    // Special CTE keyword
524                    "WEB" => Token::Web,
525                    // JOIN keywords
526                    "JOIN" => Token::Join,
527                    "INNER" => Token::Inner,
528                    "LEFT" => Token::Left,
529                    "RIGHT" => Token::Right,
530                    "FULL" => Token::Full,
531                    "OUTER" => Token::Outer,
532                    "ON" => Token::On,
533                    "CROSS" => Token::Cross,
534                    _ => Token::Identifier(ident),
535                }
536            }
537            Some(ch) => {
538                self.advance();
539                Token::Identifier(ch.to_string())
540            }
541        }
542    }
543
544    fn peek_keyword(&mut self, keyword: &str) -> bool {
545        let saved_pos = self.position;
546        let saved_char = self.current_char;
547
548        self.skip_whitespace_and_comments();
549        let next_word = self.read_identifier();
550        let matches = next_word.to_uppercase() == keyword;
551
552        // Restore position
553        self.position = saved_pos;
554        self.current_char = saved_char;
555
556        matches
557    }
558
559    #[must_use]
560    pub fn get_position(&self) -> usize {
561        self.position
562    }
563
564    pub fn tokenize_all(&mut self) -> Vec<Token> {
565        let mut tokens = Vec::new();
566        loop {
567            let token = self.next_token();
568            if matches!(token, Token::Eof) {
569                tokens.push(token);
570                break;
571            }
572            tokens.push(token);
573        }
574        tokens
575    }
576
577    pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
578        let mut tokens = Vec::new();
579        loop {
580            self.skip_whitespace_and_comments();
581            let start_pos = self.position;
582            let token = self.next_token();
583            let end_pos = self.position;
584
585            if matches!(token, Token::Eof) {
586                break;
587            }
588            tokens.push((start_pos, end_pos, token));
589        }
590        tokens
591    }
592}