sql_cli/sql/parser/
lexer.rs

//! SQL Lexer - Tokenization of SQL queries
//!
//! This module handles the conversion of raw SQL text into tokens
//! that can be consumed by the parser.

/// A lexical token of the SQL dialect handled by this module.
///
/// Keyword and operator variants are unit variants. Literal variants carry
/// the text they were built from as an owned `String`.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    With, // WITH clause for CTEs
    And,
    Or,
    In,
    Not,
    Between,
    Like,
    Is,
    Null,
    OrderBy, // ORDER BY, represented as a single token
    GroupBy, // GROUP BY, represented as a single token
    Having,
    As,
    Asc,
    Desc,
    Limit,
    Offset,
    DateTime,  // DateTime constructor
    Case,      // CASE expression
    When,      // WHEN clause
    Then,      // THEN clause
    Else,      // ELSE clause
    End,       // END keyword
    Distinct,  // DISTINCT keyword for aggregate functions
    Over,      // OVER keyword for window functions
    Partition, // PARTITION keyword for window functions
    By,        // BY keyword (used with PARTITION BY, ORDER BY)

    // Window frame keywords
    Rows,      // ROWS frame type
    Range,     // RANGE frame type
    Unbounded, // UNBOUNDED for frame bounds
    Preceding, // PRECEDING for frame bounds
    Following, // FOLLOWING for frame bounds
    Current,   // CURRENT for CURRENT ROW
    Row,       // ROW for CURRENT ROW

    // Set operation keywords
    Union,     // UNION
    Intersect, // INTERSECT
    Except,    // EXCEPT

    // Special CTE keyword
    Web, // WEB (for WEB CTEs)

    // Row expansion functions
    Unnest, // UNNEST (for expanding delimited strings into rows)

    // JOIN keywords
    Join,  // JOIN keyword
    Inner, // INNER JOIN
    Left,  // LEFT JOIN
    Right, // RIGHT JOIN
    Full,  // FULL JOIN
    Outer, // OUTER keyword (LEFT OUTER, RIGHT OUTER, FULL OUTER)
    On,    // ON keyword for join conditions
    Cross, // CROSS JOIN

    // Literals
    Identifier(String),
    QuotedIdentifier(String), // For "Customer Id" style identifiers
    StringLiteral(String),
    JsonBlock(String), // For $JSON$...$JSON$ delimited blocks
    NumberLiteral(String),
    Star,

    // Operators
    Dot,
    Comma,
    Colon,
    LeftParen,
    RightParen,
    Equal,
    NotEqual,
    LessThan,
    GreaterThan,
    LessThanOrEqual,
    GreaterThanOrEqual,

    // Arithmetic operators
    Plus,
    Minus,
    Divide,
    Modulo,

    // String operators
    Concat, // || for string concatenation

    // Special
    Eof,
}

impl Token {
    /// Look up the keyword token for a single word, ignoring case.
    ///
    /// Returns `None` when `s` is not a keyword. The lookup sees one word
    /// only, so `ORDER` and `GROUP` map straight to [`Token::OrderBy`] and
    /// [`Token::GroupBy`] without checking for a following `BY`.
    pub fn from_keyword(s: &str) -> Option<Token> {
        let token = match s.to_uppercase().as_str() {
            "SELECT" => Token::Select,
            "FROM" => Token::From,
            "WHERE" => Token::Where,
            "WITH" => Token::With,
            "AND" => Token::And,
            "OR" => Token::Or,
            "IN" => Token::In,
            "NOT" => Token::Not,
            "BETWEEN" => Token::Between,
            "LIKE" => Token::Like,
            "IS" => Token::Is,
            "NULL" => Token::Null,
            "ORDER" => Token::OrderBy,
            "GROUP" => Token::GroupBy,
            "HAVING" => Token::Having,
            "AS" => Token::As,
            "ASC" => Token::Asc,
            "DESC" => Token::Desc,
            "LIMIT" => Token::Limit,
            "OFFSET" => Token::Offset,
            // Previously missing even though `Token::DateTime` is a keyword
            // variant; without it the lookup disagreed with the lexer.
            "DATETIME" => Token::DateTime,
            "DISTINCT" => Token::Distinct,
            "CASE" => Token::Case,
            "WHEN" => Token::When,
            "THEN" => Token::Then,
            "ELSE" => Token::Else,
            "END" => Token::End,
            "OVER" => Token::Over,
            "PARTITION" => Token::Partition,
            "BY" => Token::By,
            "ROWS" => Token::Rows,
            "RANGE" => Token::Range,
            "UNBOUNDED" => Token::Unbounded,
            "PRECEDING" => Token::Preceding,
            "FOLLOWING" => Token::Following,
            "CURRENT" => Token::Current,
            "ROW" => Token::Row,
            "UNION" => Token::Union,
            "INTERSECT" => Token::Intersect,
            "EXCEPT" => Token::Except,
            "WEB" => Token::Web,
            "UNNEST" => Token::Unnest,
            "JOIN" => Token::Join,
            "INNER" => Token::Inner,
            "LEFT" => Token::Left,
            "RIGHT" => Token::Right,
            "FULL" => Token::Full,
            "OUTER" => Token::Outer,
            "ON" => Token::On,
            "CROSS" => Token::Cross,
            _ => return None,
        };
        Some(token)
    }

    /// Returns `true` for the logical connectives `AND` and `OR`.
    pub fn is_logical_operator(&self) -> bool {
        matches!(self, Token::And | Token::Or)
    }

    /// Returns `true` for the join qualifiers `INNER`, `LEFT`, `RIGHT`,
    /// `FULL` and `CROSS`. `JOIN` and `OUTER` themselves are not included.
    pub fn is_join_type(&self) -> bool {
        matches!(
            self,
            Token::Inner | Token::Left | Token::Right | Token::Full | Token::Cross
        )
    }

    /// Returns `true` for tokens that end the preceding clause:
    /// `ORDER BY`, `GROUP BY`, `HAVING`, `LIMIT`, `OFFSET` and the set
    /// operators `UNION`, `INTERSECT`, `EXCEPT`.
    pub fn is_clause_terminator(&self) -> bool {
        matches!(
            self,
            Token::OrderBy
                | Token::GroupBy
                | Token::Having
                | Token::Limit
                | Token::Offset
                | Token::Union
                | Token::Intersect
                | Token::Except
        )
    }

    /// Canonical upper-case spelling of a keyword token.
    ///
    /// Only a subset of keywords is covered (`SELECT`, `FROM`, `WHERE`,
    /// `WITH`, `AND`, `OR`, `ORDER BY`, `GROUP BY`, `HAVING`); every other
    /// token, keyword or not, yields `None`.
    pub fn as_keyword_str(&self) -> Option<&'static str> {
        match self {
            Token::Select => Some("SELECT"),
            Token::From => Some("FROM"),
            Token::Where => Some("WHERE"),
            Token::With => Some("WITH"),
            Token::And => Some("AND"),
            Token::Or => Some("OR"),
            Token::OrderBy => Some("ORDER BY"),
            Token::GroupBy => Some("GROUP BY"),
            Token::Having => Some("HAVING"),
            // Add more as needed
            _ => None,
        }
    }
}
206
207#[derive(Debug, Clone)]
208pub struct Lexer {
209    input: Vec<char>,
210    position: usize,
211    current_char: Option<char>,
212}
213
214impl Lexer {
215    #[must_use]
216    pub fn new(input: &str) -> Self {
217        let chars: Vec<char> = input.chars().collect();
218        let current = chars.first().copied();
219        Self {
220            input: chars,
221            position: 0,
222            current_char: current,
223        }
224    }
225
226    fn advance(&mut self) {
227        self.position += 1;
228        self.current_char = self.input.get(self.position).copied();
229    }
230
231    fn peek(&self, offset: usize) -> Option<char> {
232        self.input.get(self.position + offset).copied()
233    }
234
235    /// Peek ahead n characters and return as a string
236    fn peek_string(&self, n: usize) -> String {
237        let mut result = String::new();
238        for i in 0..n {
239            if let Some(ch) = self.input.get(self.position + i) {
240                result.push(*ch);
241            } else {
242                break;
243            }
244        }
245        result
246    }
247
248    /// Read a JSON block delimited by $JSON$...$JSON$
249    /// Consumes the opening delimiter and reads until closing $JSON$
250    fn read_json_block(&mut self) -> String {
251        let mut result = String::new();
252
253        // Skip opening $JSON$
254        for _ in 0..6 {
255            self.advance();
256        }
257
258        // Read until we find closing $JSON$
259        while let Some(ch) = self.current_char {
260            // Check if we're at the closing delimiter
261            if ch == '$' && self.peek_string(6) == "$JSON$" {
262                // Skip closing $JSON$
263                for _ in 0..6 {
264                    self.advance();
265                }
266                break;
267            }
268            result.push(ch);
269            self.advance();
270        }
271
272        result
273    }
274
275    fn skip_whitespace(&mut self) {
276        while let Some(ch) = self.current_char {
277            if ch.is_whitespace() {
278                self.advance();
279            } else {
280                break;
281            }
282        }
283    }
284
285    fn skip_whitespace_and_comments(&mut self) {
286        loop {
287            // Skip whitespace
288            while let Some(ch) = self.current_char {
289                if ch.is_whitespace() {
290                    self.advance();
291                } else {
292                    break;
293                }
294            }
295
296            // Check for comments
297            match self.current_char {
298                Some('-') if self.peek(1) == Some('-') => {
299                    // Single-line comment: skip until end of line
300                    self.advance(); // skip first '-'
301                    self.advance(); // skip second '-'
302                    while let Some(ch) = self.current_char {
303                        self.advance();
304                        if ch == '\n' {
305                            break;
306                        }
307                    }
308                }
309                Some('/') if self.peek(1) == Some('*') => {
310                    // Multi-line comment: skip until */
311                    self.advance(); // skip '/'
312                    self.advance(); // skip '*'
313                    while let Some(ch) = self.current_char {
314                        if ch == '*' && self.peek(1) == Some('/') {
315                            self.advance(); // skip '*'
316                            self.advance(); // skip '/'
317                            break;
318                        }
319                        self.advance();
320                    }
321                }
322                _ => {
323                    // No more comments or whitespace
324                    break;
325                }
326            }
327        }
328    }
329
330    fn read_identifier(&mut self) -> String {
331        let mut result = String::new();
332        while let Some(ch) = self.current_char {
333            if ch.is_alphanumeric() || ch == '_' {
334                result.push(ch);
335                self.advance();
336            } else {
337                break;
338            }
339        }
340        result
341    }
342
343    fn read_string(&mut self) -> String {
344        let mut result = String::new();
345        let quote_char = self.current_char.unwrap(); // ' or "
346        self.advance(); // skip opening quote
347
348        while let Some(ch) = self.current_char {
349            if ch == quote_char {
350                self.advance(); // skip closing quote
351                break;
352            }
353            result.push(ch);
354            self.advance();
355        }
356        result
357    }
358
359    fn read_number(&mut self) -> String {
360        let mut result = String::new();
361        let has_e = false;
362
363        // Read the main number part (including decimal point)
364        while let Some(ch) = self.current_char {
365            if !has_e && (ch.is_numeric() || ch == '.') {
366                result.push(ch);
367                self.advance();
368            } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
369                // Handle scientific notation
370                result.push(ch);
371                self.advance();
372                let _ = has_e; // We don't allow multiple 'e' characters, so break after this
373
374                // Check for optional sign after 'e'
375                if let Some(sign) = self.current_char {
376                    if sign == '+' || sign == '-' {
377                        result.push(sign);
378                        self.advance();
379                    }
380                }
381
382                // Read exponent digits
383                while let Some(digit) = self.current_char {
384                    if digit.is_numeric() {
385                        result.push(digit);
386                        self.advance();
387                    } else {
388                        break;
389                    }
390                }
391                break; // Done reading the number
392            } else {
393                break;
394            }
395        }
396        result
397    }
398
399    pub fn next_token(&mut self) -> Token {
400        self.skip_whitespace_and_comments();
401
402        match self.current_char {
403            None => Token::Eof,
404            Some('*') => {
405                self.advance();
406                // Context-sensitive: could be SELECT * or multiplication
407                // The parser will distinguish based on context
408                Token::Star // We'll handle multiplication in parser
409            }
410            Some('+') => {
411                self.advance();
412                Token::Plus
413            }
414            Some('/') => {
415                // Check if this is a comment start
416                if self.peek(1) == Some('*') {
417                    // This shouldn't happen as comments are skipped above,
418                    // but handle it just in case
419                    self.skip_whitespace_and_comments();
420                    return self.next_token();
421                }
422                self.advance();
423                Token::Divide
424            }
425            Some('%') => {
426                self.advance();
427                Token::Modulo
428            }
429            Some('.') => {
430                self.advance();
431                Token::Dot
432            }
433            Some(',') => {
434                self.advance();
435                Token::Comma
436            }
437            Some(':') => {
438                self.advance();
439                Token::Colon
440            }
441            Some('(') => {
442                self.advance();
443                Token::LeftParen
444            }
445            Some(')') => {
446                self.advance();
447                Token::RightParen
448            }
449            Some('=') => {
450                self.advance();
451                Token::Equal
452            }
453            Some('<') => {
454                self.advance();
455                if self.current_char == Some('=') {
456                    self.advance();
457                    Token::LessThanOrEqual
458                } else if self.current_char == Some('>') {
459                    self.advance();
460                    Token::NotEqual
461                } else {
462                    Token::LessThan
463                }
464            }
465            Some('>') => {
466                self.advance();
467                if self.current_char == Some('=') {
468                    self.advance();
469                    Token::GreaterThanOrEqual
470                } else {
471                    Token::GreaterThan
472                }
473            }
474            Some('!') if self.peek(1) == Some('=') => {
475                self.advance();
476                self.advance();
477                Token::NotEqual
478            }
479            Some('|') if self.peek(1) == Some('|') => {
480                self.advance();
481                self.advance();
482                Token::Concat
483            }
484            Some('"') => {
485                // Double quotes = identifier
486                let ident_val = self.read_string();
487                Token::QuotedIdentifier(ident_val)
488            }
489            Some('$') => {
490                // Check if this is $JSON$ delimiter
491                if self.peek_string(6) == "$JSON$" {
492                    let json_content = self.read_json_block();
493                    Token::JsonBlock(json_content)
494                } else {
495                    // Not a JSON block, could be part of identifier or parameter
496                    // For now, treat as identifier start
497                    let ident = self.read_identifier();
498                    Token::Identifier(ident)
499                }
500            }
501            Some('\'') => {
502                // Single quotes = string literal
503                let string_val = self.read_string();
504                Token::StringLiteral(string_val)
505            }
506            Some('-') if self.peek(1) == Some('-') => {
507                // This is a comment, skip it and get next token
508                self.skip_whitespace_and_comments();
509                self.next_token()
510            }
511            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
512                // Handle negative numbers
513                self.advance(); // skip '-'
514                let num = self.read_number();
515                Token::NumberLiteral(format!("-{num}"))
516            }
517            Some('-') => {
518                // Handle subtraction operator
519                self.advance();
520                Token::Minus
521            }
522            Some(ch) if ch.is_numeric() => {
523                let num = self.read_number();
524                Token::NumberLiteral(num)
525            }
526            Some(ch) if ch.is_alphabetic() || ch == '_' => {
527                let ident = self.read_identifier();
528                match ident.to_uppercase().as_str() {
529                    "SELECT" => Token::Select,
530                    "FROM" => Token::From,
531                    "WHERE" => Token::Where,
532                    "WITH" => Token::With,
533                    "AND" => Token::And,
534                    "OR" => Token::Or,
535                    "IN" => Token::In,
536                    "NOT" => Token::Not,
537                    "BETWEEN" => Token::Between,
538                    "LIKE" => Token::Like,
539                    "IS" => Token::Is,
540                    "NULL" => Token::Null,
541                    "ORDER" if self.peek_keyword("BY") => {
542                        self.skip_whitespace();
543                        self.read_identifier(); // consume "BY"
544                        Token::OrderBy
545                    }
546                    "GROUP" if self.peek_keyword("BY") => {
547                        self.skip_whitespace();
548                        self.read_identifier(); // consume "BY"
549                        Token::GroupBy
550                    }
551                    "HAVING" => Token::Having,
552                    "AS" => Token::As,
553                    "ASC" => Token::Asc,
554                    "DESC" => Token::Desc,
555                    "LIMIT" => Token::Limit,
556                    "OFFSET" => Token::Offset,
557                    "DATETIME" => Token::DateTime,
558                    "CASE" => Token::Case,
559                    "WHEN" => Token::When,
560                    "THEN" => Token::Then,
561                    "ELSE" => Token::Else,
562                    "END" => Token::End,
563                    "DISTINCT" => Token::Distinct,
564                    "OVER" => Token::Over,
565                    "PARTITION" => Token::Partition,
566                    "BY" => Token::By,
567                    // Window frame keywords
568                    "ROWS" => Token::Rows,
569                    // Note: RANGE is context-sensitive - it's both a window frame keyword and a table function
570                    // We'll handle this in the parser based on context
571                    "UNBOUNDED" => Token::Unbounded,
572                    "PRECEDING" => Token::Preceding,
573                    "FOLLOWING" => Token::Following,
574                    "CURRENT" => Token::Current,
575                    "ROW" => Token::Row,
576                    // Set operation keywords
577                    "UNION" => Token::Union,
578                    "INTERSECT" => Token::Intersect,
579                    "EXCEPT" => Token::Except,
580                    // Special CTE keyword
581                    "WEB" => Token::Web,
582                    // Row expansion functions
583                    "UNNEST" => Token::Unnest,
584                    // JOIN keywords
585                    "JOIN" => Token::Join,
586                    "INNER" => Token::Inner,
587                    "LEFT" => Token::Left,
588                    "RIGHT" => Token::Right,
589                    "FULL" => Token::Full,
590                    "OUTER" => Token::Outer,
591                    "ON" => Token::On,
592                    "CROSS" => Token::Cross,
593                    _ => Token::Identifier(ident),
594                }
595            }
596            Some(ch) => {
597                self.advance();
598                Token::Identifier(ch.to_string())
599            }
600        }
601    }
602
603    fn peek_keyword(&mut self, keyword: &str) -> bool {
604        let saved_pos = self.position;
605        let saved_char = self.current_char;
606
607        self.skip_whitespace_and_comments();
608        let next_word = self.read_identifier();
609        let matches = next_word.to_uppercase() == keyword;
610
611        // Restore position
612        self.position = saved_pos;
613        self.current_char = saved_char;
614
615        matches
616    }
617
618    #[must_use]
619    pub fn get_position(&self) -> usize {
620        self.position
621    }
622
623    pub fn tokenize_all(&mut self) -> Vec<Token> {
624        let mut tokens = Vec::new();
625        loop {
626            let token = self.next_token();
627            if matches!(token, Token::Eof) {
628                tokens.push(token);
629                break;
630            }
631            tokens.push(token);
632        }
633        tokens
634    }
635
636    pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
637        let mut tokens = Vec::new();
638        loop {
639            self.skip_whitespace_and_comments();
640            let start_pos = self.position;
641            let token = self.next_token();
642            let end_pos = self.position;
643
644            if matches!(token, Token::Eof) {
645                break;
646            }
647            tokens.push((start_pos, end_pos, token));
648        }
649        tokens
650    }
651}