sql_cli/sql/parser/
lexer.rs

//! SQL Lexer - Tokenization of SQL queries
//!
//! This module handles the conversion of raw SQL text into tokens
//! that can be consumed by the parser.
5
/// A lexical token produced by [`Lexer`].
///
/// Keyword and operator variants carry no payload. Identifier and literal
/// variants carry the text read from the input (quotes stripped for quoted
/// forms). All payloads are `String`, so the type is `Eq` as well as
/// `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    // Keywords
    Select,
    From,
    Where,
    /// `WITH` clause for CTEs.
    With,
    And,
    Or,
    In,
    Not,
    Between,
    Like,
    Is,
    Null,
    /// `ORDER BY`; the lexer fuses the two words into a single token.
    OrderBy,
    /// `GROUP BY`; the lexer fuses the two words into a single token.
    GroupBy,
    Having,
    As,
    Asc,
    Desc,
    Limit,
    Offset,
    /// `DATETIME` constructor.
    DateTime,
    /// `CASE` expression.
    Case,
    /// `WHEN` clause.
    When,
    /// `THEN` clause.
    Then,
    /// `ELSE` clause.
    Else,
    /// `END` keyword.
    End,
    /// `DISTINCT` keyword for aggregate functions.
    Distinct,
    /// `OVER` keyword for window functions.
    Over,
    /// `PARTITION` keyword for window functions.
    Partition,
    /// Stand-alone `BY` (e.g. after `PARTITION`).
    By,

    // Window frame keywords
    /// `ROWS` frame type.
    Rows,
    /// `RANGE` frame type. The lexer's keyword table has no `RANGE` entry, so
    /// the word itself is lexed as [`Token::Identifier`].
    Range,
    /// `UNBOUNDED` for frame bounds.
    Unbounded,
    /// `PRECEDING` for frame bounds.
    Preceding,
    /// `FOLLOWING` for frame bounds.
    Following,
    /// `CURRENT` (as in `CURRENT ROW`).
    Current,
    /// `ROW` (as in `CURRENT ROW`).
    Row,

    // Set operation keywords
    /// `UNION`.
    Union,
    /// `INTERSECT`.
    Intersect,
    /// `EXCEPT`.
    Except,

    // Special CTE keyword
    /// `WEB` (for WEB CTEs).
    Web,

    // JOIN keywords
    /// `JOIN` keyword.
    Join,
    /// `INNER` (as in `INNER JOIN`).
    Inner,
    /// `LEFT` (as in `LEFT JOIN`).
    Left,
    /// `RIGHT` (as in `RIGHT JOIN`).
    Right,
    /// `FULL` (as in `FULL JOIN`).
    Full,
    /// `OUTER` keyword (`LEFT OUTER`, `RIGHT OUTER`, `FULL OUTER`).
    Outer,
    /// `ON` keyword for join conditions.
    On,
    /// `CROSS` (as in `CROSS JOIN`).
    Cross,

    // Literals
    /// Unquoted word that is not a keyword, with its original casing.
    Identifier(String),
    /// Double-quoted identifier such as `"Customer Id"`, without the quotes.
    QuotedIdentifier(String),
    /// Single-quoted string literal, without the quotes.
    StringLiteral(String),
    /// Numeric literal kept as text (may include a leading `-`, a decimal
    /// point and an exponent).
    NumberLiteral(String),
    /// `*`; the lexer does not distinguish `SELECT *` from multiplication.
    Star,

    // Operators
    Dot,
    Comma,
    Colon,
    LeftParen,
    RightParen,
    Equal,
    /// `<>` or `!=`.
    NotEqual,
    LessThan,
    GreaterThan,
    LessThanOrEqual,
    GreaterThanOrEqual,

    // Arithmetic operators
    Plus,
    Minus,
    Divide,
    Modulo,

    // String operators
    /// `||` for string concatenation.
    Concat,

    // Special
    /// End of input.
    Eof,
}
99
/// Cursor over the characters of a SQL string, handing out tokens on demand.
///
/// The text is held as individual `char`s, so `position` counts characters
/// rather than bytes.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The full input, split into characters.
    input: Vec<char>,
    /// Index into `input` of the character currently being examined.
    position: usize,
    /// The character at `position`, or `None` once the end of the input has
    /// been reached.
    current_char: Option<char>,
}
106
107impl Lexer {
108    #[must_use]
109    pub fn new(input: &str) -> Self {
110        let chars: Vec<char> = input.chars().collect();
111        let current = chars.first().copied();
112        Self {
113            input: chars,
114            position: 0,
115            current_char: current,
116        }
117    }
118
119    fn advance(&mut self) {
120        self.position += 1;
121        self.current_char = self.input.get(self.position).copied();
122    }
123
124    fn peek(&self, offset: usize) -> Option<char> {
125        self.input.get(self.position + offset).copied()
126    }
127
128    fn skip_whitespace(&mut self) {
129        while let Some(ch) = self.current_char {
130            if ch.is_whitespace() {
131                self.advance();
132            } else {
133                break;
134            }
135        }
136    }
137
138    fn skip_whitespace_and_comments(&mut self) {
139        loop {
140            // Skip whitespace
141            while let Some(ch) = self.current_char {
142                if ch.is_whitespace() {
143                    self.advance();
144                } else {
145                    break;
146                }
147            }
148
149            // Check for comments
150            match self.current_char {
151                Some('-') if self.peek(1) == Some('-') => {
152                    // Single-line comment: skip until end of line
153                    self.advance(); // skip first '-'
154                    self.advance(); // skip second '-'
155                    while let Some(ch) = self.current_char {
156                        self.advance();
157                        if ch == '\n' {
158                            break;
159                        }
160                    }
161                }
162                Some('/') if self.peek(1) == Some('*') => {
163                    // Multi-line comment: skip until */
164                    self.advance(); // skip '/'
165                    self.advance(); // skip '*'
166                    while let Some(ch) = self.current_char {
167                        if ch == '*' && self.peek(1) == Some('/') {
168                            self.advance(); // skip '*'
169                            self.advance(); // skip '/'
170                            break;
171                        }
172                        self.advance();
173                    }
174                }
175                _ => {
176                    // No more comments or whitespace
177                    break;
178                }
179            }
180        }
181    }
182
183    fn read_identifier(&mut self) -> String {
184        let mut result = String::new();
185        while let Some(ch) = self.current_char {
186            if ch.is_alphanumeric() || ch == '_' {
187                result.push(ch);
188                self.advance();
189            } else {
190                break;
191            }
192        }
193        result
194    }
195
196    fn read_string(&mut self) -> String {
197        let mut result = String::new();
198        let quote_char = self.current_char.unwrap(); // ' or "
199        self.advance(); // skip opening quote
200
201        while let Some(ch) = self.current_char {
202            if ch == quote_char {
203                self.advance(); // skip closing quote
204                break;
205            }
206            result.push(ch);
207            self.advance();
208        }
209        result
210    }
211
212    fn read_number(&mut self) -> String {
213        let mut result = String::new();
214        let mut has_e = false;
215
216        // Read the main number part (including decimal point)
217        while let Some(ch) = self.current_char {
218            if !has_e && (ch.is_numeric() || ch == '.') {
219                result.push(ch);
220                self.advance();
221            } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
222                // Handle scientific notation
223                result.push(ch);
224                self.advance();
225                has_e = true;
226
227                // Check for optional sign after 'e'
228                if let Some(sign) = self.current_char {
229                    if sign == '+' || sign == '-' {
230                        result.push(sign);
231                        self.advance();
232                    }
233                }
234
235                // Read exponent digits
236                while let Some(digit) = self.current_char {
237                    if digit.is_numeric() {
238                        result.push(digit);
239                        self.advance();
240                    } else {
241                        break;
242                    }
243                }
244                break; // Done reading the number
245            } else {
246                break;
247            }
248        }
249        result
250    }
251
252    pub fn next_token(&mut self) -> Token {
253        self.skip_whitespace_and_comments();
254
255        match self.current_char {
256            None => Token::Eof,
257            Some('*') => {
258                self.advance();
259                // Context-sensitive: could be SELECT * or multiplication
260                // The parser will distinguish based on context
261                Token::Star // We'll handle multiplication in parser
262            }
263            Some('+') => {
264                self.advance();
265                Token::Plus
266            }
267            Some('/') => {
268                // Check if this is a comment start
269                if self.peek(1) == Some('*') {
270                    // This shouldn't happen as comments are skipped above,
271                    // but handle it just in case
272                    self.skip_whitespace_and_comments();
273                    return self.next_token();
274                }
275                self.advance();
276                Token::Divide
277            }
278            Some('%') => {
279                self.advance();
280                Token::Modulo
281            }
282            Some('.') => {
283                self.advance();
284                Token::Dot
285            }
286            Some(',') => {
287                self.advance();
288                Token::Comma
289            }
290            Some(':') => {
291                self.advance();
292                Token::Colon
293            }
294            Some('(') => {
295                self.advance();
296                Token::LeftParen
297            }
298            Some(')') => {
299                self.advance();
300                Token::RightParen
301            }
302            Some('=') => {
303                self.advance();
304                Token::Equal
305            }
306            Some('<') => {
307                self.advance();
308                if self.current_char == Some('=') {
309                    self.advance();
310                    Token::LessThanOrEqual
311                } else if self.current_char == Some('>') {
312                    self.advance();
313                    Token::NotEqual
314                } else {
315                    Token::LessThan
316                }
317            }
318            Some('>') => {
319                self.advance();
320                if self.current_char == Some('=') {
321                    self.advance();
322                    Token::GreaterThanOrEqual
323                } else {
324                    Token::GreaterThan
325                }
326            }
327            Some('!') if self.peek(1) == Some('=') => {
328                self.advance();
329                self.advance();
330                Token::NotEqual
331            }
332            Some('|') if self.peek(1) == Some('|') => {
333                self.advance();
334                self.advance();
335                Token::Concat
336            }
337            Some('"') => {
338                // Double quotes = identifier
339                let ident_val = self.read_string();
340                Token::QuotedIdentifier(ident_val)
341            }
342            Some('\'') => {
343                // Single quotes = string literal
344                let string_val = self.read_string();
345                Token::StringLiteral(string_val)
346            }
347            Some('-') if self.peek(1) == Some('-') => {
348                // This is a comment, skip it and get next token
349                self.skip_whitespace_and_comments();
350                self.next_token()
351            }
352            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
353                // Handle negative numbers
354                self.advance(); // skip '-'
355                let num = self.read_number();
356                Token::NumberLiteral(format!("-{num}"))
357            }
358            Some('-') => {
359                // Handle subtraction operator
360                self.advance();
361                Token::Minus
362            }
363            Some(ch) if ch.is_numeric() => {
364                let num = self.read_number();
365                Token::NumberLiteral(num)
366            }
367            Some(ch) if ch.is_alphabetic() || ch == '_' => {
368                let ident = self.read_identifier();
369                match ident.to_uppercase().as_str() {
370                    "SELECT" => Token::Select,
371                    "FROM" => Token::From,
372                    "WHERE" => Token::Where,
373                    "WITH" => Token::With,
374                    "AND" => Token::And,
375                    "OR" => Token::Or,
376                    "IN" => Token::In,
377                    "NOT" => Token::Not,
378                    "BETWEEN" => Token::Between,
379                    "LIKE" => Token::Like,
380                    "IS" => Token::Is,
381                    "NULL" => Token::Null,
382                    "ORDER" if self.peek_keyword("BY") => {
383                        self.skip_whitespace();
384                        self.read_identifier(); // consume "BY"
385                        Token::OrderBy
386                    }
387                    "GROUP" if self.peek_keyword("BY") => {
388                        self.skip_whitespace();
389                        self.read_identifier(); // consume "BY"
390                        Token::GroupBy
391                    }
392                    "HAVING" => Token::Having,
393                    "AS" => Token::As,
394                    "ASC" => Token::Asc,
395                    "DESC" => Token::Desc,
396                    "LIMIT" => Token::Limit,
397                    "OFFSET" => Token::Offset,
398                    "DATETIME" => Token::DateTime,
399                    "CASE" => Token::Case,
400                    "WHEN" => Token::When,
401                    "THEN" => Token::Then,
402                    "ELSE" => Token::Else,
403                    "END" => Token::End,
404                    "DISTINCT" => Token::Distinct,
405                    "OVER" => Token::Over,
406                    "PARTITION" => Token::Partition,
407                    "BY" => Token::By,
408                    // Window frame keywords
409                    "ROWS" => Token::Rows,
410                    // Note: RANGE is context-sensitive - it's both a window frame keyword and a table function
411                    // We'll handle this in the parser based on context
412                    "UNBOUNDED" => Token::Unbounded,
413                    "PRECEDING" => Token::Preceding,
414                    "FOLLOWING" => Token::Following,
415                    "CURRENT" => Token::Current,
416                    "ROW" => Token::Row,
417                    // Set operation keywords
418                    "UNION" => Token::Union,
419                    "INTERSECT" => Token::Intersect,
420                    "EXCEPT" => Token::Except,
421                    // Special CTE keyword
422                    "WEB" => Token::Web,
423                    // JOIN keywords
424                    "JOIN" => Token::Join,
425                    "INNER" => Token::Inner,
426                    "LEFT" => Token::Left,
427                    "RIGHT" => Token::Right,
428                    "FULL" => Token::Full,
429                    "OUTER" => Token::Outer,
430                    "ON" => Token::On,
431                    "CROSS" => Token::Cross,
432                    _ => Token::Identifier(ident),
433                }
434            }
435            Some(ch) => {
436                self.advance();
437                Token::Identifier(ch.to_string())
438            }
439        }
440    }
441
442    fn peek_keyword(&mut self, keyword: &str) -> bool {
443        let saved_pos = self.position;
444        let saved_char = self.current_char;
445
446        self.skip_whitespace_and_comments();
447        let next_word = self.read_identifier();
448        let matches = next_word.to_uppercase() == keyword;
449
450        // Restore position
451        self.position = saved_pos;
452        self.current_char = saved_char;
453
454        matches
455    }
456
457    #[must_use]
458    pub fn get_position(&self) -> usize {
459        self.position
460    }
461
462    pub fn tokenize_all(&mut self) -> Vec<Token> {
463        let mut tokens = Vec::new();
464        loop {
465            let token = self.next_token();
466            if matches!(token, Token::Eof) {
467                tokens.push(token);
468                break;
469            }
470            tokens.push(token);
471        }
472        tokens
473    }
474
475    pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
476        let mut tokens = Vec::new();
477        loop {
478            self.skip_whitespace_and_comments();
479            let start_pos = self.position;
480            let token = self.next_token();
481            let end_pos = self.position;
482
483            if matches!(token, Token::Eof) {
484                break;
485            }
486            tokens.push((start_pos, end_pos, token));
487        }
488        tokens
489    }
490}