sql_cli/sql/parser/
lexer.rs

//! SQL Lexer - Tokenization of SQL queries
//!
//! This module handles the conversion of raw SQL text into tokens
//! that can be consumed by the parser.
5
/// A single lexical unit produced by [`Lexer`].
///
/// Keywords are matched case-insensitively by the lexer. Every payload is a
/// plain `String`, so the type is `Eq` and `Hash` as well as `PartialEq` and
/// can be stored in hash-based collections.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
    // Keywords
    /// `SELECT`
    Select,
    /// `FROM`
    From,
    /// `WHERE`
    Where,
    /// `WITH` clause for CTEs
    With,
    /// `AND`
    And,
    /// `OR`
    Or,
    /// `IN`
    In,
    /// `NOT`
    Not,
    /// `BETWEEN`
    Between,
    /// `LIKE`
    Like,
    /// `IS`
    Is,
    /// `NULL`
    Null,
    /// `ORDER BY`, emitted as one token when `ORDER` is followed by `BY`
    OrderBy,
    /// `GROUP BY`, emitted as one token when `GROUP` is followed by `BY`
    GroupBy,
    /// `HAVING`
    Having,
    /// `AS`
    As,
    /// `ASC`
    Asc,
    /// `DESC`
    Desc,
    /// `LIMIT`
    Limit,
    /// `OFFSET`
    Offset,
    /// `DATETIME` constructor
    DateTime,
    /// `CASE` expression
    Case,
    /// `WHEN` clause
    When,
    /// `THEN` clause
    Then,
    /// `ELSE` clause
    Else,
    /// `END` keyword
    End,
    /// `DISTINCT` keyword for aggregate functions
    Distinct,
    /// `OVER` keyword for window functions
    Over,
    /// `PARTITION` keyword for window functions
    Partition,
    /// Stand-alone `BY` keyword (used with `PARTITION BY`, `ORDER BY`)
    By,

    // JOIN keywords
    /// `JOIN`
    Join,
    /// `INNER` (as in `INNER JOIN`)
    Inner,
    /// `LEFT` (as in `LEFT JOIN`)
    Left,
    /// `RIGHT` (as in `RIGHT JOIN`)
    Right,
    /// `FULL` (as in `FULL JOIN`)
    Full,
    /// `OUTER` (as in `LEFT OUTER`, `RIGHT OUTER`, `FULL OUTER`)
    Outer,
    /// `ON` keyword for join conditions
    On,
    /// `CROSS` (as in `CROSS JOIN`)
    Cross,

    // Literals
    /// Bare word that is not a recognised keyword.
    Identifier(String),
    /// Double-quoted identifier such as `"Customer Id"` (quotes removed).
    QuotedIdentifier(String),
    /// Single-quoted string literal (quotes removed).
    StringLiteral(String),
    /// Numeric literal kept as its source text, including any leading `-`.
    NumberLiteral(String),
    /// `*` — either `SELECT *` or multiplication; the parser decides from context.
    Star,

    // Operators
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `=`
    Equal,
    /// `!=` or `<>`
    NotEqual,
    /// `<`
    LessThan,
    /// `>`
    GreaterThan,
    /// `<=`
    LessThanOrEqual,
    /// `>=`
    GreaterThanOrEqual,

    // Arithmetic operators
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `/`
    Divide,
    /// `%`
    Modulo,

    // String operators
    /// `||` for string concatenation
    Concat,

    // Special
    /// End of input.
    Eof,
}
82
/// Character-level scanner that turns SQL text into tokens.
///
/// The input is held as a vector of `char`s, so every position handled by
/// the lexer is a character index rather than a byte offset.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The complete input, decoded into individual characters.
    input: Vec<char>,
    /// Index into `input` of the character currently under the cursor.
    position: usize,
    /// The character at `position`, or `None` once the cursor is past the end.
    current_char: Option<char>,
}
89
90impl Lexer {
91    #[must_use]
92    pub fn new(input: &str) -> Self {
93        let chars: Vec<char> = input.chars().collect();
94        let current = chars.first().copied();
95        Self {
96            input: chars,
97            position: 0,
98            current_char: current,
99        }
100    }
101
102    fn advance(&mut self) {
103        self.position += 1;
104        self.current_char = self.input.get(self.position).copied();
105    }
106
107    fn peek(&self, offset: usize) -> Option<char> {
108        self.input.get(self.position + offset).copied()
109    }
110
111    fn skip_whitespace(&mut self) {
112        while let Some(ch) = self.current_char {
113            if ch.is_whitespace() {
114                self.advance();
115            } else {
116                break;
117            }
118        }
119    }
120
121    fn skip_whitespace_and_comments(&mut self) {
122        loop {
123            // Skip whitespace
124            while let Some(ch) = self.current_char {
125                if ch.is_whitespace() {
126                    self.advance();
127                } else {
128                    break;
129                }
130            }
131
132            // Check for comments
133            match self.current_char {
134                Some('-') if self.peek(1) == Some('-') => {
135                    // Single-line comment: skip until end of line
136                    self.advance(); // skip first '-'
137                    self.advance(); // skip second '-'
138                    while let Some(ch) = self.current_char {
139                        self.advance();
140                        if ch == '\n' {
141                            break;
142                        }
143                    }
144                }
145                Some('/') if self.peek(1) == Some('*') => {
146                    // Multi-line comment: skip until */
147                    self.advance(); // skip '/'
148                    self.advance(); // skip '*'
149                    while let Some(ch) = self.current_char {
150                        if ch == '*' && self.peek(1) == Some('/') {
151                            self.advance(); // skip '*'
152                            self.advance(); // skip '/'
153                            break;
154                        }
155                        self.advance();
156                    }
157                }
158                _ => {
159                    // No more comments or whitespace
160                    break;
161                }
162            }
163        }
164    }
165
166    fn read_identifier(&mut self) -> String {
167        let mut result = String::new();
168        while let Some(ch) = self.current_char {
169            if ch.is_alphanumeric() || ch == '_' {
170                result.push(ch);
171                self.advance();
172            } else {
173                break;
174            }
175        }
176        result
177    }
178
179    fn read_string(&mut self) -> String {
180        let mut result = String::new();
181        let quote_char = self.current_char.unwrap(); // ' or "
182        self.advance(); // skip opening quote
183
184        while let Some(ch) = self.current_char {
185            if ch == quote_char {
186                self.advance(); // skip closing quote
187                break;
188            }
189            result.push(ch);
190            self.advance();
191        }
192        result
193    }
194
195    fn read_number(&mut self) -> String {
196        let mut result = String::new();
197        let mut has_e = false;
198
199        // Read the main number part (including decimal point)
200        while let Some(ch) = self.current_char {
201            if !has_e && (ch.is_numeric() || ch == '.') {
202                result.push(ch);
203                self.advance();
204            } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
205                // Handle scientific notation
206                result.push(ch);
207                self.advance();
208                has_e = true;
209
210                // Check for optional sign after 'e'
211                if let Some(sign) = self.current_char {
212                    if sign == '+' || sign == '-' {
213                        result.push(sign);
214                        self.advance();
215                    }
216                }
217
218                // Read exponent digits
219                while let Some(digit) = self.current_char {
220                    if digit.is_numeric() {
221                        result.push(digit);
222                        self.advance();
223                    } else {
224                        break;
225                    }
226                }
227                break; // Done reading the number
228            } else {
229                break;
230            }
231        }
232        result
233    }
234
235    pub fn next_token(&mut self) -> Token {
236        self.skip_whitespace_and_comments();
237
238        match self.current_char {
239            None => Token::Eof,
240            Some('*') => {
241                self.advance();
242                // Context-sensitive: could be SELECT * or multiplication
243                // The parser will distinguish based on context
244                Token::Star // We'll handle multiplication in parser
245            }
246            Some('+') => {
247                self.advance();
248                Token::Plus
249            }
250            Some('/') => {
251                // Check if this is a comment start
252                if self.peek(1) == Some('*') {
253                    // This shouldn't happen as comments are skipped above,
254                    // but handle it just in case
255                    self.skip_whitespace_and_comments();
256                    return self.next_token();
257                }
258                self.advance();
259                Token::Divide
260            }
261            Some('%') => {
262                self.advance();
263                Token::Modulo
264            }
265            Some('.') => {
266                self.advance();
267                Token::Dot
268            }
269            Some(',') => {
270                self.advance();
271                Token::Comma
272            }
273            Some(':') => {
274                self.advance();
275                Token::Colon
276            }
277            Some('(') => {
278                self.advance();
279                Token::LeftParen
280            }
281            Some(')') => {
282                self.advance();
283                Token::RightParen
284            }
285            Some('=') => {
286                self.advance();
287                Token::Equal
288            }
289            Some('<') => {
290                self.advance();
291                if self.current_char == Some('=') {
292                    self.advance();
293                    Token::LessThanOrEqual
294                } else if self.current_char == Some('>') {
295                    self.advance();
296                    Token::NotEqual
297                } else {
298                    Token::LessThan
299                }
300            }
301            Some('>') => {
302                self.advance();
303                if self.current_char == Some('=') {
304                    self.advance();
305                    Token::GreaterThanOrEqual
306                } else {
307                    Token::GreaterThan
308                }
309            }
310            Some('!') if self.peek(1) == Some('=') => {
311                self.advance();
312                self.advance();
313                Token::NotEqual
314            }
315            Some('|') if self.peek(1) == Some('|') => {
316                self.advance();
317                self.advance();
318                Token::Concat
319            }
320            Some('"') => {
321                // Double quotes = identifier
322                let ident_val = self.read_string();
323                Token::QuotedIdentifier(ident_val)
324            }
325            Some('\'') => {
326                // Single quotes = string literal
327                let string_val = self.read_string();
328                Token::StringLiteral(string_val)
329            }
330            Some('-') if self.peek(1) == Some('-') => {
331                // This is a comment, skip it and get next token
332                self.skip_whitespace_and_comments();
333                self.next_token()
334            }
335            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
336                // Handle negative numbers
337                self.advance(); // skip '-'
338                let num = self.read_number();
339                Token::NumberLiteral(format!("-{num}"))
340            }
341            Some('-') => {
342                // Handle subtraction operator
343                self.advance();
344                Token::Minus
345            }
346            Some(ch) if ch.is_numeric() => {
347                let num = self.read_number();
348                Token::NumberLiteral(num)
349            }
350            Some(ch) if ch.is_alphabetic() || ch == '_' => {
351                let ident = self.read_identifier();
352                match ident.to_uppercase().as_str() {
353                    "SELECT" => Token::Select,
354                    "FROM" => Token::From,
355                    "WHERE" => Token::Where,
356                    "WITH" => Token::With,
357                    "AND" => Token::And,
358                    "OR" => Token::Or,
359                    "IN" => Token::In,
360                    "NOT" => Token::Not,
361                    "BETWEEN" => Token::Between,
362                    "LIKE" => Token::Like,
363                    "IS" => Token::Is,
364                    "NULL" => Token::Null,
365                    "ORDER" if self.peek_keyword("BY") => {
366                        self.skip_whitespace();
367                        self.read_identifier(); // consume "BY"
368                        Token::OrderBy
369                    }
370                    "GROUP" if self.peek_keyword("BY") => {
371                        self.skip_whitespace();
372                        self.read_identifier(); // consume "BY"
373                        Token::GroupBy
374                    }
375                    "HAVING" => Token::Having,
376                    "AS" => Token::As,
377                    "ASC" => Token::Asc,
378                    "DESC" => Token::Desc,
379                    "LIMIT" => Token::Limit,
380                    "OFFSET" => Token::Offset,
381                    "DATETIME" => Token::DateTime,
382                    "CASE" => Token::Case,
383                    "WHEN" => Token::When,
384                    "THEN" => Token::Then,
385                    "ELSE" => Token::Else,
386                    "END" => Token::End,
387                    "DISTINCT" => Token::Distinct,
388                    "OVER" => Token::Over,
389                    "PARTITION" => Token::Partition,
390                    "BY" => Token::By,
391                    // JOIN keywords
392                    "JOIN" => Token::Join,
393                    "INNER" => Token::Inner,
394                    "LEFT" => Token::Left,
395                    "RIGHT" => Token::Right,
396                    "FULL" => Token::Full,
397                    "OUTER" => Token::Outer,
398                    "ON" => Token::On,
399                    "CROSS" => Token::Cross,
400                    _ => Token::Identifier(ident),
401                }
402            }
403            Some(ch) => {
404                self.advance();
405                Token::Identifier(ch.to_string())
406            }
407        }
408    }
409
410    fn peek_keyword(&mut self, keyword: &str) -> bool {
411        let saved_pos = self.position;
412        let saved_char = self.current_char;
413
414        self.skip_whitespace_and_comments();
415        let next_word = self.read_identifier();
416        let matches = next_word.to_uppercase() == keyword;
417
418        // Restore position
419        self.position = saved_pos;
420        self.current_char = saved_char;
421
422        matches
423    }
424
425    #[must_use]
426    pub fn get_position(&self) -> usize {
427        self.position
428    }
429
430    pub fn tokenize_all(&mut self) -> Vec<Token> {
431        let mut tokens = Vec::new();
432        loop {
433            let token = self.next_token();
434            if matches!(token, Token::Eof) {
435                tokens.push(token);
436                break;
437            }
438            tokens.push(token);
439        }
440        tokens
441    }
442
443    pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
444        let mut tokens = Vec::new();
445        loop {
446            self.skip_whitespace_and_comments();
447            let start_pos = self.position;
448            let token = self.next_token();
449            let end_pos = self.position;
450
451            if matches!(token, Token::Eof) {
452                break;
453            }
454            tokens.push((start_pos, end_pos, token));
455        }
456        tokens
457    }
458}