sql_cli/sql/parser/
lexer.rs

1//! SQL Lexer - Tokenization of SQL queries
2//!
3//! This module handles the conversion of raw SQL text into tokens
4//! that can be consumed by the parser.
5
/// A single lexical unit of a SQL query.
///
/// Keywords are matched case-insensitively by the lexer; the payload-carrying
/// variants keep the text exactly as it appeared in the query (without the
/// surrounding quotes for quoted identifiers and string literals).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    // Keywords
    /// `SELECT`
    Select,
    /// `FROM`
    From,
    /// `WHERE`
    Where,
    /// `WITH` clause for CTEs
    With,
    /// `AND`
    And,
    /// `OR`
    Or,
    /// `IN`
    In,
    /// `NOT`
    Not,
    /// `BETWEEN`
    Between,
    /// `LIKE`
    Like,
    /// `IS`
    Is,
    /// `NULL`
    Null,
    /// `ORDER BY`, emitted as one token when `ORDER` is followed by `BY`
    OrderBy,
    /// `GROUP BY`, emitted as one token when `GROUP` is followed by `BY`
    GroupBy,
    /// `HAVING`
    Having,
    /// `AS`
    As,
    /// `ASC`
    Asc,
    /// `DESC`
    Desc,
    /// `LIMIT`
    Limit,
    /// `OFFSET`
    Offset,
    /// `DATETIME` constructor
    DateTime,
    /// `CASE` expression
    Case,
    /// `WHEN` clause
    When,
    /// `THEN` clause
    Then,
    /// `ELSE` clause
    Else,
    /// `END` keyword
    End,
    /// `DISTINCT` keyword for aggregate functions
    Distinct,
    /// `OVER` keyword for window functions
    Over,
    /// `PARTITION` keyword for window functions
    Partition,
    /// Stand-alone `BY` (used with `PARTITION BY`; `ORDER BY` / `GROUP BY`
    /// normally arrive as [`Token::OrderBy`] / [`Token::GroupBy`])
    By,

    // JOIN keywords
    /// `JOIN`
    Join,
    /// `INNER` (as in `INNER JOIN`)
    Inner,
    /// `LEFT` (as in `LEFT JOIN`)
    Left,
    /// `RIGHT` (as in `RIGHT JOIN`)
    Right,
    /// `FULL` (as in `FULL JOIN`)
    Full,
    /// `OUTER` (as in `LEFT OUTER`, `RIGHT OUTER`, `FULL OUTER`)
    Outer,
    /// `ON` keyword for join conditions
    On,
    /// `CROSS` (as in `CROSS JOIN`)
    Cross,

    // Literals
    /// Unquoted identifier, spelled as written in the query.
    Identifier(String),
    /// Double-quoted identifier such as `"Customer Id"`, without the quotes.
    QuotedIdentifier(String),
    /// Single-quoted string literal, without the quotes.
    StringLiteral(String),
    /// Numeric literal kept as its source text (sign and exponent included).
    NumberLiteral(String),
    /// `*` — either the select-all wildcard or multiplication; the parser
    /// decides from context.
    Star,

    // Operators
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `=`
    Equal,
    /// `<>` or `!=`
    NotEqual,
    /// `<`
    LessThan,
    /// `>`
    GreaterThan,
    /// `<=`
    LessThanOrEqual,
    /// `>=`
    GreaterThanOrEqual,

    // Arithmetic operators
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `/`
    Divide,
    /// `%`
    Modulo,

    // Special
    /// End of input.
    Eof,
}
79
80#[derive(Debug, Clone)]
81pub struct Lexer {
82    input: Vec<char>,
83    position: usize,
84    current_char: Option<char>,
85}
86
87impl Lexer {
88    #[must_use]
89    pub fn new(input: &str) -> Self {
90        let chars: Vec<char> = input.chars().collect();
91        let current = chars.first().copied();
92        Self {
93            input: chars,
94            position: 0,
95            current_char: current,
96        }
97    }
98
99    fn advance(&mut self) {
100        self.position += 1;
101        self.current_char = self.input.get(self.position).copied();
102    }
103
104    fn peek(&self, offset: usize) -> Option<char> {
105        self.input.get(self.position + offset).copied()
106    }
107
108    fn skip_whitespace(&mut self) {
109        while let Some(ch) = self.current_char {
110            if ch.is_whitespace() {
111                self.advance();
112            } else {
113                break;
114            }
115        }
116    }
117
118    fn skip_whitespace_and_comments(&mut self) {
119        loop {
120            // Skip whitespace
121            while let Some(ch) = self.current_char {
122                if ch.is_whitespace() {
123                    self.advance();
124                } else {
125                    break;
126                }
127            }
128
129            // Check for comments
130            match self.current_char {
131                Some('-') if self.peek(1) == Some('-') => {
132                    // Single-line comment: skip until end of line
133                    self.advance(); // skip first '-'
134                    self.advance(); // skip second '-'
135                    while let Some(ch) = self.current_char {
136                        self.advance();
137                        if ch == '\n' {
138                            break;
139                        }
140                    }
141                }
142                Some('/') if self.peek(1) == Some('*') => {
143                    // Multi-line comment: skip until */
144                    self.advance(); // skip '/'
145                    self.advance(); // skip '*'
146                    while let Some(ch) = self.current_char {
147                        if ch == '*' && self.peek(1) == Some('/') {
148                            self.advance(); // skip '*'
149                            self.advance(); // skip '/'
150                            break;
151                        }
152                        self.advance();
153                    }
154                }
155                _ => {
156                    // No more comments or whitespace
157                    break;
158                }
159            }
160        }
161    }
162
163    fn read_identifier(&mut self) -> String {
164        let mut result = String::new();
165        while let Some(ch) = self.current_char {
166            if ch.is_alphanumeric() || ch == '_' {
167                result.push(ch);
168                self.advance();
169            } else {
170                break;
171            }
172        }
173        result
174    }
175
176    fn read_string(&mut self) -> String {
177        let mut result = String::new();
178        let quote_char = self.current_char.unwrap(); // ' or "
179        self.advance(); // skip opening quote
180
181        while let Some(ch) = self.current_char {
182            if ch == quote_char {
183                self.advance(); // skip closing quote
184                break;
185            }
186            result.push(ch);
187            self.advance();
188        }
189        result
190    }
191
192    fn read_number(&mut self) -> String {
193        let mut result = String::new();
194        let mut has_e = false;
195
196        // Read the main number part (including decimal point)
197        while let Some(ch) = self.current_char {
198            if !has_e && (ch.is_numeric() || ch == '.') {
199                result.push(ch);
200                self.advance();
201            } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
202                // Handle scientific notation
203                result.push(ch);
204                self.advance();
205                has_e = true;
206
207                // Check for optional sign after 'e'
208                if let Some(sign) = self.current_char {
209                    if sign == '+' || sign == '-' {
210                        result.push(sign);
211                        self.advance();
212                    }
213                }
214
215                // Read exponent digits
216                while let Some(digit) = self.current_char {
217                    if digit.is_numeric() {
218                        result.push(digit);
219                        self.advance();
220                    } else {
221                        break;
222                    }
223                }
224                break; // Done reading the number
225            } else {
226                break;
227            }
228        }
229        result
230    }
231
232    pub fn next_token(&mut self) -> Token {
233        self.skip_whitespace_and_comments();
234
235        match self.current_char {
236            None => Token::Eof,
237            Some('*') => {
238                self.advance();
239                // Context-sensitive: could be SELECT * or multiplication
240                // The parser will distinguish based on context
241                Token::Star // We'll handle multiplication in parser
242            }
243            Some('+') => {
244                self.advance();
245                Token::Plus
246            }
247            Some('/') => {
248                // Check if this is a comment start
249                if self.peek(1) == Some('*') {
250                    // This shouldn't happen as comments are skipped above,
251                    // but handle it just in case
252                    self.skip_whitespace_and_comments();
253                    return self.next_token();
254                }
255                self.advance();
256                Token::Divide
257            }
258            Some('%') => {
259                self.advance();
260                Token::Modulo
261            }
262            Some('.') => {
263                self.advance();
264                Token::Dot
265            }
266            Some(',') => {
267                self.advance();
268                Token::Comma
269            }
270            Some(':') => {
271                self.advance();
272                Token::Colon
273            }
274            Some('(') => {
275                self.advance();
276                Token::LeftParen
277            }
278            Some(')') => {
279                self.advance();
280                Token::RightParen
281            }
282            Some('=') => {
283                self.advance();
284                Token::Equal
285            }
286            Some('<') => {
287                self.advance();
288                if self.current_char == Some('=') {
289                    self.advance();
290                    Token::LessThanOrEqual
291                } else if self.current_char == Some('>') {
292                    self.advance();
293                    Token::NotEqual
294                } else {
295                    Token::LessThan
296                }
297            }
298            Some('>') => {
299                self.advance();
300                if self.current_char == Some('=') {
301                    self.advance();
302                    Token::GreaterThanOrEqual
303                } else {
304                    Token::GreaterThan
305                }
306            }
307            Some('!') if self.peek(1) == Some('=') => {
308                self.advance();
309                self.advance();
310                Token::NotEqual
311            }
312            Some('"') => {
313                // Double quotes = identifier
314                let ident_val = self.read_string();
315                Token::QuotedIdentifier(ident_val)
316            }
317            Some('\'') => {
318                // Single quotes = string literal
319                let string_val = self.read_string();
320                Token::StringLiteral(string_val)
321            }
322            Some('-') if self.peek(1) == Some('-') => {
323                // This is a comment, skip it and get next token
324                self.skip_whitespace_and_comments();
325                self.next_token()
326            }
327            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
328                // Handle negative numbers
329                self.advance(); // skip '-'
330                let num = self.read_number();
331                Token::NumberLiteral(format!("-{num}"))
332            }
333            Some('-') => {
334                // Handle subtraction operator
335                self.advance();
336                Token::Minus
337            }
338            Some(ch) if ch.is_numeric() => {
339                let num = self.read_number();
340                Token::NumberLiteral(num)
341            }
342            Some(ch) if ch.is_alphabetic() || ch == '_' => {
343                let ident = self.read_identifier();
344                match ident.to_uppercase().as_str() {
345                    "SELECT" => Token::Select,
346                    "FROM" => Token::From,
347                    "WHERE" => Token::Where,
348                    "WITH" => Token::With,
349                    "AND" => Token::And,
350                    "OR" => Token::Or,
351                    "IN" => Token::In,
352                    "NOT" => Token::Not,
353                    "BETWEEN" => Token::Between,
354                    "LIKE" => Token::Like,
355                    "IS" => Token::Is,
356                    "NULL" => Token::Null,
357                    "ORDER" if self.peek_keyword("BY") => {
358                        self.skip_whitespace();
359                        self.read_identifier(); // consume "BY"
360                        Token::OrderBy
361                    }
362                    "GROUP" if self.peek_keyword("BY") => {
363                        self.skip_whitespace();
364                        self.read_identifier(); // consume "BY"
365                        Token::GroupBy
366                    }
367                    "HAVING" => Token::Having,
368                    "AS" => Token::As,
369                    "ASC" => Token::Asc,
370                    "DESC" => Token::Desc,
371                    "LIMIT" => Token::Limit,
372                    "OFFSET" => Token::Offset,
373                    "DATETIME" => Token::DateTime,
374                    "CASE" => Token::Case,
375                    "WHEN" => Token::When,
376                    "THEN" => Token::Then,
377                    "ELSE" => Token::Else,
378                    "END" => Token::End,
379                    "DISTINCT" => Token::Distinct,
380                    "OVER" => Token::Over,
381                    "PARTITION" => Token::Partition,
382                    "BY" => Token::By,
383                    // JOIN keywords
384                    "JOIN" => Token::Join,
385                    "INNER" => Token::Inner,
386                    "LEFT" => Token::Left,
387                    "RIGHT" => Token::Right,
388                    "FULL" => Token::Full,
389                    "OUTER" => Token::Outer,
390                    "ON" => Token::On,
391                    "CROSS" => Token::Cross,
392                    _ => Token::Identifier(ident),
393                }
394            }
395            Some(ch) => {
396                self.advance();
397                Token::Identifier(ch.to_string())
398            }
399        }
400    }
401
402    fn peek_keyword(&mut self, keyword: &str) -> bool {
403        let saved_pos = self.position;
404        let saved_char = self.current_char;
405
406        self.skip_whitespace_and_comments();
407        let next_word = self.read_identifier();
408        let matches = next_word.to_uppercase() == keyword;
409
410        // Restore position
411        self.position = saved_pos;
412        self.current_char = saved_char;
413
414        matches
415    }
416
417    #[must_use]
418    pub fn get_position(&self) -> usize {
419        self.position
420    }
421
422    pub fn tokenize_all(&mut self) -> Vec<Token> {
423        let mut tokens = Vec::new();
424        loop {
425            let token = self.next_token();
426            if matches!(token, Token::Eof) {
427                tokens.push(token);
428                break;
429            }
430            tokens.push(token);
431        }
432        tokens
433    }
434
435    pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
436        let mut tokens = Vec::new();
437        loop {
438            self.skip_whitespace_and_comments();
439            let start_pos = self.position;
440            let token = self.next_token();
441            let end_pos = self.position;
442
443            if matches!(token, Token::Eof) {
444                break;
445            }
446            tokens.push((start_pos, end_pos, token));
447        }
448        tokens
449    }
450}