sql_cli/sql/parser/
lexer.rs

1//! SQL Lexer - Tokenization of SQL queries
2//!
3//! This module handles the conversion of raw SQL text into tokens
4//! that can be consumed by the parser.
5
/// A single lexical unit produced by the SQL lexer.
///
/// Keyword variants carry no payload; identifier and literal variants carry
/// the text that was read from the input. Every payload is a `String`, so the
/// type supports full equality (`Eq`) in addition to `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    // Keywords
    /// `SELECT`
    Select,
    /// `FROM`
    From,
    /// `WHERE`
    Where,
    /// `WITH` clause for CTEs
    With,
    /// `AND`
    And,
    /// `OR`
    Or,
    /// `IN`
    In,
    /// `NOT`
    Not,
    /// `BETWEEN`
    Between,
    /// `LIKE`
    Like,
    /// `IS`
    Is,
    /// `NULL`
    Null,
    /// `ORDER BY`, emitted as one token when `ORDER` is followed by `BY`
    OrderBy,
    /// `GROUP BY`, emitted as one token when `GROUP` is followed by `BY`
    GroupBy,
    /// `HAVING`
    Having,
    /// `AS`
    As,
    /// `ASC`
    Asc,
    /// `DESC`
    Desc,
    /// `LIMIT`
    Limit,
    /// `OFFSET`
    Offset,
    /// `DATETIME` constructor keyword
    DateTime,
    /// `CASE` expression
    Case,
    /// `WHEN` clause
    When,
    /// `THEN` clause
    Then,
    /// `ELSE` clause
    Else,
    /// `END` keyword
    End,
    /// `DISTINCT` keyword for aggregate functions
    Distinct,
    /// `OVER` keyword for window functions
    Over,
    /// `PARTITION` keyword for window functions
    Partition,
    /// Stand-alone `BY` keyword (used with `PARTITION BY`, `ORDER BY`)
    By,

    // JOIN keywords
    /// `JOIN`
    Join,
    /// `INNER` (as in `INNER JOIN`)
    Inner,
    /// `LEFT` (as in `LEFT JOIN`)
    Left,
    /// `RIGHT` (as in `RIGHT JOIN`)
    Right,
    /// `FULL` (as in `FULL JOIN`)
    Full,
    /// `OUTER` (as in `LEFT OUTER`, `RIGHT OUTER`, `FULL OUTER`)
    Outer,
    /// `ON` keyword for join conditions
    On,
    /// `CROSS` (as in `CROSS JOIN`)
    Cross,

    // Literals
    /// Bare identifier, stored with the casing it had in the input.
    Identifier(String),
    /// Double-quoted identifier such as `"Customer Id"`, stored without the quotes.
    QuotedIdentifier(String),
    /// Single-quoted string literal, stored without the quotes.
    StringLiteral(String),
    /// Numeric literal kept as its source text (not parsed into a number).
    NumberLiteral(String),
    /// `*`; there is no separate multiplication token, so consumers must
    /// decide from context whether this is `SELECT *` or a multiply operator.
    Star,

    // Operators
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `=`
    Equal,
    /// `<>` or `!=`
    NotEqual,
    /// `<`
    LessThan,
    /// `>`
    GreaterThan,
    /// `<=`
    LessThanOrEqual,
    /// `>=`
    GreaterThanOrEqual,

    // Arithmetic operators
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `/`
    Divide,
    /// `%`
    Modulo,

    // Special
    /// End of input.
    Eof,
}
78
79#[derive(Debug, Clone)]
80pub struct Lexer {
81    input: Vec<char>,
82    position: usize,
83    current_char: Option<char>,
84}
85
86impl Lexer {
87    #[must_use]
88    pub fn new(input: &str) -> Self {
89        let chars: Vec<char> = input.chars().collect();
90        let current = chars.first().copied();
91        Self {
92            input: chars,
93            position: 0,
94            current_char: current,
95        }
96    }
97
98    fn advance(&mut self) {
99        self.position += 1;
100        self.current_char = self.input.get(self.position).copied();
101    }
102
103    fn peek(&self, offset: usize) -> Option<char> {
104        self.input.get(self.position + offset).copied()
105    }
106
107    fn skip_whitespace(&mut self) {
108        while let Some(ch) = self.current_char {
109            if ch.is_whitespace() {
110                self.advance();
111            } else {
112                break;
113            }
114        }
115    }
116
117    fn skip_whitespace_and_comments(&mut self) {
118        loop {
119            // Skip whitespace
120            while let Some(ch) = self.current_char {
121                if ch.is_whitespace() {
122                    self.advance();
123                } else {
124                    break;
125                }
126            }
127
128            // Check for comments
129            match self.current_char {
130                Some('-') if self.peek(1) == Some('-') => {
131                    // Single-line comment: skip until end of line
132                    self.advance(); // skip first '-'
133                    self.advance(); // skip second '-'
134                    while let Some(ch) = self.current_char {
135                        self.advance();
136                        if ch == '\n' {
137                            break;
138                        }
139                    }
140                }
141                Some('/') if self.peek(1) == Some('*') => {
142                    // Multi-line comment: skip until */
143                    self.advance(); // skip '/'
144                    self.advance(); // skip '*'
145                    while let Some(ch) = self.current_char {
146                        if ch == '*' && self.peek(1) == Some('/') {
147                            self.advance(); // skip '*'
148                            self.advance(); // skip '/'
149                            break;
150                        }
151                        self.advance();
152                    }
153                }
154                _ => {
155                    // No more comments or whitespace
156                    break;
157                }
158            }
159        }
160    }
161
162    fn read_identifier(&mut self) -> String {
163        let mut result = String::new();
164        while let Some(ch) = self.current_char {
165            if ch.is_alphanumeric() || ch == '_' {
166                result.push(ch);
167                self.advance();
168            } else {
169                break;
170            }
171        }
172        result
173    }
174
175    fn read_string(&mut self) -> String {
176        let mut result = String::new();
177        let quote_char = self.current_char.unwrap(); // ' or "
178        self.advance(); // skip opening quote
179
180        while let Some(ch) = self.current_char {
181            if ch == quote_char {
182                self.advance(); // skip closing quote
183                break;
184            }
185            result.push(ch);
186            self.advance();
187        }
188        result
189    }
190
191    fn read_number(&mut self) -> String {
192        let mut result = String::new();
193        let mut has_e = false;
194
195        // Read the main number part (including decimal point)
196        while let Some(ch) = self.current_char {
197            if !has_e && (ch.is_numeric() || ch == '.') {
198                result.push(ch);
199                self.advance();
200            } else if (ch == 'e' || ch == 'E') && !has_e && !result.is_empty() {
201                // Handle scientific notation
202                result.push(ch);
203                self.advance();
204                has_e = true;
205
206                // Check for optional sign after 'e'
207                if let Some(sign) = self.current_char {
208                    if sign == '+' || sign == '-' {
209                        result.push(sign);
210                        self.advance();
211                    }
212                }
213
214                // Read exponent digits
215                while let Some(digit) = self.current_char {
216                    if digit.is_numeric() {
217                        result.push(digit);
218                        self.advance();
219                    } else {
220                        break;
221                    }
222                }
223                break; // Done reading the number
224            } else {
225                break;
226            }
227        }
228        result
229    }
230
231    pub fn next_token(&mut self) -> Token {
232        self.skip_whitespace_and_comments();
233
234        match self.current_char {
235            None => Token::Eof,
236            Some('*') => {
237                self.advance();
238                // Context-sensitive: could be SELECT * or multiplication
239                // The parser will distinguish based on context
240                Token::Star // We'll handle multiplication in parser
241            }
242            Some('+') => {
243                self.advance();
244                Token::Plus
245            }
246            Some('/') => {
247                // Check if this is a comment start
248                if self.peek(1) == Some('*') {
249                    // This shouldn't happen as comments are skipped above,
250                    // but handle it just in case
251                    self.skip_whitespace_and_comments();
252                    return self.next_token();
253                }
254                self.advance();
255                Token::Divide
256            }
257            Some('%') => {
258                self.advance();
259                Token::Modulo
260            }
261            Some('.') => {
262                self.advance();
263                Token::Dot
264            }
265            Some(',') => {
266                self.advance();
267                Token::Comma
268            }
269            Some('(') => {
270                self.advance();
271                Token::LeftParen
272            }
273            Some(')') => {
274                self.advance();
275                Token::RightParen
276            }
277            Some('=') => {
278                self.advance();
279                Token::Equal
280            }
281            Some('<') => {
282                self.advance();
283                if self.current_char == Some('=') {
284                    self.advance();
285                    Token::LessThanOrEqual
286                } else if self.current_char == Some('>') {
287                    self.advance();
288                    Token::NotEqual
289                } else {
290                    Token::LessThan
291                }
292            }
293            Some('>') => {
294                self.advance();
295                if self.current_char == Some('=') {
296                    self.advance();
297                    Token::GreaterThanOrEqual
298                } else {
299                    Token::GreaterThan
300                }
301            }
302            Some('!') if self.peek(1) == Some('=') => {
303                self.advance();
304                self.advance();
305                Token::NotEqual
306            }
307            Some('"') => {
308                // Double quotes = identifier
309                let ident_val = self.read_string();
310                Token::QuotedIdentifier(ident_val)
311            }
312            Some('\'') => {
313                // Single quotes = string literal
314                let string_val = self.read_string();
315                Token::StringLiteral(string_val)
316            }
317            Some('-') if self.peek(1) == Some('-') => {
318                // This is a comment, skip it and get next token
319                self.skip_whitespace_and_comments();
320                self.next_token()
321            }
322            Some('-') if self.peek(1).is_some_and(char::is_numeric) => {
323                // Handle negative numbers
324                self.advance(); // skip '-'
325                let num = self.read_number();
326                Token::NumberLiteral(format!("-{num}"))
327            }
328            Some('-') => {
329                // Handle subtraction operator
330                self.advance();
331                Token::Minus
332            }
333            Some(ch) if ch.is_numeric() => {
334                let num = self.read_number();
335                Token::NumberLiteral(num)
336            }
337            Some(ch) if ch.is_alphabetic() || ch == '_' => {
338                let ident = self.read_identifier();
339                match ident.to_uppercase().as_str() {
340                    "SELECT" => Token::Select,
341                    "FROM" => Token::From,
342                    "WHERE" => Token::Where,
343                    "WITH" => Token::With,
344                    "AND" => Token::And,
345                    "OR" => Token::Or,
346                    "IN" => Token::In,
347                    "NOT" => Token::Not,
348                    "BETWEEN" => Token::Between,
349                    "LIKE" => Token::Like,
350                    "IS" => Token::Is,
351                    "NULL" => Token::Null,
352                    "ORDER" if self.peek_keyword("BY") => {
353                        self.skip_whitespace();
354                        self.read_identifier(); // consume "BY"
355                        Token::OrderBy
356                    }
357                    "GROUP" if self.peek_keyword("BY") => {
358                        self.skip_whitespace();
359                        self.read_identifier(); // consume "BY"
360                        Token::GroupBy
361                    }
362                    "HAVING" => Token::Having,
363                    "AS" => Token::As,
364                    "ASC" => Token::Asc,
365                    "DESC" => Token::Desc,
366                    "LIMIT" => Token::Limit,
367                    "OFFSET" => Token::Offset,
368                    "DATETIME" => Token::DateTime,
369                    "CASE" => Token::Case,
370                    "WHEN" => Token::When,
371                    "THEN" => Token::Then,
372                    "ELSE" => Token::Else,
373                    "END" => Token::End,
374                    "DISTINCT" => Token::Distinct,
375                    "OVER" => Token::Over,
376                    "PARTITION" => Token::Partition,
377                    "BY" => Token::By,
378                    // JOIN keywords
379                    "JOIN" => Token::Join,
380                    "INNER" => Token::Inner,
381                    "LEFT" => Token::Left,
382                    "RIGHT" => Token::Right,
383                    "FULL" => Token::Full,
384                    "OUTER" => Token::Outer,
385                    "ON" => Token::On,
386                    "CROSS" => Token::Cross,
387                    _ => Token::Identifier(ident),
388                }
389            }
390            Some(ch) => {
391                self.advance();
392                Token::Identifier(ch.to_string())
393            }
394        }
395    }
396
397    fn peek_keyword(&mut self, keyword: &str) -> bool {
398        let saved_pos = self.position;
399        let saved_char = self.current_char;
400
401        self.skip_whitespace_and_comments();
402        let next_word = self.read_identifier();
403        let matches = next_word.to_uppercase() == keyword;
404
405        // Restore position
406        self.position = saved_pos;
407        self.current_char = saved_char;
408
409        matches
410    }
411
412    #[must_use]
413    pub fn get_position(&self) -> usize {
414        self.position
415    }
416
417    pub fn tokenize_all(&mut self) -> Vec<Token> {
418        let mut tokens = Vec::new();
419        loop {
420            let token = self.next_token();
421            if matches!(token, Token::Eof) {
422                tokens.push(token);
423                break;
424            }
425            tokens.push(token);
426        }
427        tokens
428    }
429
430    pub fn tokenize_all_with_positions(&mut self) -> Vec<(usize, usize, Token)> {
431        let mut tokens = Vec::new();
432        loop {
433            self.skip_whitespace_and_comments();
434            let start_pos = self.position;
435            let token = self.next_token();
436            let end_pos = self.position;
437
438            if matches!(token, Token::Eof) {
439                break;
440            }
441            tokens.push((start_pos, end_pos, token));
442        }
443        tokens
444    }
445}