sqlexpr_rust/
lexer.rs

1// Lexer/Tokenizer for SQL Expression Parser
2//
3// This module handles tokenization of SQL expressions, including:
4// - Case-insensitive keywords
5// - String literals with SQL-style escaping
6// - Numeric literals (decimal, hex, octal, floating-point)
7// - Comments (line and block)
8// - Whitespace handling
9
10use std::fmt;
11
/// A single lexical unit of a SQL expression.
///
/// Keyword and punctuation variants carry no data; literal and identifier
/// variants carry the value that was read from the input.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Keywords (case-insensitive) ----
    /// The `AND` keyword.
    And,
    /// The `OR` keyword.
    Or,
    /// The `NOT` keyword.
    Not,
    /// The `BETWEEN` keyword.
    Between,
    /// The `LIKE` keyword.
    Like,
    /// The `ESCAPE` keyword.
    Escape,
    /// The `IN` keyword.
    In,
    /// The `IS` keyword.
    Is,
    /// The `TRUE` keyword.
    True,
    /// The `FALSE` keyword.
    False,
    /// The `NULL` keyword.
    Null,

    // ---- Operators ----
    /// `=`
    Equal,
    /// `<>` or `!=`
    NotEqual,
    /// `>`
    GreaterThan,
    /// `>=`
    GreaterOrEqual,
    /// `<`
    LessThan,
    /// `<=`
    LessOrEqual,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,

    // ---- Delimiters ----
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `,`
    Comma,

    // ---- Literals ----
    /// A name that is not one of the keywords, with its original spelling.
    Identifier(String),
    /// The contents of a quoted string, without the surrounding quotes.
    StringLiteral(String),
    /// An integer value.
    IntegerLiteral(i64),
    /// A floating-point value.
    FloatLiteral(f64),

    /// Marks the end of the input.
    Eof,
}
54
55impl fmt::Display for Token {
56    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
57        match self {
58            Token::And => write!(f, "AND"),
59            Token::Or => write!(f, "OR"),
60            Token::Not => write!(f, "NOT"),
61            Token::Between => write!(f, "BETWEEN"),
62            Token::Like => write!(f, "LIKE"),
63            Token::Escape => write!(f, "ESCAPE"),
64            Token::In => write!(f, "IN"),
65            Token::Is => write!(f, "IS"),
66            Token::True => write!(f, "TRUE"),
67            Token::False => write!(f, "FALSE"),
68            Token::Null => write!(f, "NULL"),
69            Token::Equal => write!(f, "="),
70            Token::NotEqual => write!(f, "<>"),
71            Token::GreaterThan => write!(f, ">"),
72            Token::GreaterOrEqual => write!(f, ">="),
73            Token::LessThan => write!(f, "<"),
74            Token::LessOrEqual => write!(f, "<="),
75            Token::Plus => write!(f, "+"),
76            Token::Minus => write!(f, "-"),
77            Token::Star => write!(f, "*"),
78            Token::Slash => write!(f, "/"),
79            Token::Percent => write!(f, "%"),
80            Token::LeftParen => write!(f, "("),
81            Token::RightParen => write!(f, ")"),
82            Token::Comma => write!(f, ","),
83            Token::Identifier(s) => write!(f, "identifier '{}'", s),
84            Token::StringLiteral(s) => write!(f, "string '{}'", s),
85            Token::IntegerLiteral(n) => write!(f, "integer {}", n),
86            Token::FloatLiteral(n) => write!(f, "float {}", n),
87            Token::Eof => write!(f, "end of input"),
88        }
89    }
90}
91
92pub struct Lexer {
93    input: Vec<char>,
94    position: usize,
95    current_char: Option<char>,
96}
97
98impl Lexer {
99    pub fn new(input: &str) -> Self {
100        let chars: Vec<char> = input.chars().collect();
101        let current_char = chars.first().copied();
102        Lexer {
103            input: chars,
104            position: 0,
105            current_char,
106        }
107    }
108
109    /// Format error message with position and input context
110    fn format_error(&self, message: &str) -> String {
111        format!("{} near position {} in:\n  {}",
112            message,
113            self.position,
114            String::from_iter(&self.input))
115    }
116
117    /// Advance to the next character
118    fn advance(&mut self) {
119        self.position += 1;
120        self.current_char = self.input.get(self.position).copied();
121    }
122
123    /// Peek at the next character without advancing
124    fn peek(&self) -> Option<char> {
125        self.input.get(self.position + 1).copied()
126    }
127
128    /// Skip whitespace characters
129    fn skip_whitespace(&mut self) {
130        while let Some(ch) = self.current_char {
131            if ch.is_whitespace() {
132                self.advance();
133            } else {
134                break;
135            }
136        }
137    }
138
139    /// Skip line comment (-- to end of line)
140    fn skip_line_comment(&mut self) {
141        // Skip the '--'
142        self.advance();
143        self.advance();
144
145        // Skip until newline or EOF
146        while let Some(ch) = self.current_char {
147            if ch == '\n' {
148                self.advance();
149                break;
150            }
151            self.advance();
152        }
153    }
154
155    /// Skip block comment (/* ... */)
156    fn skip_block_comment(&mut self) -> Result<(), String> {
157        // Skip the '/*'
158        self.advance();
159        self.advance();
160
161        // Look for '*/'
162        while let Some(ch) = self.current_char {
163            if ch == '*' && self.peek() == Some('/') {
164                self.advance(); // skip '*'
165                self.advance(); // skip '/'
166                return Ok(());
167            }
168            self.advance();
169        }
170
171        Err(self.format_error("Unterminated block comment"))
172    }
173
174    /// Read an identifier or keyword
175    fn read_identifier(&mut self) -> String {
176        let mut result = String::new();
177
178        while let Some(ch) = self.current_char {
179            if ch.is_alphanumeric() || ch == '_' || ch == '$' {
180                result.push(ch);
181                self.advance();
182            } else {
183                break;
184            }
185        }
186
187        result
188    }
189
190    /// Check if identifier is a keyword (case-insensitive)
191    fn keyword_or_identifier(&self, s: &str) -> Token {
192        match s.to_uppercase().as_str() {
193            "AND" => Token::And,
194            "OR" => Token::Or,
195            "NOT" => Token::Not,
196            "BETWEEN" => Token::Between,
197            "LIKE" => Token::Like,
198            "ESCAPE" => Token::Escape,
199            "IN" => Token::In,
200            "IS" => Token::Is,
201            "TRUE" => Token::True,
202            "FALSE" => Token::False,
203            "NULL" => Token::Null,
204            _ => Token::Identifier(s.to_string()),
205        }
206    }
207
208    /// Read a string literal with SQL-style escaping
209    fn read_string_literal(&mut self) -> Result<String, String> {
210        let mut result = String::new();
211
212        // Skip opening quote
213        self.advance();
214
215        while let Some(ch) = self.current_char {
216            if ch == '\'' {
217                // Check for escaped quote ('')
218                if self.peek() == Some('\'') {
219                    result.push('\'');
220                    self.advance(); // skip first '
221                    self.advance(); // skip second '
222                } else {
223                    // End of string
224                    self.advance(); // skip closing '
225                    return Ok(result);
226                }
227            } else {
228                result.push(ch);
229                self.advance();
230            }
231        }
232
233        Err(self.format_error("Unterminated string literal"))
234    }
235
236    /// Read a numeric literal (integer, long, hex, octal, or float)
237    fn read_number(&mut self) -> Result<Token, String> {
238        // Check for hex (0x or 0X)
239        if self.current_char == Some('0') && matches!(self.peek(), Some('x') | Some('X')) {
240            return self.read_hex_literal();
241        }
242
243        // Check for octal (starts with 0)
244        if self.current_char == Some('0') && self.peek().is_some_and(|c| c.is_ascii_digit()) {
245            return self.read_octal_literal();
246        }
247
248        // Read decimal or floating point
249        let mut num_str = String::new();
250        let mut is_float = false;
251
252        // Read integer part
253        while let Some(ch) = self.current_char {
254            if ch.is_ascii_digit() {
255                num_str.push(ch);
256                self.advance();
257            } else {
258                break;
259            }
260        }
261
262        // Check for decimal point
263        if self.current_char == Some('.') && self.peek().is_some_and(|c| c.is_ascii_digit() || c == 'e' || c == 'E') {
264            is_float = true;
265            num_str.push('.');
266            self.advance();
267
268            // Read fractional part
269            while let Some(ch) = self.current_char {
270                if ch.is_ascii_digit() {
271                    num_str.push(ch);
272                    self.advance();
273                } else {
274                    break;
275                }
276            }
277        }
278
279        // Check for exponent
280        if matches!(self.current_char, Some('e') | Some('E')) {
281            is_float = true;
282            num_str.push('e');
283            self.advance();
284
285            // Optional sign
286            if matches!(self.current_char, Some('+') | Some('-')) {
287                num_str.push(self.current_char.unwrap());
288                self.advance();
289            }
290
291            // Exponent digits
292            while let Some(ch) = self.current_char {
293                if ch.is_ascii_digit() {
294                    num_str.push(ch);
295                    self.advance();
296                } else {
297                    break;
298                }
299            }
300        }
301
302        // Check for long suffix (l or L) - treat as regular integer
303        if matches!(self.current_char, Some('l') | Some('L')) && !is_float {
304            self.advance();
305            let value = num_str.parse::<i64>()
306                .map_err(|e| self.format_error(&format!("Invalid integer literal: {}", e)))?;
307            return Ok(Token::IntegerLiteral(value));
308        }
309
310        // Parse as float or integer
311        if is_float {
312            let value = num_str.parse::<f64>()
313                .map_err(|e| self.format_error(&format!("Invalid float literal: {}", e)))?;
314            Ok(Token::FloatLiteral(value))
315        } else {
316            let value = num_str.parse::<i64>()
317                .map_err(|e| self.format_error(&format!("Invalid integer literal: {}", e)))?;
318            Ok(Token::IntegerLiteral(value))
319        }
320    }
321
322    /// Read hexadecimal literal (0x...)
323    fn read_hex_literal(&mut self) -> Result<Token, String> {
324        // Skip '0x' or '0X'
325        self.advance();
326        self.advance();
327
328        let mut hex_str = String::new();
329        while let Some(ch) = self.current_char {
330            if ch.is_ascii_hexdigit() {
331                hex_str.push(ch);
332                self.advance();
333            } else {
334                break;
335            }
336        }
337
338        if hex_str.is_empty() {
339            return Err(self.format_error("Invalid hexadecimal literal: no digits after 0x"));
340        }
341
342        let value = i64::from_str_radix(&hex_str, 16)
343            .map_err(|e| self.format_error(&format!("Invalid hexadecimal literal: {}", e)))?;
344        Ok(Token::IntegerLiteral(value))
345    }
346
347    /// Read octal literal (0...)
348    fn read_octal_literal(&mut self) -> Result<Token, String> {
349        let mut octal_str = String::new();
350
351        while let Some(ch) = self.current_char {
352            if ('0'..='7').contains(&ch) {
353                octal_str.push(ch);
354                self.advance();
355            } else {
356                break;
357            }
358        }
359
360        let value = i64::from_str_radix(&octal_str, 8)
361            .map_err(|e| self.format_error(&format!("Invalid octal literal: {}", e)))?;
362        Ok(Token::IntegerLiteral(value))
363    }
364
365    /// Read floating point literal starting with '.'
366    fn read_float_starting_with_dot(&mut self) -> Result<Token, String> {
367        let mut num_str = String::from("0.");
368
369        // Skip the '.'
370        self.advance();
371
372        // Read fractional part
373        while let Some(ch) = self.current_char {
374            if ch.is_ascii_digit() {
375                num_str.push(ch);
376                self.advance();
377            } else {
378                break;
379            }
380        }
381
382        // Check for exponent
383        if matches!(self.current_char, Some('e') | Some('E')) {
384            num_str.push('e');
385            self.advance();
386
387            // Optional sign
388            if matches!(self.current_char, Some('+') | Some('-')) {
389                num_str.push(self.current_char.unwrap());
390                self.advance();
391            }
392
393            // Exponent digits
394            while let Some(ch) = self.current_char {
395                if ch.is_ascii_digit() {
396                    num_str.push(ch);
397                    self.advance();
398                } else {
399                    break;
400                }
401            }
402        }
403
404        let value = num_str.parse::<f64>()
405            .map_err(|e| self.format_error(&format!("Invalid float literal: {}", e)))?;
406        Ok(Token::FloatLiteral(value))
407    }
408
409    /// Get the next token
410    pub fn next_token(&mut self) -> Result<Token, String> {
411        loop {
412            // Skip whitespace
413            self.skip_whitespace();
414
415            let ch = match self.current_char {
416                Some(c) => c,
417                None => return Ok(Token::Eof),
418            };
419
420            // Check for comments
421            if ch == '-' && self.peek() == Some('-') {
422                self.skip_line_comment();
423                continue;
424            }
425
426            if ch == '/' && self.peek() == Some('*') {
427                self.skip_block_comment()?;
428                continue;
429            }
430
431            // Single-character tokens
432            match ch {
433                '(' => {
434                    self.advance();
435                    return Ok(Token::LeftParen);
436                }
437                ')' => {
438                    self.advance();
439                    return Ok(Token::RightParen);
440                }
441                ',' => {
442                    self.advance();
443                    return Ok(Token::Comma);
444                }
445                '+' => {
446                    self.advance();
447                    return Ok(Token::Plus);
448                }
449                '-' => {
450                    self.advance();
451                    return Ok(Token::Minus);
452                }
453                '*' => {
454                    self.advance();
455                    return Ok(Token::Star);
456                }
457                '/' => {
458                    self.advance();
459                    return Ok(Token::Slash);
460                }
461                '%' => {
462                    self.advance();
463                    return Ok(Token::Percent);
464                }
465                '=' => {
466                    self.advance();
467                    return Ok(Token::Equal);
468                }
469                '!' => {
470                    if self.peek() == Some('=') {
471                        self.advance();
472                        self.advance();
473                        return Ok(Token::NotEqual);
474                    }
475                    return Err(self.format_error(&format!("Unexpected character: '{}'", ch)));
476                }
477                '<' => {
478                    self.advance();
479                    if self.current_char == Some('>') {
480                        self.advance();
481                        return Ok(Token::NotEqual);
482                    } else if self.current_char == Some('=') {
483                        self.advance();
484                        return Ok(Token::LessOrEqual);
485                    }
486                    return Ok(Token::LessThan);
487                }
488                '>' => {
489                    self.advance();
490                    if self.current_char == Some('=') {
491                        self.advance();
492                        return Ok(Token::GreaterOrEqual);
493                    }
494                    return Ok(Token::GreaterThan);
495                }
496                '\'' => {
497                    let s = self.read_string_literal()?;
498                    return Ok(Token::StringLiteral(s));
499                }
500                '.' => {
501                    // Check if this is a float starting with '.'
502                    if self.peek().is_some_and(|c| c.is_ascii_digit()) {
503                        return self.read_float_starting_with_dot();
504                    }
505                    return Err(self.format_error(&format!("Unexpected character: '{}'", ch)));
506                }
507                _ => {
508                    // Identifiers and keywords
509                    if ch.is_alphabetic() || ch == '_' || ch == '$' {
510                        let ident = self.read_identifier();
511                        return Ok(self.keyword_or_identifier(&ident));
512                    }
513
514                    // Numbers
515                    if ch.is_ascii_digit() {
516                        return self.read_number();
517                    }
518
519                    return Err(self.format_error(&format!("Unexpected character: '{}'", ch)));
520                }
521            }
522        }
523    }
524
525    /// Tokenize the entire input
526    pub fn tokenize(&mut self) -> Result<Vec<Token>, String> {
527        let mut tokens = Vec::new();
528        loop {
529            let token = self.next_token()?;
530            if token == Token::Eof {
531                tokens.push(token);
532                break;
533            }
534            tokens.push(token);
535        }
536        Ok(tokens)
537    }
538}
539
#[cfg(test)]
mod tests {
    use super::*;

    /// Pulls one token from `lexer` per entry in `expected` and checks that
    /// each matches, in order.
    fn expect_next(lexer: &mut Lexer, expected: Vec<Token>) {
        for want in expected {
            assert_eq!(lexer.next_token().unwrap(), want);
        }
    }

    /// Keywords are recognised regardless of letter case.
    #[test]
    fn test_keywords() {
        let mut lexer = Lexer::new("AND or Not BETWEEN");
        expect_next(
            &mut lexer,
            vec![Token::And, Token::Or, Token::Not, Token::Between],
        );
    }

    /// A doubled quote inside a string literal yields a single quote.
    #[test]
    fn test_string_literal() {
        let mut lexer = Lexer::new("'hello' 'it''s me'");
        expect_next(
            &mut lexer,
            vec![
                Token::StringLiteral("hello".to_string()),
                Token::StringLiteral("it's me".to_string()),
            ],
        );
    }

    /// Decimal, hex, octal, float, exponent and long-suffixed literals.
    #[test]
    fn test_numbers() {
        let mut lexer = Lexer::new("42 0x1A 077 3.14 1e-5 100L");
        expect_next(
            &mut lexer,
            vec![
                Token::IntegerLiteral(42),
                Token::IntegerLiteral(26), // 0x1A
                Token::IntegerLiteral(63), // 077 octal
                Token::FloatLiteral(3.14),
            ],
        );
        // 1e-5: only the token kind is checked, not the exact value.
        assert!(matches!(lexer.next_token().unwrap(), Token::FloatLiteral(_)));
        // 100L is treated as a plain integer.
        expect_next(&mut lexer, vec![Token::IntegerLiteral(100)]);
    }

    /// Line and block comments are skipped between tokens.
    #[test]
    fn test_comments() {
        let mut lexer = Lexer::new("x -- comment\ny /* block */ z");
        for _ in 0..3 {
            assert!(matches!(lexer.next_token().unwrap(), Token::Identifier(_)));
        }
    }
}