Skip to main content

sqlexpr_rust/
lexer.rs

1//! Lexer/Tokenizer for SQL Expression Parser
2//!
3//! This internal module handles tokenization of SQL expressions, including:
//! - Case-insensitive keywords
//! - String literals with SQL-style escaping
//! - Numeric literals (decimal, hex, octal, floating-point)
7//! - Comments (line and block)
8//! - Whitespace handling
9
10use std::fmt;
11
/// A single lexical token of the SQL expression language.
///
/// Variants are grouped into keywords, operators, delimiters, literals
/// (which carry their decoded value), and the end-of-input marker.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ---- Keywords (recognized case-insensitively) ----
    /// The `AND` keyword.
    And,
    /// The `OR` keyword.
    Or,
    /// The `NOT` keyword.
    Not,
    /// The `BETWEEN` keyword.
    Between,
    /// The `LIKE` keyword.
    Like,
    /// The `ESCAPE` keyword.
    Escape,
    /// The `IN` keyword.
    In,
    /// The `IS` keyword.
    Is,
    /// The `TRUE` keyword.
    True,
    /// The `FALSE` keyword.
    False,
    /// The `NULL` keyword.
    Null,

    // ---- Operators ----
    /// `=`
    Equal,
    /// `<>` or `!=`
    NotEqual,
    /// `>`
    GreaterThan,
    /// `>=`
    GreaterOrEqual,
    /// `<`
    LessThan,
    /// `<=`
    LessOrEqual,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `%`
    Percent,

    // ---- Delimiters ----
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `,`
    Comma,

    // ---- Literals ----
    /// A name that is not one of the keywords; holds the text as written.
    Identifier(String),
    /// A quoted string; holds the contents with `''` already collapsed to `'`.
    StringLiteral(String),
    /// An integer written in decimal, hexadecimal, or octal notation.
    IntegerLiteral(i64),
    /// A number with a fractional part and/or an exponent.
    FloatLiteral(f64),

    // ---- End of input ----
    /// Marks the end of the input.
    Eof,
}
57
58impl fmt::Display for Token {
59    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
60        match self {
61            Token::And => write!(f, "AND"),
62            Token::Or => write!(f, "OR"),
63            Token::Not => write!(f, "NOT"),
64            Token::Between => write!(f, "BETWEEN"),
65            Token::Like => write!(f, "LIKE"),
66            Token::Escape => write!(f, "ESCAPE"),
67            Token::In => write!(f, "IN"),
68            Token::Is => write!(f, "IS"),
69            Token::True => write!(f, "TRUE"),
70            Token::False => write!(f, "FALSE"),
71            Token::Null => write!(f, "NULL"),
72            Token::Equal => write!(f, "="),
73            Token::NotEqual => write!(f, "<>"),
74            Token::GreaterThan => write!(f, ">"),
75            Token::GreaterOrEqual => write!(f, ">="),
76            Token::LessThan => write!(f, "<"),
77            Token::LessOrEqual => write!(f, "<="),
78            Token::Plus => write!(f, "+"),
79            Token::Minus => write!(f, "-"),
80            Token::Star => write!(f, "*"),
81            Token::Slash => write!(f, "/"),
82            Token::Percent => write!(f, "%"),
83            Token::LeftParen => write!(f, "("),
84            Token::RightParen => write!(f, ")"),
85            Token::Comma => write!(f, ","),
86            Token::Identifier(s) => write!(f, "identifier '{}'", s),
87            Token::StringLiteral(s) => write!(f, "string '{}'", s),
88            Token::IntegerLiteral(n) => write!(f, "integer {}", n),
89            Token::FloatLiteral(n) => write!(f, "float {}", n),
90            Token::Eof => write!(f, "end of input"),
91        }
92    }
93}
94
/// Character-by-character scanner used while tokenizing one input string.
///
/// The lexer owns a decoded copy of the input and tracks a cursor into it.
/// It derives `Debug` and `Clone` so that it can be inspected in diagnostics
/// and so that a caller can snapshot the scan position.
#[derive(Debug, Clone)]
pub struct Lexer {
    /// The complete input, stored as individual characters.
    input: Vec<char>,
    /// Index into `input` of the character currently under the cursor.
    position: usize,
    /// The character at `position`, or `None` when the cursor is past the end.
    current_char: Option<char>,
}
101
102impl Lexer {
103    pub fn new(input: &str) -> Self {
104        let chars: Vec<char> = input.chars().collect();
105        let current_char = chars.first().copied();
106        Lexer {
107            input: chars,
108            position: 0,
109            current_char,
110        }
111    }
112
113    /// Format error message with position and input context
114    fn format_error(&self, message: &str) -> String {
115        format!("{} near position {} in:\n  {}",
116            message,
117            self.position,
118            String::from_iter(&self.input))
119    }
120
121    /// Advance to the next character
122    fn advance(&mut self) {
123        self.position += 1;
124        self.current_char = self.input.get(self.position).copied();
125    }
126
127    /// Peek at the next character without advancing
128    fn peek(&self) -> Option<char> {
129        self.input.get(self.position + 1).copied()
130    }
131
132    /// Skip whitespace characters
133    fn skip_whitespace(&mut self) {
134        while let Some(ch) = self.current_char {
135            if ch.is_whitespace() {
136                self.advance();
137            } else {
138                break;
139            }
140        }
141    }
142
143    /// Skip line comment (-- to end of line)
144    fn skip_line_comment(&mut self) {
145        // Skip the '--'
146        self.advance();
147        self.advance();
148
149        // Skip until newline or EOF
150        while let Some(ch) = self.current_char {
151            if ch == '\n' {
152                self.advance();
153                break;
154            }
155            self.advance();
156        }
157    }
158
159    /// Skip block comment (/* ... */)
160    fn skip_block_comment(&mut self) -> Result<(), String> {
161        // Skip the '/*'
162        self.advance();
163        self.advance();
164
165        // Look for '*/'
166        while let Some(ch) = self.current_char {
167            if ch == '*' && self.peek() == Some('/') {
168                self.advance(); // skip '*'
169                self.advance(); // skip '/'
170                return Ok(());
171            }
172            self.advance();
173        }
174
175        Err(self.format_error("Unterminated block comment"))
176    }
177
178    /// Read an identifier or keyword
179    fn read_identifier(&mut self) -> String {
180        let mut result = String::new();
181
182        while let Some(ch) = self.current_char {
183            if ch.is_alphanumeric() || ch == '_' || ch == '$' {
184                result.push(ch);
185                self.advance();
186            } else {
187                break;
188            }
189        }
190
191        result
192    }
193
194    /// Check if identifier is a keyword (case-insensitive)
195    fn keyword_or_identifier(&self, s: &str) -> Token {
196        match s.to_uppercase().as_str() {
197            "AND" => Token::And,
198            "OR" => Token::Or,
199            "NOT" => Token::Not,
200            "BETWEEN" => Token::Between,
201            "LIKE" => Token::Like,
202            "ESCAPE" => Token::Escape,
203            "IN" => Token::In,
204            "IS" => Token::Is,
205            "TRUE" => Token::True,
206            "FALSE" => Token::False,
207            "NULL" => Token::Null,
208            _ => Token::Identifier(s.to_string()),
209        }
210    }
211
212    /// Read a string literal with SQL-style escaping
213    fn read_string_literal(&mut self) -> Result<String, String> {
214        let mut result = String::new();
215
216        // Skip opening quote
217        self.advance();
218
219        while let Some(ch) = self.current_char {
220            if ch == '\'' {
221                // Check for escaped quote ('')
222                if self.peek() == Some('\'') {
223                    result.push('\'');
224                    self.advance(); // skip first '
225                    self.advance(); // skip second '
226                } else {
227                    // End of string
228                    self.advance(); // skip closing '
229                    return Ok(result);
230                }
231            } else {
232                result.push(ch);
233                self.advance();
234            }
235        }
236
237        Err(self.format_error("Unterminated string literal"))
238    }
239
240    /// Read a numeric literal (integer, long, hex, octal, or float)
241    fn read_number(&mut self) -> Result<Token, String> {
242        // Check for hex (0x or 0X)
243        if self.current_char == Some('0') && matches!(self.peek(), Some('x') | Some('X')) {
244            return self.read_hex_literal();
245        }
246
247        // Check for octal (starts with 0)
248        if self.current_char == Some('0') && self.peek().is_some_and(|c| c.is_ascii_digit()) {
249            return self.read_octal_literal();
250        }
251
252        // Read decimal or floating point
253        let mut num_str = String::new();
254        let mut is_float = false;
255
256        // Read integer part
257        while let Some(ch) = self.current_char {
258            if ch.is_ascii_digit() {
259                num_str.push(ch);
260                self.advance();
261            } else {
262                break;
263            }
264        }
265
266        // Check for decimal point
267        if self.current_char == Some('.') && self.peek().is_some_and(|c| c.is_ascii_digit() || c == 'e' || c == 'E') {
268            is_float = true;
269            num_str.push('.');
270            self.advance();
271
272            // Read fractional part
273            while let Some(ch) = self.current_char {
274                if ch.is_ascii_digit() {
275                    num_str.push(ch);
276                    self.advance();
277                } else {
278                    break;
279                }
280            }
281        }
282
283        // Check for exponent
284        if matches!(self.current_char, Some('e') | Some('E')) {
285            is_float = true;
286            num_str.push('e');
287            self.advance();
288
289            // Optional sign
290            if matches!(self.current_char, Some('+') | Some('-')) {
291                num_str.push(self.current_char.unwrap());
292                self.advance();
293            }
294
295            // Exponent digits
296            while let Some(ch) = self.current_char {
297                if ch.is_ascii_digit() {
298                    num_str.push(ch);
299                    self.advance();
300                } else {
301                    break;
302                }
303            }
304        }
305
306        // Check for long suffix (l or L) - treat as regular integer
307        if matches!(self.current_char, Some('l') | Some('L')) && !is_float {
308            self.advance();
309            let value = num_str.parse::<i64>()
310                .map_err(|e| self.format_error(&format!("Invalid integer literal: {}", e)))?;
311            return Ok(Token::IntegerLiteral(value));
312        }
313
314        // Parse as float or integer
315        if is_float {
316            let value = num_str.parse::<f64>()
317                .map_err(|e| self.format_error(&format!("Invalid float literal: {}", e)))?;
318            Ok(Token::FloatLiteral(value))
319        } else {
320            let value = num_str.parse::<i64>()
321                .map_err(|e| self.format_error(&format!("Invalid integer literal: {}", e)))?;
322            Ok(Token::IntegerLiteral(value))
323        }
324    }
325
326    /// Read hexadecimal literal (0x...)
327    fn read_hex_literal(&mut self) -> Result<Token, String> {
328        // Skip '0x' or '0X'
329        self.advance();
330        self.advance();
331
332        let mut hex_str = String::new();
333        while let Some(ch) = self.current_char {
334            if ch.is_ascii_hexdigit() {
335                hex_str.push(ch);
336                self.advance();
337            } else {
338                break;
339            }
340        }
341
342        if hex_str.is_empty() {
343            return Err(self.format_error("Invalid hexadecimal literal: no digits after 0x"));
344        }
345
346        let value = i64::from_str_radix(&hex_str, 16)
347            .map_err(|e| self.format_error(&format!("Invalid hexadecimal literal: {}", e)))?;
348        Ok(Token::IntegerLiteral(value))
349    }
350
351    /// Read octal literal (0...)
352    fn read_octal_literal(&mut self) -> Result<Token, String> {
353        let mut octal_str = String::new();
354
355        while let Some(ch) = self.current_char {
356            if ('0'..='7').contains(&ch) {
357                octal_str.push(ch);
358                self.advance();
359            } else {
360                break;
361            }
362        }
363
364        let value = i64::from_str_radix(&octal_str, 8)
365            .map_err(|e| self.format_error(&format!("Invalid octal literal: {}", e)))?;
366        Ok(Token::IntegerLiteral(value))
367    }
368
369    /// Read floating point literal starting with '.'
370    fn read_float_starting_with_dot(&mut self) -> Result<Token, String> {
371        let mut num_str = String::from("0.");
372
373        // Skip the '.'
374        self.advance();
375
376        // Read fractional part
377        while let Some(ch) = self.current_char {
378            if ch.is_ascii_digit() {
379                num_str.push(ch);
380                self.advance();
381            } else {
382                break;
383            }
384        }
385
386        // Check for exponent
387        if matches!(self.current_char, Some('e') | Some('E')) {
388            num_str.push('e');
389            self.advance();
390
391            // Optional sign
392            if matches!(self.current_char, Some('+') | Some('-')) {
393                num_str.push(self.current_char.unwrap());
394                self.advance();
395            }
396
397            // Exponent digits
398            while let Some(ch) = self.current_char {
399                if ch.is_ascii_digit() {
400                    num_str.push(ch);
401                    self.advance();
402                } else {
403                    break;
404                }
405            }
406        }
407
408        let value = num_str.parse::<f64>()
409            .map_err(|e| self.format_error(&format!("Invalid float literal: {}", e)))?;
410        Ok(Token::FloatLiteral(value))
411    }
412
413    /// Get the next token
414    pub fn next_token(&mut self) -> Result<Token, String> {
415        loop {
416            // Skip whitespace
417            self.skip_whitespace();
418
419            let ch = match self.current_char {
420                Some(c) => c,
421                None => return Ok(Token::Eof),
422            };
423
424            // Check for comments
425            if ch == '-' && self.peek() == Some('-') {
426                self.skip_line_comment();
427                continue;
428            }
429
430            if ch == '/' && self.peek() == Some('*') {
431                self.skip_block_comment()?;
432                continue;
433            }
434
435            // Single-character tokens
436            match ch {
437                '(' => {
438                    self.advance();
439                    return Ok(Token::LeftParen);
440                }
441                ')' => {
442                    self.advance();
443                    return Ok(Token::RightParen);
444                }
445                ',' => {
446                    self.advance();
447                    return Ok(Token::Comma);
448                }
449                '+' => {
450                    self.advance();
451                    return Ok(Token::Plus);
452                }
453                '-' => {
454                    self.advance();
455                    return Ok(Token::Minus);
456                }
457                '*' => {
458                    self.advance();
459                    return Ok(Token::Star);
460                }
461                '/' => {
462                    self.advance();
463                    return Ok(Token::Slash);
464                }
465                '%' => {
466                    self.advance();
467                    return Ok(Token::Percent);
468                }
469                '=' => {
470                    self.advance();
471                    return Ok(Token::Equal);
472                }
473                '!' => {
474                    if self.peek() == Some('=') {
475                        self.advance();
476                        self.advance();
477                        return Ok(Token::NotEqual);
478                    }
479                    return Err(self.format_error(&format!("Unexpected character: '{}'", ch)));
480                }
481                '<' => {
482                    self.advance();
483                    if self.current_char == Some('>') {
484                        self.advance();
485                        return Ok(Token::NotEqual);
486                    } else if self.current_char == Some('=') {
487                        self.advance();
488                        return Ok(Token::LessOrEqual);
489                    }
490                    return Ok(Token::LessThan);
491                }
492                '>' => {
493                    self.advance();
494                    if self.current_char == Some('=') {
495                        self.advance();
496                        return Ok(Token::GreaterOrEqual);
497                    }
498                    return Ok(Token::GreaterThan);
499                }
500                '\'' => {
501                    let s = self.read_string_literal()?;
502                    return Ok(Token::StringLiteral(s));
503                }
504                '.' => {
505                    // Check if this is a float starting with '.'
506                    if self.peek().is_some_and(|c| c.is_ascii_digit()) {
507                        return self.read_float_starting_with_dot();
508                    }
509                    return Err(self.format_error(&format!("Unexpected character: '{}'", ch)));
510                }
511                _ => {
512                    // Identifiers and keywords
513                    if ch.is_alphabetic() || ch == '_' || ch == '$' {
514                        let ident = self.read_identifier();
515                        return Ok(self.keyword_or_identifier(&ident));
516                    }
517
518                    // Numbers
519                    if ch.is_ascii_digit() {
520                        return self.read_number();
521                    }
522
523                    return Err(self.format_error(&format!("Unexpected character: '{}'", ch)));
524                }
525            }
526        }
527    }
528
529    /// Tokenize the entire input
530    pub fn tokenize(&mut self) -> Result<Vec<Token>, String> {
531        let mut tokens = Vec::new();
532        loop {
533            let token = self.next_token()?;
534            if token == Token::Eof {
535                tokens.push(token);
536                break;
537            }
538            tokens.push(token);
539        }
540        Ok(tokens)
541    }
542}
543
#[cfg(test)]
mod tests {
    use super::*;

    /// Pulls tokens from a fresh lexer over `input` one at a time, checks each
    /// against `expected`, and then checks that the lexer reports end of input.
    fn assert_tokens(input: &str, expected: &[Token]) {
        let mut lexer = Lexer::new(input);
        for want in expected {
            assert_eq!(&lexer.next_token().unwrap(), want);
        }
        assert_eq!(lexer.next_token().unwrap(), Token::Eof);
    }

    /// Shorthand for an identifier token.
    fn ident(name: &str) -> Token {
        Token::Identifier(name.to_string())
    }

    /// Keywords are recognized regardless of letter case.
    #[test]
    fn test_keywords() {
        assert_tokens(
            "AND or Not BETWEEN",
            &[Token::And, Token::Or, Token::Not, Token::Between],
        );
    }

    /// String literals are unquoted and `''` decodes to a single quote.
    #[test]
    fn test_string_literal() {
        assert_tokens(
            "'hello' 'it''s me'",
            &[
                Token::StringLiteral("hello".to_string()),
                Token::StringLiteral("it's me".to_string()),
            ],
        );
    }

    /// Covers decimal, hex, octal, fractional, exponent, and long-suffixed
    /// literals, checking the decoded value of each one.
    #[test]
    fn test_numbers() {
        assert_tokens(
            "42 0x1A 077 2.5 1e-5 100L",
            &[
                Token::IntegerLiteral(42),
                Token::IntegerLiteral(26), // 0x1A
                Token::IntegerLiteral(63), // 077 octal
                Token::FloatLiteral(2.5),
                Token::FloatLiteral(1e-5),
                Token::IntegerLiteral(100), // 100L treated as integer
            ],
        );
    }

    /// Line and block comments are skipped and do not merge or drop the
    /// identifiers around them.
    #[test]
    fn test_comments() {
        assert_tokens(
            "x -- comment\ny /* block */ z",
            &[ident("x"), ident("y"), ident("z")],
        );
    }
}