Skip to main content

parser/
lexer.rs

1//! Lexer implementation. Generated by CongoCC Parser Generator. Do not edit.
2
3use crate::error::{ParseError, ParseResult};
4use crate::tokens::{Token, TokenType, LexicalState, TokenSource};
5
6/// The lexer/tokenizer for SqlExprParser
7pub struct Lexer {
8    /// The input string being tokenized
9    input: String,
10    /// Current position in the input
11    position: usize,
12    /// Current lexical state
13    state: LexicalState,
14    /// All tokens generated so far
15    tokens: Vec<Token>,
16    /// Current line number (1-indexed)
17    current_line: usize,
18    /// Current column number (1-indexed)
19    current_column: usize,
20    /// Line start offsets for quick line/column lookup
21    line_starts: Vec<usize>,
22}
23
24impl Lexer {
25    /// Create a new lexer for the given input
26    pub fn new(input: String) -> Self {
27        let mut line_starts = vec![0];
28        for (i, ch) in input.char_indices() {
29            if ch == '\n' {
30                line_starts.push(i + 1);
31            }
32        }
33
34        Lexer {
35            input,
36            position: 0,
37            state: LexicalState::DEFAULT,
38            tokens: Vec::new(),
39            current_line: 1,
40            current_column: 1,
41            line_starts,
42        }
43    }
44
45    /// Get the next token from the input
46    pub fn next_token(&mut self) -> ParseResult<Token> {
47        // Skip whitespace and comments based on lexical state
48        self.skip_ignored()?;
49
50        if self.position >= self.input.len() {
51            // Return EOF token
52            return Ok(Token::new(
53                TokenType::EOF,
54                String::new(),
55                self.position,
56                self.position,
57            ));
58        }
59
60        let start_pos = self.position;
61        let start_line = self.current_line;
62        let start_column = self.current_column;
63
64        // Try to match each token type in order
65        if let Some(token) = self.try_match_token(start_pos)? {
66            return Ok(token);
67        }
68
69        // If no token matched, it's an error
70        Err(ParseError::at_location(
71            format!("Unexpected character: '{}'", self.current_char()),
72            start_line,
73            start_column,
74        ))
75    }
76
77    /// Try to match a token at the current position
78    fn try_match_token(&mut self, start_pos: usize) -> ParseResult<Option<Token>> {
79        let ch = self.current_char();
80
81        if self.matches_string("!=") {
82            return Ok(Some(self.consume_literal(TokenType::NE, "!=", start_pos)));
83        }
84        if self.matches_string("<>") {
85            return Ok(Some(self.consume_literal(TokenType::NE, "<>", start_pos)));
86        }
87        if self.matches_string(">=") {
88            return Ok(Some(self.consume_literal(TokenType::GE, ">=", start_pos)));
89        }
90        if self.matches_string("<=") {
91            return Ok(Some(self.consume_literal(TokenType::LE, "<=", start_pos)));
92        }
93        match ch {
94            ' ' => {
95                self.advance();
96                return Ok(Some(Token::new(
97                    TokenType::SPACE,
98                    " ".to_string(),
99                    start_pos,
100                    self.position,
101                )));
102            }
103            '\t' => {
104                self.advance();
105                return Ok(Some(Token::new(
106                    TokenType::TAB,
107                    "\t".to_string(),
108                    start_pos,
109                    self.position,
110                )));
111            }
112            '\n' => {
113                self.advance();
114                return Ok(Some(Token::new(
115                    TokenType::NEWLINE,
116                    "\n".to_string(),
117                    start_pos,
118                    self.position,
119                )));
120            }
121            '\r' => {
122                self.advance();
123                return Ok(Some(Token::new(
124                    TokenType::CR,
125                    "\r".to_string(),
126                    start_pos,
127                    self.position,
128                )));
129            }
130            '\x0c' => {
131                self.advance();
132                return Ok(Some(Token::new(
133                    TokenType::FORM_FEED,
134                    "\x0c".to_string(),
135                    start_pos,
136                    self.position,
137                )));
138            }
139            '=' => {
140                self.advance();
141                return Ok(Some(Token::new(
142                    TokenType::EQ,
143                    "=".to_string(),
144                    start_pos,
145                    self.position,
146                )));
147            }
148            '>' => {
149                self.advance();
150                return Ok(Some(Token::new(
151                    TokenType::GT,
152                    ">".to_string(),
153                    start_pos,
154                    self.position,
155                )));
156            }
157            '<' => {
158                self.advance();
159                return Ok(Some(Token::new(
160                    TokenType::LT,
161                    "<".to_string(),
162                    start_pos,
163                    self.position,
164                )));
165            }
166            '(' => {
167                self.advance();
168                return Ok(Some(Token::new(
169                    TokenType::LPAREN,
170                    "(".to_string(),
171                    start_pos,
172                    self.position,
173                )));
174            }
175            ',' => {
176                self.advance();
177                return Ok(Some(Token::new(
178                    TokenType::COMMA,
179                    ",".to_string(),
180                    start_pos,
181                    self.position,
182                )));
183            }
184            ')' => {
185                self.advance();
186                return Ok(Some(Token::new(
187                    TokenType::RPAREN,
188                    ")".to_string(),
189                    start_pos,
190                    self.position,
191                )));
192            }
193            '+' => {
194                self.advance();
195                return Ok(Some(Token::new(
196                    TokenType::PLUS,
197                    "+".to_string(),
198                    start_pos,
199                    self.position,
200                )));
201            }
202            '-' => {
203                self.advance();
204                return Ok(Some(Token::new(
205                    TokenType::MINUS,
206                    "-".to_string(),
207                    start_pos,
208                    self.position,
209                )));
210            }
211            '*' => {
212                self.advance();
213                return Ok(Some(Token::new(
214                    TokenType::STAR,
215                    "*".to_string(),
216                    start_pos,
217                    self.position,
218                )));
219            }
220            '/' => {
221                self.advance();
222                return Ok(Some(Token::new(
223                    TokenType::SLASH,
224                    "/".to_string(),
225                    start_pos,
226                    self.position,
227                )));
228            }
229            '%' => {
230                self.advance();
231                return Ok(Some(Token::new(
232                    TokenType::PERCENT,
233                    "%".to_string(),
234                    start_pos,
235                    self.position,
236                )));
237            }
238            _ => {}
239        }
240
241        // String literal (single-quoted)
242        if ch == '\'' {
243            return self.match_string_literal(start_pos);
244        }
245
246        // Numeric literals (including leading-dot floats like .5)
247        if ch.is_ascii_digit()
248            || (ch == '.' && self.peek(1).is_some_and(|c| c.is_ascii_digit()))
249        {
250            return self.match_number(start_pos);
251        }
252
253        // Identifiers and keywords
254        if ch.is_ascii_alphabetic() || ch == '_' || ch == '$' {
255            return self.match_identifier_or_keyword(start_pos);
256        }
257
258        // No token matched
259        Ok(None)
260    }
261
262    /// Consume a literal string token
263    fn consume_literal(&mut self, token_type: TokenType, literal: &str, start_pos: usize) -> Token {
264        for _ in 0..literal.len() {
265            self.advance();
266        }
267        Token::new(token_type, literal.to_string(), start_pos, self.position)
268    }
269
270    /// Match a single-quoted string literal
271    fn match_string_literal(&mut self, start_pos: usize) -> ParseResult<Option<Token>> {
272        // Consume opening quote
273        self.advance();
274        while self.position < self.input.len() {
275            let ch = self.current_char();
276            if ch == '\'' {
277                // Check for escaped quote ('')
278                if self.peek(1) == Some('\'') {
279                    self.advance(); // consume first '
280                    self.advance(); // consume second '
281                    continue;
282                }
283                self.advance(); // consume closing quote
284                let image = self.input[start_pos..self.position].to_string();
285                return Ok(Some(Token::new(
286                    TokenType::STRING_LITERAL,
287                    image,
288                    start_pos,
289                    self.position,
290                )));
291            }
292            self.advance();
293        }
294        // Unterminated string literal
295        Err(ParseError::at_position(
296            "Unterminated string literal".to_string(),
297            start_pos,
298        ))
299    }
300
301    /// Match a numeric literal (integer, hex, octal, or decimal/float)
302    fn match_number(&mut self, start_pos: usize) -> ParseResult<Option<Token>> {
303        // Check for hex (0x/0X) or octal (leading 0 + digits) prefix
304        if self.current_char() == '0' {
305            if self.peek(1).is_some_and(|ch| ch == 'x' || ch == 'X') {
306                // Hex literal: 0x followed by hex digits
307                self.advance(); // consume '0'
308                self.advance(); // consume 'x'/'X'
309                if self.position >= self.input.len() || !self.current_char().is_ascii_hexdigit() {
310                    return Err(ParseError::at_position(
311                        "Expected hex digit after 0x".to_string(),
312                        start_pos,
313                    ));
314                }
315                while self.position < self.input.len() && self.current_char().is_ascii_hexdigit() {
316                    self.advance();
317                }
318                // Optional long suffix
319                if self.position < self.input.len() && matches!(self.current_char(), 'L' | 'l') {
320                    self.advance();
321                }
322                let image = self.input[start_pos..self.position].to_string();
323                return Ok(Some(Token::new(
324                    TokenType::HEX_LITERAL,
325                    image,
326                    start_pos,
327                    self.position,
328                )));
329            }
330            if self.peek(1).is_some_and(|ch| ('0'..='7').contains(&ch)) {
331                // Octal literal: 0 followed by octal digits
332                self.advance(); // consume leading '0'
333                while self.position < self.input.len() && ('0'..='7').contains(&self.current_char()) {
334                    self.advance();
335                }
336                // Optional long suffix
337                if self.position < self.input.len() && matches!(self.current_char(), 'L' | 'l') {
338                    self.advance();
339                }
340                let image = self.input[start_pos..self.position].to_string();
341                return Ok(Some(Token::new(
342                    TokenType::OCTAL_LITERAL,
343                    image,
344                    start_pos,
345                    self.position,
346                )));
347            }
348        }
349
350        // Consume leading digits
351        let mut is_float = false;
352        while self.position < self.input.len() && self.current_char().is_ascii_digit() {
353            self.advance();
354        }
355        // Check for decimal point followed by digits
356        if self.position < self.input.len() && self.current_char() == '.'
357            && self.peek(1).is_some_and(|ch| ch.is_ascii_digit())
358        {
359            is_float = true;
360            self.advance(); // consume '.'
361            while self.position < self.input.len() && self.current_char().is_ascii_digit() {
362                self.advance();
363            }
364        }
365        // Check for exponent (e/E followed by optional +/- and digits)
366        if self.position < self.input.len() && matches!(self.current_char(), 'e' | 'E') {
367            is_float = true;
368            self.advance(); // consume 'e'/'E'
369            if self.position < self.input.len() && matches!(self.current_char(), '+' | '-') {
370                self.advance(); // consume sign
371            }
372            if self.position >= self.input.len() || !self.current_char().is_ascii_digit() {
373                return Err(ParseError::at_position(
374                    "Expected digit in exponent".to_string(),
375                    start_pos,
376                ));
377            }
378            while self.position < self.input.len() && self.current_char().is_ascii_digit() {
379                self.advance();
380            }
381        }
382        if is_float {
383            let image = self.input[start_pos..self.position].to_string();
384            return Ok(Some(Token::new(
385                TokenType::FLOATING_POINT_LITERAL,
386                image,
387                start_pos,
388                self.position,
389            )));
390        }
391        // Optional long suffix for integer literals
392        if self.position < self.input.len() && matches!(self.current_char(), 'L' | 'l') {
393            self.advance();
394        }
395        let image = self.input[start_pos..self.position].to_string();
396        Ok(Some(Token::new(
397            TokenType::DECIMAL_LITERAL,
398            image,
399            start_pos,
400            self.position,
401        )))
402    }
403
404    /// Match an identifier or keyword
405    fn match_identifier_or_keyword(&mut self, start_pos: usize) -> ParseResult<Option<Token>> {
406        // Consume identifier characters
407        while self.position < self.input.len() {
408            let ch = self.current_char();
409            if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' {
410                self.advance();
411            } else {
412                break;
413            }
414        }
415        let image = self.input[start_pos..self.position].to_string();
416        let upper = image.to_ascii_uppercase();
417
418        // Check against keywords (case-insensitive)
419        let token_type = match upper.as_str() {
420            "NOT" => TokenType::NOT,
421            "AND" => TokenType::AND,
422            "OR" => TokenType::OR,
423            "BETWEEN" => TokenType::BETWEEN,
424            "LIKE" => TokenType::LIKE,
425            "ESCAPE" => TokenType::ESCAPE,
426            "IN" => TokenType::IN,
427            "IS" => TokenType::IS,
428            "TRUE" => TokenType::TRUE,
429            "FALSE" => TokenType::FALSE,
430            "NULL" => TokenType::NULL,
431            _ => TokenType::ID,
432        };
433
434        Ok(Some(Token::new(token_type, image, start_pos, self.position)))
435    }
436
437    /// Skip whitespace and ignored tokens
438    fn skip_ignored(&mut self) -> ParseResult<()> {
439        while self.position < self.input.len() {
440            let ch = self.current_char();
441
442            // Skip whitespace
443            if ch.is_whitespace() {
444                self.advance();
445                continue;
446            }
447
448            // Skip line comments: -- to end of line
449            if ch == '-' && self.peek(1) == Some('-') {
450                self.advance(); // consume first -
451                self.advance(); // consume second -
452                while self.position < self.input.len() && self.current_char() != '\n' {
453                    self.advance();
454                }
455                continue;
456            }
457
458            // Skip block comments: /* ... */
459            if ch == '/' && self.peek(1) == Some('*') {
460                let start_pos = self.position;
461                self.advance(); // consume /
462                self.advance(); // consume *
463                loop {
464                    if self.position >= self.input.len() {
465                        return Err(ParseError::at_position(
466                            "Unterminated block comment".to_string(),
467                            start_pos,
468                        ));
469                    }
470                    if self.current_char() == '*' && self.peek(1) == Some('/') {
471                        self.advance(); // consume *
472                        self.advance(); // consume /
473                        break;
474                    }
475                    self.advance();
476                }
477                continue;
478            }
479
480            break;
481        }
482        Ok(())
483    }
484
485    /// Get the current character without consuming it
486    fn current_char(&self) -> char {
487        self.input[self.position..].chars().next().unwrap_or('\0')
488    }
489
490    /// Advance to the next character
491    fn advance(&mut self) {
492        if self.position < self.input.len() {
493            let ch = self.current_char();
494            self.position += ch.len_utf8();
495
496            if ch == '\n' {
497                self.current_line += 1;
498                self.current_column = 1;
499            } else {
500                self.current_column += 1;
501            }
502        }
503    }
504
505    /// Peek ahead n characters without consuming
506    fn peek(&self, n: usize) -> Option<char> {
507        self.input[self.position..].chars().nth(n)
508    }
509
510    /// Check if current position matches a string
511    fn matches_string(&self, s: &str) -> bool {
512        self.input[self.position..].starts_with(s)
513    }
514}
515
516impl TokenSource for Lexer {
517    fn get_line_from_offset(&self, offset: usize) -> usize {
518        // Binary search for the line containing this offset
519        match self.line_starts.binary_search(&offset) {
520            Ok(line) => line + 1,
521            Err(line) => line,
522        }
523    }
524
525    fn get_column_from_offset(&self, offset: usize) -> usize {
526        let line_num = self.get_line_from_offset(offset);
527        if line_num == 0 || line_num > self.line_starts.len() {
528            return 1;
529        }
530
531        let line_start = self.line_starts[line_num - 1];
532        offset.saturating_sub(line_start) + 1
533    }
534}