jexl_parser/
lexer.rs

1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5use std::fmt;
6
/// Item shape yielded by the lexer: on success a `(start, token, end)` triple
/// carrying the token together with the locations on either side of it,
/// otherwise the error that stopped tokenization.
pub type Spanned<T, L, E> = Result<(L, T, L), E>;
8
/// A single lexical token.
///
/// String-like variants borrow their text from the lexed input, hence the
/// `'input` lifetime.
#[derive(Debug, Clone, PartialEq)]
pub enum Token<'input> {
    // ----- Literals -----
    /// Numeric literal, stored as a 64-bit float.
    Number(f64),
    /// Text of a double-quoted string literal; `Display` re-adds the quotes.
    DoubleQuotedString(&'input str),
    /// Text of a single-quoted string literal; `Display` re-adds the quotes.
    SingleQuotedString(&'input str),
    /// `true` or `false`.
    Boolean(bool),
    /// `null`.
    Null,
    /// Any identifier that is not a keyword.
    Identifier(&'input str),

    // ----- Arithmetic operators -----
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Multiply,
    /// `/`
    Divide,
    /// `//`
    FloorDivide,
    /// `%`
    Modulus,
    /// `^`
    Exponent,

    // ----- Comparison operators -----
    /// `==`
    Equal,
    /// `!=`
    NotEqual,
    /// `>`
    Greater,
    /// `>=`
    GreaterEqual,
    /// `<`
    Less,
    /// `<=`
    LessEqual,
    /// `in`
    In,

    // ----- Logical operators -----
    /// `&&`
    And,
    /// `||`
    Or,

    // ----- Punctuation -----
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `:`
    Colon,
    /// `?`
    Question,
    /// `|`
    Pipe,

    /// Whitespace (usually ignored); displayed as a single space.
    Whitespace,
    // End of input is handled automatically by lalrpop
}

impl<'input> fmt::Display for Token<'input> {
    /// Writes the token's textual form.
    ///
    /// Payload-carrying literals are formatted from their value (strings are
    /// wrapped in their quote character again); every other token has a fixed
    /// spelling that is written verbatim. Width and alignment flags on the
    /// formatter are not applied.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let text = match self {
            // Literals whose text depends on the payload are written directly.
            Token::Number(value) => return write!(f, "{}", value),
            Token::DoubleQuotedString(body) => return write!(f, "\"{}\"", body),
            Token::SingleQuotedString(body) => return write!(f, "'{}'", body),

            Token::Boolean(true) => "true",
            Token::Boolean(false) => "false",
            Token::Null => "null",
            Token::Identifier(name) => *name,

            Token::Plus => "+",
            Token::Minus => "-",
            Token::Multiply => "*",
            Token::Divide => "/",
            Token::FloorDivide => "//",
            Token::Modulus => "%",
            Token::Exponent => "^",

            Token::Equal => "==",
            Token::NotEqual => "!=",
            Token::Greater => ">",
            Token::GreaterEqual => ">=",
            Token::Less => "<",
            Token::LessEqual => "<=",
            Token::In => "in",

            Token::And => "&&",
            Token::Or => "||",

            Token::LeftParen => "(",
            Token::RightParen => ")",
            Token::LeftBracket => "[",
            Token::RightBracket => "]",
            Token::LeftBrace => "{",
            Token::RightBrace => "}",
            Token::Comma => ",",
            Token::Dot => ".",
            Token::Colon => ":",
            Token::Question => "?",
            Token::Pipe => "|",

            Token::Whitespace => " ",
        };
        f.write_str(text)
    }
}
99
/// Hand-written tokenizer over a borrowed input string.
///
/// Tokens are pulled from it through its `Iterator` implementation.
#[derive(Debug, Clone)]
pub struct Lexer<'input> {
    /// The complete text being tokenized.
    input: &'input str,
    /// Offset into `input` of the next character to scan; it is compared
    /// against `input.len()` and used as a slice bound for token text.
    position: usize,
    /// Line number reported in errors; starts at 1.
    line: usize,
    /// Column number reported in errors; starts at 1.
    column: usize,
}
107
/// Error produced when the input cannot be tokenized.
#[derive(Debug, Clone, PartialEq)]
pub struct LexError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// Line the lexer was on when the error was raised.
    pub line: usize,
    /// Column the lexer was on when the error was raised.
    pub column: usize,
}

impl fmt::Display for LexError {
    /// Renders as `Lexical error at line L, column C: MESSAGE`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let LexError {
            message,
            line,
            column,
        } = self;
        write!(
            f,
            "Lexical error at line {}, column {}: {}",
            line, column, message
        )
    }
}

// Marker impl so a `LexError` can be used wherever `std::error::Error` is expected.
impl std::error::Error for LexError {}
126
127impl<'input> Lexer<'input> {
128    pub fn new(input: &'input str) -> Self {
129        Lexer {
130            input,
131            position: 0,
132            line: 1,
133            column: 1,
134        }
135    }
136}
137
138impl<'input> Iterator for Lexer<'input> {
139    type Item = Spanned<Token<'input>, usize, LexError>;
140
141    fn next(&mut self) -> Option<Self::Item> {
142        // Skip whitespace first
143        self.skip_whitespace();
144
145        // Check if we've reached the end after skipping whitespace
146        if self.is_at_end() {
147            return None;
148        }
149
150        let start_pos = self.position;
151        match self.next_token_after_whitespace() {
152            Ok(token) => Some(Ok((start_pos, token, self.position))),
153            Err(error) => Some(Err(error)),
154        }
155    }
156}
157
158impl<'input> Lexer<'input> {
159    fn next_token_after_whitespace(&mut self) -> Result<Token<'input>, LexError> {
160        // Whitespace has already been skipped by the caller
161        let ch = self.current_char();
162
163        match ch {
164            // Single-character tokens
165            '+' => {
166                self.advance();
167                Ok(Token::Plus)
168            }
169            '-' => {
170                self.advance();
171                Ok(Token::Minus)
172            }
173            '*' => {
174                self.advance();
175                Ok(Token::Multiply)
176            }
177            '%' => {
178                self.advance();
179                Ok(Token::Modulus)
180            }
181            '^' => {
182                self.advance();
183                Ok(Token::Exponent)
184            }
185            '(' => {
186                self.advance();
187                Ok(Token::LeftParen)
188            }
189            ')' => {
190                self.advance();
191                Ok(Token::RightParen)
192            }
193            '[' => {
194                self.advance();
195                Ok(Token::LeftBracket)
196            }
197            ']' => {
198                self.advance();
199                Ok(Token::RightBracket)
200            }
201            '{' => {
202                self.advance();
203                Ok(Token::LeftBrace)
204            }
205            '}' => {
206                self.advance();
207                Ok(Token::RightBrace)
208            }
209            ',' => {
210                self.advance();
211                Ok(Token::Comma)
212            }
213            ':' => {
214                self.advance();
215                Ok(Token::Colon)
216            }
217            '?' => {
218                self.advance();
219                Ok(Token::Question)
220            }
221            '|' => {
222                self.advance();
223                if self.current_char() == '|' {
224                    self.advance();
225                    Ok(Token::Or)
226                } else {
227                    Ok(Token::Pipe)
228                }
229            }
230
231            // Multi-character tokens
232            '/' => {
233                self.advance();
234                if self.current_char() == '/' {
235                    self.advance();
236                    Ok(Token::FloorDivide)
237                } else {
238                    Ok(Token::Divide)
239                }
240            }
241
242            '=' => {
243                self.advance();
244                if self.current_char() == '=' {
245                    self.advance();
246                    Ok(Token::Equal)
247                } else {
248                    Err(LexError {
249                        message: "Unexpected character '='. Did you mean '=='?".to_string(),
250                        line: self.line,
251                        column: self.column,
252                    })
253                }
254            }
255
256            '!' => {
257                self.advance();
258                if self.current_char() == '=' {
259                    self.advance();
260                    Ok(Token::NotEqual)
261                } else {
262                    Err(LexError {
263                        message: "Unexpected character '!'. Did you mean '!='?".to_string(),
264                        line: self.line,
265                        column: self.column,
266                    })
267                }
268            }
269
270            '>' => {
271                self.advance();
272                if self.current_char() == '=' {
273                    self.advance();
274                    Ok(Token::GreaterEqual)
275                } else {
276                    Ok(Token::Greater)
277                }
278            }
279
280            '<' => {
281                self.advance();
282                if self.current_char() == '=' {
283                    self.advance();
284                    Ok(Token::LessEqual)
285                } else {
286                    Ok(Token::Less)
287                }
288            }
289
290            '&' => {
291                self.advance();
292                if self.current_char() == '&' {
293                    self.advance();
294                    Ok(Token::And)
295                } else {
296                    Err(LexError {
297                        message: "Unexpected character '&'. Did you mean '&&'?".to_string(),
298                        line: self.line,
299                        column: self.column,
300                    })
301                }
302            }
303
304            // String literals
305            '"' => self.scan_double_quoted_string(),
306            '\'' => self.scan_single_quoted_string(),
307
308            // Numbers
309            c if c.is_ascii_digit() => self.scan_number(),
310
311            // Handle numbers starting with a dot (like .89)
312            '.' => {
313                if self.position + 1 < self.input.len() {
314                    let next_char = self.input.chars().nth(self.position + 1).unwrap_or('\0');
315                    if next_char.is_ascii_digit() {
316                        self.scan_number()
317                    } else {
318                        self.advance();
319                        Ok(Token::Dot)
320                    }
321                } else {
322                    self.advance();
323                    Ok(Token::Dot)
324                }
325            }
326
327            // Identifiers and keywords
328            c if c.is_alphabetic() || c == '_' => self.scan_identifier(),
329
330            _ => Err(LexError {
331                message: format!("Unexpected character '{}'", ch),
332                line: self.line,
333                column: self.column,
334            }),
335        }
336    }
337
338    fn scan_double_quoted_string(&mut self) -> Result<Token<'input>, LexError> {
339        self.advance(); // consume opening quote
340        let start_pos = self.position;
341
342        // Match pattern: ([^"\\]*(\\")?)*
343        while !self.is_at_end() {
344            let ch = self.current_char();
345
346            if ch == '"' {
347                // End of string
348                let end_pos = self.position;
349                self.advance(); // consume closing quote
350                let string_slice = &self.input[start_pos..end_pos];
351                return Ok(Token::DoubleQuotedString(string_slice));
352            } else if ch == '\\' {
353                // Must be followed by "
354                self.advance(); // consume backslash
355                if !self.is_at_end() && self.current_char() == '"' {
356                    self.advance(); // consume escaped quote
357                } else {
358                    return Err(LexError {
359                        message: "Invalid escape sequence in double-quoted string".to_string(),
360                        line: self.line,
361                        column: self.column,
362                    });
363                }
364            } else {
365                self.advance();
366            }
367        }
368
369        Err(LexError {
370            message: "Unterminated string literal".to_string(),
371            line: self.line,
372            column: self.column,
373        })
374    }
375
376    fn scan_single_quoted_string(&mut self) -> Result<Token<'input>, LexError> {
377        self.advance(); // consume opening quote
378        let start_pos = self.position;
379
380        // Match pattern: ([^'\\]*(\\')?)*
381        while !self.is_at_end() {
382            let ch = self.current_char();
383
384            if ch == '\'' {
385                // End of string
386                let end_pos = self.position;
387                self.advance(); // consume closing quote
388                let string_slice = &self.input[start_pos..end_pos];
389                return Ok(Token::SingleQuotedString(string_slice));
390            } else if ch == '\\' {
391                // Must be followed by '
392                self.advance(); // consume backslash
393                if !self.is_at_end() && self.current_char() == '\'' {
394                    self.advance(); // consume escaped quote
395                } else {
396                    return Err(LexError {
397                        message: "Invalid escape sequence in single-quoted string".to_string(),
398                        line: self.line,
399                        column: self.column,
400                    });
401                }
402            } else {
403                self.advance();
404            }
405        }
406
407        Err(LexError {
408            message: "Unterminated string literal".to_string(),
409            line: self.line,
410            column: self.column,
411        })
412    }
413
414    fn scan_number(&mut self) -> Result<Token<'input>, LexError> {
415        let start_pos = self.position;
416
417        // Handle numbers starting with a dot
418        if self.current_char() == '.' {
419            self.advance();
420        }
421
422        // Scan digits (either integer part or fractional part)
423        while !self.is_at_end() && self.current_char().is_ascii_digit() {
424            self.advance();
425        }
426
427        // Check for decimal point (only if we didn't start with one)
428        if !&self.input[start_pos..self.position].starts_with('.')
429            && !self.is_at_end()
430            && self.current_char() == '.'
431        {
432            // Look ahead to see if there's a digit after the dot
433            if self.position + 1 < self.input.len() {
434                let next_char = self.input.chars().nth(self.position + 1).unwrap_or('\0');
435                if next_char.is_ascii_digit() {
436                    self.advance(); // consume dot
437
438                    // Scan fractional part
439                    while !self.is_at_end() && self.current_char().is_ascii_digit() {
440                        self.advance();
441                    }
442                }
443            }
444        }
445
446        let number_str = &self.input[start_pos..self.position];
447        match number_str.parse::<f64>() {
448            Ok(num) => Ok(Token::Number(num)),
449            Err(_) => Err(LexError {
450                message: format!("Invalid number format: {}", number_str),
451                line: self.line,
452                column: self.column,
453            }),
454        }
455    }
456
457    fn scan_identifier(&mut self) -> Result<Token<'input>, LexError> {
458        let start_pos = self.position;
459
460        while !self.is_at_end() {
461            let ch = self.current_char();
462            if ch.is_alphanumeric() || ch == '_' {
463                self.advance();
464            } else {
465                break;
466            }
467        }
468
469        let identifier = &self.input[start_pos..self.position];
470
471        // Check for keywords
472        let token = match identifier {
473            "true" => Token::Boolean(true),
474            "false" => Token::Boolean(false),
475            "null" => Token::Null,
476            "in" => Token::In,
477            _ => Token::Identifier(identifier),
478        };
479
480        Ok(token)
481    }
482
483    fn skip_whitespace(&mut self) {
484        while !self.is_at_end() && self.current_char().is_whitespace() {
485            if self.current_char() == '\n' {
486                self.line += 1;
487                self.column = 1;
488            } else {
489                self.column += 1;
490            }
491            self.advance();
492        }
493    }
494
495    fn current_char(&self) -> char {
496        self.input.chars().nth(self.position).unwrap_or('\0')
497    }
498
499    fn advance(&mut self) {
500        if !self.is_at_end() {
501            self.position += 1;
502            self.column += 1;
503        }
504    }
505
506    fn is_at_end(&self) -> bool {
507        self.position >= self.input.len()
508    }
509}
510
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the lexer over `source` and returns just the tokens, dropping the
    /// span positions. Panics if the lexer reports an error.
    fn lex(source: &str) -> Vec<Token<'_>> {
        Lexer::new(source)
            .map(|item| item.map(|(_, token, _)| token))
            .collect::<Result<Vec<_>, _>>()
            .unwrap()
    }

    #[test]
    fn test_basic_tokens() {
        assert_eq!(
            lex("+ - * / % ^"),
            vec![
                Token::Plus,
                Token::Minus,
                Token::Multiply,
                Token::Divide,
                Token::Modulus,
                Token::Exponent,
            ]
        );
    }

    #[test]
    fn test_numbers() {
        // Integer, decimal, and leading-dot forms.
        assert_eq!(
            lex("123 45.67 .89"),
            vec![
                Token::Number(123.0),
                Token::Number(45.67),
                Token::Number(0.89),
            ]
        );
    }

    #[test]
    fn test_strings() {
        assert_eq!(
            lex(r#""hello" 'world'"#),
            vec![
                Token::DoubleQuotedString("hello"),
                Token::SingleQuotedString("world"),
            ]
        );
    }

    #[test]
    fn test_identifiers_and_keywords() {
        assert_eq!(
            lex("foo true false null in"),
            vec![
                Token::Identifier("foo"),
                Token::Boolean(true),
                Token::Boolean(false),
                Token::Null,
                Token::In,
            ]
        );
    }

    #[test]
    fn test_complex_expression() {
        assert_eq!(
            lex("foo.bar[0] == 'test' && (x > 1)"),
            vec![
                Token::Identifier("foo"),
                Token::Dot,
                Token::Identifier("bar"),
                Token::LeftBracket,
                Token::Number(0.0),
                Token::RightBracket,
                Token::Equal,
                Token::SingleQuotedString("test"),
                Token::And,
                Token::LeftParen,
                Token::Identifier("x"),
                Token::Greater,
                Token::Number(1.0),
                Token::RightParen,
            ]
        );
    }
}
609}