// bytecode_filter/lexer.rs
1//! Lexer/tokenizer for filter expressions.
2//!
3//! Converts a filter string into a stream of tokens.
4
5use thiserror::Error;
6
/// Token types for filter expressions.
#[derive(Debug, Clone, PartialEq)]
#[allow(missing_docs)]
pub enum Token {
    // Identifiers and literals
    Ident(String),  // Field names like MESSAGE_TYPE, payload (also "true"/"false")
    String(String), // Quoted strings: "value" or 'value', with escapes processed
    Number(u64),    // Unsigned decimal numbers: 123
    Regex(String),  // Regex patterns: r"pattern" or /pattern/ (flags after / are dropped)

    // Keywords (matched case-insensitively by the lexer)
    And,        // AND, &&, & (single & also accepted)
    Or,         // OR, ||, | (single | also accepted)
    Not,        // NOT, !
    In,         // in
    Contains,   // contains
    IContains,  // icontains
    StartsWith, // starts_with, startswith
    EndsWith,   // ends_with, endswith
    Matches,    // matches
    IsEmpty,    // is_empty, isempty, empty
    NotEmpty,   // not_empty, notempty
    Header,     // header (for header extraction)
    IEquals,    // iequals, ieq
    Rand,       // rand, random

    // Operators
    Eq,     // == (a single = is also accepted)
    Ne,     // !=
    Dot,    // .
    LParen, // (
    RParen, // )
    LBrace, // {
    RBrace, // }
    Comma,  // ,

    // End of input
    Eof,
}
46
/// Lexer error types.
///
/// All positions are byte offsets into the original input string
/// (as produced by `str::char_indices`).
#[derive(Debug, Clone, Error, PartialEq)]
#[allow(missing_docs)]
pub enum LexError {
    #[error("Unexpected character '{0}' at position {1}")]
    UnexpectedChar(char, usize),

    #[error("Unterminated string starting at position {0}")]
    UnterminatedString(usize),

    #[error("Unterminated regex starting at position {0}")]
    UnterminatedRegex(usize),

    #[error("Invalid escape sequence at position {0}")]
    InvalidEscape(usize),

    // Also returned when a digit sequence overflows `u64`.
    #[error("Invalid number at position {0}")]
    InvalidNumber(usize),
}
66
/// Lexer for filter expressions.
pub struct Lexer<'a> {
    /// Full input; kept so `peek_char` can slice ahead from `position`.
    input: &'a str,
    /// Peekable stream of (byte offset, char) pairs over `input`.
    chars: std::iter::Peekable<std::str::CharIndices<'a>>,
    /// Byte offset of the start of the token currently being read
    /// (used for lookahead and error reporting).
    position: usize,
}
73
74impl<'a> Lexer<'a> {
75    /// Create a new lexer for the given input.
76    pub fn new(input: &'a str) -> Self {
77        Self {
78            input,
79            chars: input.char_indices().peekable(),
80            position: 0,
81        }
82    }
83
84    /// Get the next token.
85    ///
86    /// # Errors
87    /// Returns `LexError` if an unexpected character is encountered.
88    pub fn next_token(&mut self) -> Result<Token, LexError> {
89        self.skip_whitespace();
90
91        let Some(&(pos, ch)) = self.chars.peek() else {
92            return Ok(Token::Eof);
93        };
94
95        self.position = pos;
96
97        match ch {
98            // Single-character tokens
99            '(' => {
100                self.chars.next();
101                Ok(Token::LParen)
102            }
103            ')' => {
104                self.chars.next();
105                Ok(Token::RParen)
106            }
107            '{' => {
108                self.chars.next();
109                Ok(Token::LBrace)
110            }
111            '}' => {
112                self.chars.next();
113                Ok(Token::RBrace)
114            }
115            ',' => {
116                self.chars.next();
117                Ok(Token::Comma)
118            }
119            '.' => {
120                self.chars.next();
121                Ok(Token::Dot)
122            }
123
124            // Operators
125            '=' => {
126                self.chars.next();
127                if self.chars.peek().map(|&(_, c)| c) == Some('=') {
128                    self.chars.next();
129                    Ok(Token::Eq)
130                } else {
131                    Ok(Token::Eq) // Single = also means ==
132                }
133            }
134            '!' => {
135                self.chars.next();
136                if self.chars.peek().map(|&(_, c)| c) == Some('=') {
137                    self.chars.next();
138                    Ok(Token::Ne)
139                } else {
140                    Ok(Token::Not)
141                }
142            }
143            '&' => {
144                self.chars.next();
145                if self.chars.peek().map(|&(_, c)| c) == Some('&') {
146                    self.chars.next();
147                }
148                Ok(Token::And)
149            }
150            '|' => {
151                self.chars.next();
152                if self.chars.peek().map(|&(_, c)| c) == Some('|') {
153                    self.chars.next();
154                }
155                Ok(Token::Or)
156            }
157
158            // Strings
159            '"' | '\'' => self.read_string(ch),
160
161            // Regex with r"..." or /.../ syntax
162            'r' if self.peek_char(1) == Some('"') => self.read_regex_r(),
163            '/' => self.read_regex_slash(),
164
165            // Numbers
166            '0'..='9' => self.read_number(),
167
168            // Identifiers and keywords
169            'a'..='z' | 'A'..='Z' | '_' => self.read_ident(),
170
171            _ => Err(LexError::UnexpectedChar(ch, pos)),
172        }
173    }
174
175    /// Tokenize the entire input.
176    ///
177    /// # Errors
178    /// Returns `LexError` if tokenization fails.
179    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexError> {
180        let mut tokens = Vec::new();
181        loop {
182            let token = self.next_token()?;
183            if token == Token::Eof {
184                break;
185            }
186            tokens.push(token);
187        }
188        Ok(tokens)
189    }
190
191    fn skip_whitespace(&mut self) {
192        while let Some(&(_, ch)) = self.chars.peek() {
193            if ch.is_whitespace() {
194                self.chars.next();
195            } else if ch == '#' {
196                // Skip end-of-line comment: # ... until newline or EOF
197                while let Some(&(_, c)) = self.chars.peek() {
198                    if c == '\n' {
199                        self.chars.next();
200                        break;
201                    }
202                    self.chars.next();
203                }
204            } else {
205                break;
206            }
207        }
208    }
209
210    fn peek_char(&self, offset: usize) -> Option<char> {
211        self.input[self.position..].chars().nth(offset)
212    }
213
214    fn read_string(&mut self, quote: char) -> Result<Token, LexError> {
215        let start = self.position;
216        self.chars.next(); // consume opening quote
217
218        let mut value = String::new();
219
220        loop {
221            match self.chars.next() {
222                Some((_, ch)) if ch == quote => {
223                    return Ok(Token::String(value));
224                }
225                Some((pos, '\\')) => {
226                    // Escape sequence
227                    match self.chars.next() {
228                        Some((_, 'n')) => value.push('\n'),
229                        Some((_, 'r')) => value.push('\r'),
230                        Some((_, 't')) => value.push('\t'),
231                        Some((_, '\\')) => value.push('\\'),
232                        Some((_, c)) if c == quote => value.push(c),
233                        Some((_, '"')) => value.push('"'),
234                        Some((_, '\'')) => value.push('\''),
235                        _ => return Err(LexError::InvalidEscape(pos)),
236                    }
237                }
238                Some((_, ch)) => value.push(ch),
239                None => return Err(LexError::UnterminatedString(start)),
240            }
241        }
242    }
243
244    fn read_regex_r(&mut self) -> Result<Token, LexError> {
245        let start = self.position;
246        self.chars.next(); // consume 'r'
247        self.chars.next(); // consume '"'
248
249        let mut pattern = String::new();
250
251        loop {
252            match self.chars.next() {
253                Some((_, '"')) => {
254                    return Ok(Token::Regex(pattern));
255                }
256                Some((_, '\\')) => {
257                    // In raw regex, backslash is literal
258                    pattern.push('\\');
259                    if let Some((_, ch)) = self.chars.next() {
260                        pattern.push(ch);
261                    }
262                }
263                Some((_, ch)) => pattern.push(ch),
264                None => return Err(LexError::UnterminatedRegex(start)),
265            }
266        }
267    }
268
269    fn read_regex_slash(&mut self) -> Result<Token, LexError> {
270        let start = self.position;
271        self.chars.next(); // consume '/'
272
273        let mut pattern = String::new();
274
275        loop {
276            match self.chars.next() {
277                Some((_, '/')) => {
278                    // Check for flags (e.g., /pattern/i)
279                    while let Some(&(_, ch)) = self.chars.peek() {
280                        if ch.is_ascii_alphabetic() {
281                            self.chars.next();
282                            // For now, ignore flags - regex crate handles (?i) inline
283                        } else {
284                            break;
285                        }
286                    }
287                    return Ok(Token::Regex(pattern));
288                }
289                Some((_, '\\')) => {
290                    pattern.push('\\');
291                    if let Some((_, ch)) = self.chars.next() {
292                        pattern.push(ch);
293                    }
294                }
295                Some((_, ch)) => pattern.push(ch),
296                None => return Err(LexError::UnterminatedRegex(start)),
297            }
298        }
299    }
300
301    fn read_number(&mut self) -> Result<Token, LexError> {
302        let start = self.position;
303        let mut num_str = String::new();
304
305        while let Some(&(_, ch)) = self.chars.peek() {
306            if ch.is_ascii_digit() {
307                num_str.push(ch);
308                self.chars.next();
309            } else {
310                break;
311            }
312        }
313
314        num_str
315            .parse::<u64>()
316            .map(Token::Number)
317            .map_err(|_| LexError::InvalidNumber(start))
318    }
319
320    fn read_ident(&mut self) -> Result<Token, LexError> {
321        let mut ident = String::new();
322
323        while let Some(&(_, ch)) = self.chars.peek() {
324            if ch.is_ascii_alphanumeric() || ch == '_' {
325                ident.push(ch);
326                self.chars.next();
327            } else {
328                break;
329            }
330        }
331
332        // Check for keywords (case-insensitive)
333        let token = match ident.to_ascii_lowercase().as_str() {
334            "and" => Token::And,
335            "or" => Token::Or,
336            "not" => Token::Not,
337            "in" => Token::In,
338            "contains" => Token::Contains,
339            "icontains" => Token::IContains,
340            "starts_with" | "startswith" => Token::StartsWith,
341            "ends_with" | "endswith" => Token::EndsWith,
342            "matches" => Token::Matches,
343            "is_empty" | "isempty" | "empty" => Token::IsEmpty,
344            "not_empty" | "notempty" => Token::NotEmpty,
345            "header" => Token::Header,
346            "iequals" | "ieq" => Token::IEquals,
347            "rand" | "random" => Token::Rand,
348            "true" => return Ok(Token::Ident("true".into())),
349            "false" => return Ok(Token::Ident("false".into())),
350            _ => Token::Ident(ident),
351        };
352
353        Ok(token)
354    }
355}
356
#[cfg(test)]
mod tests {
    use super::*;

    fn tokenize(input: &str) -> Result<Vec<Token>, LexError> {
        Lexer::new(input).tokenize()
    }

    #[test]
    fn test_simple_tokens() {
        assert_eq!(
            tokenize("(){},.").unwrap(),
            vec![
                Token::LParen,
                Token::RParen,
                Token::LBrace,
                Token::RBrace,
                Token::Comma,
                Token::Dot,
            ]
        );
    }

    #[test]
    fn test_operators() {
        assert_eq!(
            tokenize("== != && ||").unwrap(),
            vec![Token::Eq, Token::Ne, Token::And, Token::Or,]
        );
    }

    #[test]
    fn test_single_char_operator_aliases() {
        // Single `=` means equality; bare `&`/`|` mean AND/OR; bare `!` is NOT.
        assert_eq!(
            tokenize("a = 1 & b | ! c").unwrap(),
            vec![
                Token::Ident("a".into()),
                Token::Eq,
                Token::Number(1),
                Token::And,
                Token::Ident("b".into()),
                Token::Or,
                Token::Not,
                Token::Ident("c".into()),
            ]
        );
    }

    #[test]
    fn test_strings() {
        assert_eq!(
            tokenize(r#""hello" 'world'"#).unwrap(),
            vec![Token::String("hello".into()), Token::String("world".into()),]
        );
    }

    #[test]
    fn test_string_escapes() {
        assert_eq!(
            tokenize(r#""hello\nworld""#).unwrap(),
            vec![Token::String("hello\nworld".into()),]
        );
    }

    #[test]
    fn test_invalid_escape() {
        assert!(matches!(
            tokenize(r#""bad\q""#),
            Err(LexError::InvalidEscape(_))
        ));
    }

    #[test]
    fn test_numbers() {
        assert_eq!(
            tokenize("123 456").unwrap(),
            vec![Token::Number(123), Token::Number(456),]
        );
    }

    #[test]
    fn test_number_overflow() {
        // 20 nines exceeds u64::MAX, so the lexer reports InvalidNumber.
        assert!(matches!(
            tokenize("99999999999999999999"),
            Err(LexError::InvalidNumber(0))
        ));
    }

    #[test]
    fn test_keywords() {
        assert_eq!(
            tokenize("AND OR NOT contains matches").unwrap(),
            vec![
                Token::And,
                Token::Or,
                Token::Not,
                Token::Contains,
                Token::Matches,
            ]
        );
    }

    #[test]
    fn test_case_insensitive_keywords() {
        assert_eq!(
            tokenize("and AND And").unwrap(),
            vec![Token::And, Token::And, Token::And,]
        );
    }

    #[test]
    fn test_boolean_literals_normalized() {
        // `true`/`false` are case-normalized to lowercase Idents.
        assert_eq!(
            tokenize("TRUE False").unwrap(),
            vec![Token::Ident("true".into()), Token::Ident("false".into()),]
        );
    }

    #[test]
    fn test_identifiers() {
        assert_eq!(
            tokenize("MESSAGE_TYPE field1").unwrap(),
            vec![
                Token::Ident("MESSAGE_TYPE".into()),
                Token::Ident("field1".into()),
            ]
        );
    }

    #[test]
    fn test_regex_r_syntax() {
        assert_eq!(
            tokenize(r#"r"hello.*world""#).unwrap(),
            vec![Token::Regex("hello.*world".into()),]
        );
    }

    #[test]
    fn test_regex_slash_syntax() {
        assert_eq!(
            tokenize(r#"/hello.*world/"#).unwrap(),
            vec![Token::Regex("hello.*world".into()),]
        );
    }

    #[test]
    fn test_complex_expression() {
        let input = r#"MESSAGE_TYPE == "2" AND payload contains "error""#;
        assert_eq!(
            tokenize(input).unwrap(),
            vec![
                Token::Ident("MESSAGE_TYPE".into()),
                Token::Eq,
                Token::String("2".into()),
                Token::And,
                Token::Ident("payload".into()),
                Token::Contains,
                Token::String("error".into()),
            ]
        );
    }

    #[test]
    fn test_rand() {
        assert_eq!(
            tokenize("rand(100)").unwrap(),
            vec![
                Token::Rand,
                Token::LParen,
                Token::Number(100),
                Token::RParen,
            ]
        );
    }

    #[test]
    fn test_in_set() {
        assert_eq!(
            tokenize(r#"field in {"a", "b", "c"}"#).unwrap(),
            vec![
                Token::Ident("field".into()),
                Token::In,
                Token::LBrace,
                Token::String("a".into()),
                Token::Comma,
                Token::String("b".into()),
                Token::Comma,
                Token::String("c".into()),
                Token::RBrace,
            ]
        );
    }

    #[test]
    fn test_header_syntax() {
        assert_eq!(
            tokenize(r#"field.header("X-Custom") iequals "value""#).unwrap(),
            vec![
                Token::Ident("field".into()),
                Token::Dot,
                Token::Header,
                Token::LParen,
                Token::String("X-Custom".into()),
                Token::RParen,
                Token::IEquals,
                Token::String("value".into()),
            ]
        );
    }

    #[test]
    fn test_end_of_line_comments() {
        // Comment at end of line
        let input = "MESSAGE_TYPE == \"2\" # check type\nAND MESSAGE_SUB_TYPE == \"11\" # CUSTOM PROBE";
        assert_eq!(
            tokenize(input).unwrap(),
            vec![
                Token::Ident("MESSAGE_TYPE".into()),
                Token::Eq,
                Token::String("2".into()),
                Token::And,
                Token::Ident("MESSAGE_SUB_TYPE".into()),
                Token::Eq,
                Token::String("11".into()),
            ]
        );

        // Comment at end of input (no trailing newline)
        assert_eq!(
            tokenize("true # done").unwrap(),
            vec![Token::Ident("true".into()),]
        );

        // Only a comment
        assert_eq!(tokenize("# nothing here").unwrap(), vec![]);
    }

    #[test]
    fn test_unexpected_char() {
        assert!(matches!(
            tokenize("@"),
            Err(LexError::UnexpectedChar('@', 0))
        ));
    }

    #[test]
    fn test_unterminated_string() {
        assert!(matches!(
            tokenize(r#""hello"#),
            Err(LexError::UnterminatedString(_))
        ));
    }

    #[test]
    fn test_unterminated_regex() {
        assert!(matches!(
            tokenize(r#"/hello"#),
            Err(LexError::UnterminatedRegex(_))
        ));
    }
}