Skip to main content

bytecode_filter/
parser.rs

//! Parser for filter expressions.
//!
//! Parses tokens into an AST that can be compiled to bytecode.
4
5use std::collections::HashMap;
6
7use thiserror::Error;
8
9use crate::lexer::{LexError, Lexer, Token};
10
/// Abstract syntax tree of a parsed filter expression.
///
/// Leaf variants describe a single test against the whole payload, against one
/// named part, or against a header looked up inside a part. `And`, `Or` and
/// `Not` combine sub-expressions. Part and header names are stored as the
/// strings that appeared in the source expression.
#[derive(Debug, Clone, PartialEq)]
#[allow(missing_docs)]
pub enum Expr {
    /// Constant `true` / `false`.
    Bool(bool),

    /// Sampling: `rand(N)`, documented as true with probability 1/N.
    Rand(u16),

    /// Whole-payload substring test: `payload contains "string"`.
    Contains(String),

    /// Whole-payload prefix test: `payload starts_with "string"`.
    StartsWith(String),

    /// Whole-payload suffix test: `payload ends_with "string"`.
    EndsWith(String),

    /// Whole-payload equality: `payload == "string"`.
    Equals(String),

    /// Whole-payload regular-expression test: `payload matches "pattern"`.
    Matches(String),

    /// Substring test on one part: `FIELD contains "string"`.
    PartContains { part: String, value: String },

    /// Case-insensitive substring test on one part.
    PartIContains { part: String, value: String },

    /// Prefix test on one part.
    PartStartsWith { part: String, value: String },

    /// Suffix test on one part.
    PartEndsWith { part: String, value: String },

    /// Equality on one part: `FIELD == "string"`.
    PartEquals { part: String, value: String },

    /// Case-insensitive equality on one part.
    PartIEquals { part: String, value: String },

    /// Inequality on one part: `FIELD != "string"`.
    PartNotEquals { part: String, value: String },

    /// Regular-expression test on one part: `FIELD matches "pattern"`.
    PartMatches { part: String, pattern: String },

    /// Membership of one part in a set: `FIELD in {"a", "b", "c"}`.
    PartInSet { part: String, values: Vec<String> },

    /// Emptiness test: `FIELD is_empty`.
    PartIsEmpty { part: String },

    /// Non-emptiness test: `FIELD not_empty`.
    PartNotEmpty { part: String },

    /// Header equality: `FIELD.header("name") == "value"`.
    HeaderEquals {
        part: String,
        header: String,
        value: String,
    },

    /// Case-insensitive header equality.
    HeaderIEquals {
        part: String,
        header: String,
        value: String,
    },

    /// Header substring test.
    HeaderContains {
        part: String,
        header: String,
        value: String,
    },

    /// Header presence: `FIELD.header("name") exists`.
    HeaderExists { part: String, header: String },

    /// Conjunction of two sub-expressions.
    And(Box<Expr>, Box<Expr>),

    /// Disjunction of two sub-expressions.
    Or(Box<Expr>, Box<Expr>),

    /// Negation of a sub-expression.
    Not(Box<Expr>),
}
102
103/// Parser error types.
104#[derive(Debug, Clone, Error, PartialEq)]
105#[allow(missing_docs)]
106pub enum ParseError {
107    #[error("Lexer error: {0}")]
108    Lex(#[from] LexError),
109
110    #[error("Unexpected token: expected {expected}, got {got:?}")]
111    UnexpectedToken { expected: String, got: Token },
112
113    #[error("Unexpected end of input, expected {0}")]
114    UnexpectedEof(String),
115
116    #[error("Unknown field '{0}'. Known fields: {1}")]
117    UnknownField(String, String),
118
119    #[error("Invalid rand() argument: must be > 0")]
120    InvalidRandArg,
121
122    #[error("Expected string literal")]
123    ExpectedString,
124
125    #[error("Expected number")]
126    ExpectedNumber,
127
128    #[error("Invalid regex pattern: {0}")]
129    InvalidRegex(String),
130}
131
/// Schema description handed to the parser.
///
/// A configuration names the fields of a record (each name maps to the
/// positional index of that field) and records the delimiter that separates
/// fields within a record.
///
/// # Example
///
/// ```
/// use bytecode_filter::ParserConfig;
///
/// let mut config = ParserConfig::default();
/// config
///     .set_delimiter(",")
///     .add_field("STATUS", 0)
///     .add_field("CODE", 1)
///     .add_field("BODY", 2);
/// assert_eq!(config.fields["CODE"], 1);
/// ```
#[derive(Debug, Clone)]
pub struct ParserConfig {
    /// Field name -> positional part index.
    pub fields: HashMap<String, u8>,

    /// Byte sequence that separates fields within a record.
    pub delimiter: Vec<u8>,
}
156
157impl Default for ParserConfig {
158    fn default() -> Self {
159        Self {
160            fields: HashMap::new(),
161            delimiter: b";;;".to_vec(),
162        }
163    }
164}
165
166impl ParserConfig {
167    /// Add a field mapping.
168    pub fn add_field(&mut self, name: impl Into<String>, index: u8) -> &mut Self {
169        self.fields.insert(name.into(), index);
170        self
171    }
172
173    /// Set the delimiter.
174    pub fn set_delimiter(&mut self, delimiter: impl Into<Vec<u8>>) -> &mut Self {
175        self.delimiter = delimiter.into();
176        self
177    }
178}
179
180/// Parser for filter expressions.
181pub struct Parser<'a> {
182    tokens: Vec<Token>,
183    pos: usize,
184    #[allow(dead_code)]
185    config: &'a ParserConfig,
186}
187
188impl<'a> Parser<'a> {
189    /// Create a new parser from input string.
190    ///
191    /// # Errors
192    /// Returns `ParseError` if tokenization fails.
193    pub fn new(input: &str, config: &'a ParserConfig) -> Result<Self, ParseError> {
194        let tokens = Lexer::new(input).tokenize()?;
195        Ok(Self {
196            tokens,
197            pos: 0,
198            config,
199        })
200    }
201
202    /// Parse the expression.
203    ///
204    /// # Errors
205    /// Returns `ParseError` if the expression syntax is invalid.
206    pub fn parse(&mut self) -> Result<Expr, ParseError> {
207        let expr = self.parse_or()?;
208
209        if self.peek() != &Token::Eof {
210            return Err(ParseError::UnexpectedToken {
211                expected: "end of input".into(),
212                got: self.peek().clone(),
213            });
214        }
215
216        Ok(expr)
217    }
218
219    fn peek(&self) -> &Token {
220        self.tokens.get(self.pos).unwrap_or(&Token::Eof)
221    }
222
223    fn advance(&mut self) -> &Token {
224        let token = self.tokens.get(self.pos).unwrap_or(&Token::Eof);
225        self.pos += 1;
226        token
227    }
228
229    fn expect(&mut self, expected: &Token) -> Result<(), ParseError> {
230        let got = self.advance().clone();
231        if &got == expected {
232            Ok(())
233        } else {
234            Err(ParseError::UnexpectedToken {
235                expected: format!("{:?}", expected),
236                got,
237            })
238        }
239    }
240
241    // Grammar:
242    // expr     -> or_expr
243    // or_expr  -> and_expr (OR and_expr)*
244    // and_expr -> not_expr (AND not_expr)*
245    // not_expr -> NOT not_expr | primary
246    // primary  -> '(' expr ')' | rand | field_expr | true | false
247
248    fn parse_or(&mut self) -> Result<Expr, ParseError> {
249        let mut left = self.parse_and()?;
250
251        while matches!(self.peek(), Token::Or) {
252            self.advance();
253            let right = self.parse_and()?;
254            left = Expr::Or(Box::new(left), Box::new(right));
255        }
256
257        Ok(left)
258    }
259
260    fn parse_and(&mut self) -> Result<Expr, ParseError> {
261        let mut left = self.parse_not()?;
262
263        while matches!(self.peek(), Token::And) {
264            self.advance();
265            let right = self.parse_not()?;
266            left = Expr::And(Box::new(left), Box::new(right));
267        }
268
269        Ok(left)
270    }
271
272    fn parse_not(&mut self) -> Result<Expr, ParseError> {
273        if matches!(self.peek(), Token::Not) {
274            self.advance();
275            let inner = self.parse_not()?;
276            Ok(Expr::Not(Box::new(inner)))
277        } else {
278            self.parse_primary()
279        }
280    }
281
282    fn parse_primary(&mut self) -> Result<Expr, ParseError> {
283        match self.peek().clone() {
284            Token::LParen => {
285                self.advance();
286                let expr = self.parse_or()?;
287                self.expect(&Token::RParen)?;
288                Ok(expr)
289            }
290
291            Token::Rand => self.parse_rand(),
292
293            Token::Ident(name) => {
294                self.advance();
295
296                // Check for boolean literals
297                if name == "true" {
298                    return Ok(Expr::Bool(true));
299                }
300                if name == "false" {
301                    return Ok(Expr::Bool(false));
302                }
303
304                // Check for "payload" keyword (full payload operations)
305                if name.to_lowercase() == "payload" {
306                    return self.parse_payload_op();
307                }
308
309                // Otherwise it's a field reference
310                self.parse_field_op(name)
311            }
312
313            token => Err(ParseError::UnexpectedToken {
314                expected: "expression".into(),
315                got: token,
316            }),
317        }
318    }
319
320    fn parse_rand(&mut self) -> Result<Expr, ParseError> {
321        self.advance(); // consume 'rand'
322        self.expect(&Token::LParen)?;
323
324        let n = match self.advance().clone() {
325            Token::Number(n) => n,
326            got => {
327                return Err(ParseError::UnexpectedToken {
328                    expected: "number".into(),
329                    got,
330                });
331            }
332        };
333
334        if n == 0 || n > u16::MAX as u64 {
335            return Err(ParseError::InvalidRandArg);
336        }
337
338        self.expect(&Token::RParen)?;
339        Ok(Expr::Rand(n as u16))
340    }
341
342    fn parse_payload_op(&mut self) -> Result<Expr, ParseError> {
343        match self.peek().clone() {
344            Token::Contains => {
345                self.advance();
346                let value = self.expect_string()?;
347                Ok(Expr::Contains(value))
348            }
349            Token::StartsWith => {
350                self.advance();
351                let value = self.expect_string()?;
352                Ok(Expr::StartsWith(value))
353            }
354            Token::EndsWith => {
355                self.advance();
356                let value = self.expect_string()?;
357                Ok(Expr::EndsWith(value))
358            }
359            Token::Matches => {
360                self.advance();
361                let pattern = self.expect_regex_or_string()?;
362                Ok(Expr::Matches(pattern))
363            }
364            Token::Eq => {
365                self.advance();
366                let value = self.expect_string()?;
367                Ok(Expr::Equals(value))
368            }
369            got => Err(ParseError::UnexpectedToken {
370                expected: "contains, starts_with, ends_with, matches, or ==".into(),
371                got,
372            }),
373        }
374    }
375
376    fn parse_field_op(&mut self, field_name: String) -> Result<Expr, ParseError> {
377        // Check for header extraction: FIELD.header("name")
378        if matches!(self.peek(), Token::Dot) {
379            self.advance();
380            if !matches!(self.peek(), Token::Header) {
381                return Err(ParseError::UnexpectedToken {
382                    expected: "header".into(),
383                    got: self.peek().clone(),
384                });
385            }
386            self.advance();
387            self.expect(&Token::LParen)?;
388            let header_name = self.expect_string()?;
389            self.expect(&Token::RParen)?;
390
391            return self.parse_header_op(field_name, header_name);
392        }
393
394        // Regular field operations
395        match self.peek().clone() {
396            Token::Contains => {
397                self.advance();
398                let value = self.expect_string()?;
399                Ok(Expr::PartContains {
400                    part: field_name,
401                    value,
402                })
403            }
404            Token::IContains => {
405                self.advance();
406                let value = self.expect_string()?;
407                Ok(Expr::PartIContains {
408                    part: field_name,
409                    value,
410                })
411            }
412            Token::StartsWith => {
413                self.advance();
414                let value = self.expect_string()?;
415                Ok(Expr::PartStartsWith {
416                    part: field_name,
417                    value,
418                })
419            }
420            Token::EndsWith => {
421                self.advance();
422                let value = self.expect_string()?;
423                Ok(Expr::PartEndsWith {
424                    part: field_name,
425                    value,
426                })
427            }
428            Token::Matches => {
429                self.advance();
430                let pattern = self.expect_regex_or_string()?;
431                Ok(Expr::PartMatches {
432                    part: field_name,
433                    pattern,
434                })
435            }
436            Token::Eq => {
437                self.advance();
438                let value = self.expect_string()?;
439                Ok(Expr::PartEquals {
440                    part: field_name,
441                    value,
442                })
443            }
444            Token::Ne => {
445                self.advance();
446                let value = self.expect_string()?;
447                Ok(Expr::PartNotEquals {
448                    part: field_name,
449                    value,
450                })
451            }
452            Token::IEquals => {
453                self.advance();
454                let value = self.expect_string()?;
455                Ok(Expr::PartIEquals {
456                    part: field_name,
457                    value,
458                })
459            }
460            Token::In => {
461                self.advance();
462                let values = self.parse_string_set()?;
463                Ok(Expr::PartInSet {
464                    part: field_name,
465                    values,
466                })
467            }
468            Token::IsEmpty => {
469                self.advance();
470                Ok(Expr::PartIsEmpty { part: field_name })
471            }
472            Token::NotEmpty => {
473                self.advance();
474                Ok(Expr::PartNotEmpty { part: field_name })
475            }
476            got => Err(ParseError::UnexpectedToken {
477                expected:
478                    "contains, starts_with, ends_with, matches, ==, !=, in, is_empty, or not_empty"
479                        .into(),
480                got,
481            }),
482        }
483    }
484
485    fn parse_header_op(&mut self, part: String, header: String) -> Result<Expr, ParseError> {
486        match self.peek().clone() {
487            Token::Eq => {
488                self.advance();
489                let value = self.expect_string()?;
490                Ok(Expr::HeaderEquals {
491                    part,
492                    header,
493                    value,
494                })
495            }
496            Token::IEquals => {
497                self.advance();
498                let value = self.expect_string()?;
499                Ok(Expr::HeaderIEquals {
500                    part,
501                    header,
502                    value,
503                })
504            }
505            Token::Contains => {
506                self.advance();
507                let value = self.expect_string()?;
508                Ok(Expr::HeaderContains {
509                    part,
510                    header,
511                    value,
512                })
513            }
514            // "exists" as an identifier
515            Token::Ident(ref s) if s.to_lowercase() == "exists" => {
516                self.advance();
517                Ok(Expr::HeaderExists { part, header })
518            }
519            got => Err(ParseError::UnexpectedToken {
520                expected: "==, iequals, contains, or exists".into(),
521                got,
522            }),
523        }
524    }
525
526    fn parse_string_set(&mut self) -> Result<Vec<String>, ParseError> {
527        self.expect(&Token::LBrace)?;
528
529        let mut values = Vec::new();
530
531        // Handle empty set
532        if matches!(self.peek(), Token::RBrace) {
533            self.advance();
534            return Ok(values);
535        }
536
537        // First value
538        values.push(self.expect_string()?);
539
540        // Additional values
541        while matches!(self.peek(), Token::Comma) {
542            self.advance();
543            // Allow trailing comma
544            if matches!(self.peek(), Token::RBrace) {
545                break;
546            }
547            values.push(self.expect_string()?);
548        }
549
550        self.expect(&Token::RBrace)?;
551        Ok(values)
552    }
553
554    fn expect_string(&mut self) -> Result<String, ParseError> {
555        match self.advance().clone() {
556            Token::String(s) => Ok(s),
557            got => Err(ParseError::UnexpectedToken {
558                expected: "string".into(),
559                got,
560            }),
561        }
562    }
563
564    fn expect_regex_or_string(&mut self) -> Result<String, ParseError> {
565        match self.advance().clone() {
566            Token::String(s) | Token::Regex(s) => Ok(s),
567            got => Err(ParseError::UnexpectedToken {
568                expected: "string or regex".into(),
569                got,
570            }),
571        }
572    }
573}
574
575/// Parse a filter expression string.
576///
577/// # Errors
578/// Returns `ParseError` if the expression is invalid.
579pub fn parse(input: &str, config: &ParserConfig) -> Result<Expr, ParseError> {
580    Parser::new(input, config)?.parse()
581}
582
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the six-field schema shared by every test in this module.
    fn test_config() -> ParserConfig {
        let mut schema = ParserConfig::default();
        schema
            .add_field("LEVEL", 0)
            .add_field("CODE", 1)
            .add_field("METHOD", 2)
            .add_field("PATH", 3)
            .add_field("HEADERS", 4)
            .add_field("BODY", 5);
        schema
    }

    /// Parses `input` against the schema from `test_config`.
    fn parse_expr(input: &str) -> Result<Expr, ParseError> {
        let schema = test_config();
        parse(input, &schema)
    }

    #[test]
    fn test_bool_literals() {
        let parsed_true = parse_expr("true").unwrap();
        let parsed_false = parse_expr("false").unwrap();
        assert_eq!(parsed_true, Expr::Bool(true));
        assert_eq!(parsed_false, Expr::Bool(false));
    }

    #[test]
    fn test_rand() {
        let sampled = parse_expr("rand(100)").unwrap();
        assert_eq!(sampled, Expr::Rand(100));

        let always = parse_expr("rand(1)").unwrap();
        assert_eq!(always, Expr::Rand(1));
    }

    #[test]
    fn test_payload_contains() {
        let parsed = parse_expr(r#"payload contains "error""#).unwrap();
        assert_eq!(parsed, Expr::Contains("error".into()));
    }

    #[test]
    fn test_payload_matches() {
        let parsed = parse_expr(r#"payload matches "error_[0-9]+""#).unwrap();
        assert_eq!(parsed, Expr::Matches("error_[0-9]+".into()));
    }

    #[test]
    fn test_field_equals() {
        let wanted = Expr::PartEquals {
            part: "LEVEL".into(),
            value: "error".into(),
        };
        assert_eq!(parse_expr(r#"LEVEL == "error""#).unwrap(), wanted);
    }

    #[test]
    fn test_field_in_set() {
        let wanted = Expr::PartInSet {
            part: "LEVEL".into(),
            values: vec!["error".into(), "warn".into(), "fatal".into()],
        };
        assert_eq!(
            parse_expr(r#"LEVEL in {"error", "warn", "fatal"}"#).unwrap(),
            wanted
        );
    }

    #[test]
    fn test_header_iequals() {
        let wanted = Expr::HeaderIEquals {
            part: "HEADERS".into(),
            header: "x-custom".into(),
            value: "value".into(),
        };
        assert_eq!(
            parse_expr(r#"HEADERS.header("x-custom") iequals "value""#).unwrap(),
            wanted
        );
    }

    #[test]
    fn test_and() {
        let parsed = parse_expr(r#"LEVEL == "error" AND CODE == "500""#).unwrap();
        let is_and = matches!(parsed, Expr::And(_, _));
        assert!(is_and);
    }

    #[test]
    fn test_or() {
        let parsed = parse_expr(r#"LEVEL == "error" OR LEVEL == "warn""#).unwrap();
        let is_or = matches!(parsed, Expr::Or(_, _));
        assert!(is_or);
    }

    #[test]
    fn test_not() {
        let parsed = parse_expr(r#"NOT LEVEL == "debug""#).unwrap();
        let is_not = matches!(parsed, Expr::Not(_));
        assert!(is_not);
    }

    #[test]
    fn test_parentheses() {
        let parsed =
            parse_expr(r#"(LEVEL == "error" OR LEVEL == "warn") AND BODY not_empty"#).unwrap();
        // The parenthesized OR must end up as the left operand of the AND.
        if let Expr::And(lhs, _) = parsed {
            assert!(matches!(*lhs, Expr::Or(_, _)));
        } else {
            panic!("Expected And expression");
        }
    }

    #[test]
    fn test_complex_filter() {
        let source = r#"
            CODE == "500"
            AND METHOD == "POST"
            AND HEADERS.header("Content-Type") iequals "application/json"
        "#;
        let parsed = parse_expr(source).unwrap();

        // AND is left-associative: ((CODE AND METHOD) AND header-test).
        if let Expr::And(lhs, rhs) = parsed {
            assert!(matches!(*lhs, Expr::And(_, _)));
            assert!(matches!(*rhs, Expr::HeaderIEquals { .. }));
        } else {
            panic!("Expected And expression");
        }
    }

    #[test]
    fn test_field_is_empty() {
        let wanted = Expr::PartIsEmpty {
            part: "BODY".into(),
        };
        assert_eq!(parse_expr("BODY is_empty").unwrap(), wanted);
    }

    #[test]
    fn test_field_not_empty() {
        let wanted = Expr::PartNotEmpty {
            part: "BODY".into(),
        };
        assert_eq!(parse_expr("BODY not_empty").unwrap(), wanted);
    }

    #[test]
    fn test_combined_with_rand() {
        let parsed = parse_expr(r#"LEVEL == "error" AND rand(100)"#).unwrap();
        if let Expr::And(lhs, rhs) = parsed {
            assert!(matches!(*lhs, Expr::PartEquals { .. }));
            assert!(matches!(*rhs, Expr::Rand(100)));
        } else {
            panic!("Expected And expression");
        }
    }
}