Skip to main content

bytecode_filter/
lib.rs

1//! # bytecode-filter
2//!
3//! A fast bytecode-compiled filter engine for delimiter-separated records.
4//!
5//! Filters are expressed in a small DSL, compiled to bytecode at startup, and
6//! evaluated with **zero allocations** in the hot path.
7//!
8//! ## Features
9//!
10//! - **Zero-copy evaluation**: Records are split into fields without copying
11//! - **SIMD-accelerated string matching**: Uses `memchr` for fast substring search
12//! - **Precompiled regex**: Regex patterns are compiled once at startup
13//! - **Key-value extraction**: Extract and match key-value pairs from record fields
14//! - **Random sampling**: Built-in `rand(N)` for probabilistic filtering
15//!
16//! ## Example
17//!
18//! ```rust
19//! use bytecode_filter::{compile, ParserConfig};
20//! use bytes::Bytes;
21//!
22//! // Define your record schema
23//! let mut config = ParserConfig::default();
24//! config.set_delimiter(",");
25//! config.add_field("LEVEL", 0);
26//! config.add_field("CODE", 1);
27//! config.add_field("BODY", 2);
28//!
29//! // Compile a filter expression
30//! let filter = compile(r#"LEVEL == "error" AND CODE == "500""#, &config).unwrap();
31//!
32//! // Evaluate against records
33//! let record = Bytes::from("error,500,internal failure");
34//! assert!(filter.evaluate(record));
35//!
36//! let record = Bytes::from("info,200,ok");
37//! assert!(!filter.evaluate(record));
38//! ```
39//!
40//! ## Filter Syntax
41//!
42//! ### Basic Operations
43//!
44//! ```text
45//! # Boolean literals
46//! true
47//! false
48//!
49//! # Random sampling (returns true 1/N of the time)
50//! rand(100)    # 1% sample
51//! rand(2)      # 50% sample
52//!
53//! # Payload-wide operations (match against the entire record)
54//! payload contains "error"
55//! payload starts_with "ERROR:"
56//! payload ends_with ".json"
57//! payload == "exact match"
58//! payload matches "error_[0-9]+"
59//! ```
60//!
61//! ### Field Operations
62//!
63//! ```text
64//! # Equality
65//! STATUS == "active"
66//! STATUS != "deleted"
67//!
68//! # Set membership
69//! LEVEL in {"error", "warn", "fatal"}
70//!
71//! # String matching
72//! PATH contains "/api/"
73//! PATH starts_with "GET"
74//! PATH matches "/api/v[0-9]+/.*"
75//!
76//! # Case-insensitive
77//! METHOD icontains "post"
78//! LEVEL iequals "Error"
79//!
80//! # Empty checks
81//! NOTES is_empty
82//! NOTES not_empty
83//! ```
84//!
85//! ### Key-Value Extraction
86//!
87//! For fields that contain key-value data (e.g., HTTP headers, metadata),
88//! you can extract individual values:
89//!
90//! ```text
91//! HEADERS.header("Content-Type") == "application/json"
92//! HEADERS.header("Authorization") contains "Bearer"
93//! HEADERS.header("X-Request-Id") exists
94//! ```
95//!
96//! ### Boolean Logic
97//!
98//! ```text
99//! # AND, OR, NOT
100//! LEVEL == "error" AND CODE == "500"
101//! LEVEL == "warn" OR LEVEL == "error"
102//! NOT LEVEL == "debug"
103//!
104//! # Parentheses for grouping
105//! (LEVEL == "error" OR LEVEL == "warn") AND BODY not_empty
106//! ```
107//!
108//! ## Custom Schemas
109//!
110//! Fields and delimiters are fully configurable via [`ParserConfig`]:
111//!
112//! ```rust
113//! use bytecode_filter::ParserConfig;
114//!
115//! let mut config = ParserConfig::default();
116//! config.set_delimiter("\t");       // tab-separated
117//! config.add_field("HOST", 0);
118//! config.add_field("LEVEL", 1);
119//! config.add_field("MESSAGE", 2);
120//! ```
121//!
122//! Alternatively, use filter files with inline directives:
123//!
124//! ```text
125//! @delimiter = "\t"
126//! @field HOST = 0
127//! @field LEVEL = 1
128//! @field MESSAGE = 2
129//!
130//! LEVEL == "error" AND MESSAGE contains "timeout"
131//! ```
132
133#![warn(missing_docs)]
134#![warn(clippy::all)]
135
136mod compiler;
137mod lexer;
138mod loader;
139mod opcode;
140mod parser;
141mod split;
142mod vm;
143
144pub use compiler::{compile, compile_expr, CompileError};
145pub use lexer::{LexError, Lexer, Token};
146pub use loader::{load_filter_file, load_filter_string, LoadError};
147pub use opcode::Opcode;
148pub use parser::{parse, Expr, ParseError, Parser, ParserConfig};
149pub use split::{extract_header_value, PayloadParts, MAX_PARTS};
150pub use vm::{reset_rand_counter, CompiledFilter};
151
#[cfg(test)]
mod integration_tests {
    use bytes::Bytes;

    use super::*;

    /// Returns the schema shared by every test in this module: six named
    /// fields (LEVEL, CODE, METHOD, PATH, HEADERS, BODY) at indices 0..=5.
    /// The delimiter is not touched, so the config's default is used.
    fn log_config() -> ParserConfig {
        let mut schema = ParserConfig::default();
        for (name, index) in [
            ("LEVEL", 0),
            ("CODE", 1),
            ("METHOD", 2),
            ("PATH", 3),
            ("HEADERS", 4),
            ("BODY", 5),
        ] {
            schema.add_field(name, index);
        }
        schema
    }

    /// Joins `fields` with the `;;;` separator and wraps the result in `Bytes`.
    fn make_record(fields: &[&str]) -> Bytes {
        let joined: String = fields.join(";;;");
        Bytes::from(joined)
    }

    /// Produces a six-field record in which every field is empty except the
    /// `(index, value)` pairs listed in `overrides`.
    ///
    /// Panics if an index is 6 or greater.
    fn make_full_record(overrides: &[(usize, &str)]) -> Bytes {
        let mut slots: [&str; 6] = [""; 6];
        for &(position, text) in overrides {
            slots[position] = text;
        }
        make_record(&slots)
    }

    /// Three AND-ed clauses: two plain field equalities plus a
    /// case-insensitive comparison on an extracted header value.
    #[test]
    fn test_field_equality_and_headers() {
        let schema = log_config();
        let filter = compile(
            r#"
            CODE == "500"
            AND METHOD == "POST"
            AND HEADERS.header("Content-Type") iequals "application/json"
            "#,
            &schema,
        )
        .unwrap();

        // (record, expected verdict, failure message), checked in order.
        let cases = [
            (
                make_full_record(&[
                    (1, "500"),
                    (2, "POST"),
                    (4, "Content-Type: application/json\r\nHost: example.com\r\n"),
                ]),
                true,
                "Should match all three clauses",
            ),
            (
                // Header value differs only in case.
                make_full_record(&[
                    (1, "500"),
                    (2, "POST"),
                    (4, "Content-Type: APPLICATION/JSON\r\n"),
                ]),
                true,
                "Should match case-insensitive",
            ),
            (
                make_full_record(&[
                    (1, "200"),
                    (2, "POST"),
                    (4, "Content-Type: application/json\r\n"),
                ]),
                false,
                "Should not match wrong code",
            ),
            (
                make_full_record(&[
                    (1, "500"),
                    (2, "GET"),
                    (4, "Content-Type: application/json\r\n"),
                ]),
                false,
                "Should not match wrong method",
            ),
            (
                // HEADERS is present but lacks Content-Type.
                make_full_record(&[(1, "500"), (2, "POST"), (4, "Host: example.com\r\n")]),
                false,
                "Should not match missing header",
            ),
        ];

        for (record, expected, message) in cases {
            assert!(filter.evaluate(record) == expected, "{}", message);
        }
    }

    /// Set membership on LEVEL combined with a regex on PATH.
    #[test]
    fn test_url_pattern_matching() {
        let schema = log_config();
        let filter = compile(
            r#"
            LEVEL in {"error", "warn", "fatal"}
            AND PATH matches "(?i).*/(?:admin|internal)/.*"
            "#,
            &schema,
        )
        .unwrap();

        // Every level in the set must pass when paired with an admin URL.
        ["error", "warn", "fatal"].iter().for_each(|&level| {
            let record = make_full_record(&[(0, level), (3, "GET /api/admin/users HTTP/1.1")]);
            assert!(
                filter.evaluate(record),
                "Should match level {} with admin URL",
                level
            );
        });

        // (LEVEL, PATH, expected verdict, failure message), checked in order.
        let cases = [
            (
                "warn",
                "GET /internal/status HTTP/1.1",
                true,
                "Should match internal URL",
            ),
            (
                "debug",
                "GET /admin/users HTTP/1.1",
                false,
                "Should not match debug level",
            ),
            (
                "error",
                "GET /api/users HTTP/1.1",
                false,
                "Should not match public URL",
            ),
        ];

        for (level, path, expected, message) in cases {
            let record = make_full_record(&[(0, level), (3, path)]);
            assert!(filter.evaluate(record) == expected, "{}", message);
        }
    }

    /// Two parenthesised AND groups joined by OR: either group alone is
    /// enough for a match.
    #[test]
    fn test_combined_or() {
        let schema = log_config();
        let filter = compile(
            r#"
            (
                CODE == "500"
                AND METHOD == "POST"
                AND HEADERS.header("Content-Type") iequals "application/json"
            )
            OR
            (
                LEVEL in {"error", "warn", "fatal"}
                AND PATH matches "(?i).*/admin/.*"
            )
            "#,
            &schema,
        )
        .unwrap();

        // Builds a six-field record from the overrides and runs the filter on it.
        let accepts = |overrides: &[(usize, &str)]| filter.evaluate(make_full_record(overrides));

        assert!(
            accepts(&[
                (1, "500"),
                (2, "POST"),
                (4, "Content-Type: application/json\r\n"),
            ]),
            "Should match first branch"
        );
        assert!(
            accepts(&[(0, "error"), (3, "POST /api/admin/submit HTTP/1.1")]),
            "Should match second branch"
        );
        assert!(
            !accepts(&[(0, "info"), (3, "GET /api/users HTTP/1.1")]),
            "Should match neither branch"
        );
    }

    /// With the rand counter reset first, `rand(10)` is expected to accept
    /// exactly 10 of 100 evaluations of a record whose LEVEL clause is true.
    #[test]
    fn test_rand_sampling() {
        vm::reset_rand_counter();

        let schema = log_config();
        let filter = compile(r#"LEVEL == "error" AND rand(10)"#, &schema).unwrap();

        // Only LEVEL is read by the filter; the record carries three fields.
        let record = make_record(&["error", "500", "GET"]);

        let mut matches: usize = 0;
        for _ in 0..100 {
            if filter.evaluate(record.clone()) {
                matches += 1;
            }
        }

        assert!(
            matches == 10,
            "Expected exactly 10 matches with deterministic counter, got {}",
            matches
        );
    }

    /// `is_empty` and `not_empty` must give opposite answers for the same BODY.
    #[test]
    fn test_empty_checks() {
        let schema = log_config();

        let blank_body = make_record(&["error", "500", "GET", "/", "", ""]);
        let filled_body = make_record(&["error", "500", "GET", "/", "", "some body"]);

        let is_empty = compile("BODY is_empty", &schema).unwrap();
        assert!(is_empty.evaluate(blank_body.clone()));
        assert!(!is_empty.evaluate(filled_body.clone()));

        let not_empty = compile("BODY not_empty", &schema).unwrap();
        assert!(!not_empty.evaluate(blank_body));
        assert!(not_empty.evaluate(filled_body));
    }

    /// `icontains` with an upper-case needle must match any casing in PATH.
    #[test]
    fn test_case_insensitive_contains() {
        let schema = log_config();
        let filter = compile(r#"PATH icontains "ADMIN""#, &schema).unwrap();

        // (PATH value, expected verdict), checked in order.
        for (path, expected) in [
            ("GET /admin/users HTTP/1.1", true),
            ("GET /ADMIN/users HTTP/1.1", true),
            ("GET /Admin/users HTTP/1.1", true),
            ("GET /api/users HTTP/1.1", false),
        ] {
            assert_eq!(filter.evaluate(make_full_record(&[(3, path)])), expected);
        }
    }

    /// Checks the introspection accessors exposed by a compiled filter.
    #[test]
    fn test_filter_stats() {
        let schema = log_config();
        let filter = compile(
            r#"LEVEL in {"error", "warn"} AND payload matches "timeout""#,
            &schema,
        )
        .unwrap();

        // Two string literals come from the set: "error" and "warn".
        assert_eq!(filter.string_count(), 2);
        // One regex pattern: "timeout".
        assert_eq!(filter.regex_count(), 1);
        assert!(filter.bytecode_len() > 0);
        // `log_config` never sets a delimiter, so this is the default one.
        assert_eq!(filter.delimiter(), b";;;");
    }
}
375}