Skip to main content

tess/
filter.rs

1use regex::{Regex, RegexBuilder};
2
3use crate::format::LogFormat;
4
/// The comparison a single filter spec applies between a captured field and
/// the user-supplied value.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum FilterOp {
    /// Exact match, written `field=value`.
    Eq,
    /// Exact non-match, written `field!=value`.
    Ne,
    /// Regex match, written `field~regex`.
    Re,
    /// Regex non-match, written `field!~regex`.
    NotRe,
    /// Less than, written `field<value`. Compared numerically when both
    /// sides parse as f64, lexicographically otherwise.
    Lt,
    /// Less than or equal, written `field<=value`.
    Le,
    /// Greater than, written `field>value`.
    Gt,
    /// Greater than or equal, written `field>=value`.
    Ge,
}
25
26/// A parsed filter spec, before being bound to a format.
27#[derive(Debug, Clone)]
28pub struct FilterSpec {
29    pub field: String,
30    pub op: FilterOp,
31    pub value: String,
32}
33
34impl FilterSpec {
35    /// Parse a filter spec like `status=500`, `ip~^10\.`, `status!=200`,
36    /// `agent!~bot`, `status>=500`, `hour<12`. Operator detection scans for
37    /// the longest match first so multi-char operators (`!=`, `!~`, `<=`,
38    /// `>=`) aren't confused with their single-char prefixes.
39    pub fn parse(input: &str) -> Result<Self, String> {
40        for (op, sep) in &[
41            (FilterOp::NotRe, "!~"),
42            (FilterOp::Ne, "!="),
43            (FilterOp::Le, "<="),
44            (FilterOp::Ge, ">="),
45            (FilterOp::Re, "~"),
46            (FilterOp::Eq, "="),
47            (FilterOp::Lt, "<"),
48            (FilterOp::Gt, ">"),
49        ] {
50            if let Some((field, value)) = input.split_once(sep) {
51                if field.is_empty() {
52                    return Err(format!("filter `{input}`: empty field name"));
53                }
54                return Ok(FilterSpec {
55                    field: field.to_string(),
56                    op: op.clone(),
57                    value: value.to_string(),
58                });
59            }
60        }
61        Err(format!(
62            "filter `{input}`: missing operator (expected =, !=, ~, !~, <, <=, >, or >=)"
63        ))
64    }
65}
66
67/// A single compiled predicate: an operator and (for regex ops) the compiled
68/// regex.
69#[derive(Debug)]
70struct CompiledPredicate {
71    field: String,
72    op: FilterOp,
73    /// Used for `Eq` / `Ne` (byte-exact comparison).
74    literal: Option<String>,
75    /// Used for `Re` / `NotRe`.
76    regex: Option<Regex>,
77}
78
79/// A compiled filter bound to a specific format. Evaluating a line runs the
80/// format's regex once and applies all predicates against the resulting
81/// captures. AND semantics: a line matches iff every predicate matches.
82///
83/// `format_regex_record` is a sibling compiled from the same source pattern
84/// with dotall + multi-line flags enabled. Records-mode callers use it so
85/// greedy `.` / `.+` captures span across newlines within a single record
86/// (e.g. `(?P<message>.*)$` captures the entire record body instead of
87/// failing because the original `$` only matches end-of-input).
88#[derive(Debug)]
89pub struct CompiledFilter {
90    pub format_name: String,
91    format_regex: Regex,
92    format_regex_record: Regex,
93    predicates: Vec<CompiledPredicate>,
94}
95
/// Outcome of evaluating one input against a compiled filter.
#[derive(Debug, Eq, PartialEq)]
pub enum FilterMatch {
    /// The input parsed and every predicate held.
    Matched,
    /// The input parsed against the format, but some predicate did not hold.
    NotMatched,
    /// The format's regex did not match the input at all.
    NotParsed,
}
105
106impl CompiledFilter {
107    /// Compile the given specs against `format`. Validates that every spec's
108    /// field is one of the format's named captures.
109    pub fn compile(format: &LogFormat, specs: Vec<FilterSpec>) -> Result<Self, String> {
110        let mut predicates = Vec::with_capacity(specs.len());
111        for spec in specs {
112            if !format.field_names.iter().any(|n| n == &spec.field) {
113                return Err(format!(
114                    "filter `{}{:?}{}`: field `{}` is not in format `{}` (available: {})",
115                    spec.field,
116                    spec.op,
117                    spec.value,
118                    spec.field,
119                    format.name,
120                    format.field_names.join(", "),
121                ));
122            }
123            let (literal, regex) = match spec.op {
124                FilterOp::Eq
125                | FilterOp::Ne
126                | FilterOp::Lt
127                | FilterOp::Le
128                | FilterOp::Gt
129                | FilterOp::Ge => (Some(spec.value.clone()), None),
130                FilterOp::Re | FilterOp::NotRe => {
131                    let r = Regex::new(&spec.value)
132                        .map_err(|e| format!("filter `{}`: invalid regex `{}`: {e}", spec.field, spec.value))?;
133                    (None, Some(r))
134                }
135            };
136            predicates.push(CompiledPredicate {
137                field: spec.field,
138                op: spec.op,
139                literal,
140                regex,
141            });
142        }
143        let format_regex_record = RegexBuilder::new(format.regex.as_str())
144            .dot_matches_new_line(true)
145            .multi_line(true)
146            .build()
147            .map_err(|e| {
148                format!("format `{}`: rebuilding regex for records mode: {e}", format.name)
149            })?;
150
151        Ok(Self {
152            format_name: format.name.clone(),
153            format_regex: format.regex.clone(),
154            format_regex_record,
155            predicates,
156        })
157    }
158
159    /// Evaluate the filter against a single logical line of bytes. Decodes the
160    /// line as UTF-8 with a lossy fallback so non-UTF-8 bytes can still flow
161    /// through (they just won't match string-equal predicates).
162    pub fn evaluate(&self, line: &[u8]) -> FilterMatch {
163        self.evaluate_with(&self.format_regex, line)
164    }
165
166    /// Records-mode evaluation: runs the format regex with dotall + multi-line
167    /// flags enabled against the full multi-line record bytes. Greedy
168    /// captures like `(?P<message>.*)$` consume the whole body of the record,
169    /// so predicates can match content on any continuation line.
170    pub fn evaluate_record(&self, record: &[u8]) -> FilterMatch {
171        self.evaluate_with(&self.format_regex_record, record)
172    }
173
174    fn evaluate_with(&self, regex: &Regex, bytes: &[u8]) -> FilterMatch {
175        let line_str = match std::str::from_utf8(bytes) {
176            Ok(s) => s,
177            Err(_) => return FilterMatch::NotParsed,
178        };
179        let Some(caps) = regex.captures(line_str) else {
180            return FilterMatch::NotParsed;
181        };
182        for p in &self.predicates {
183            let Some(m) = caps.name(&p.field) else {
184                return FilterMatch::NotMatched;
185            };
186            let captured = m.as_str();
187            let ok = match p.op {
188                FilterOp::Eq => p.literal.as_deref() == Some(captured),
189                FilterOp::Ne => p.literal.as_deref() != Some(captured),
190                FilterOp::Re => p.regex.as_ref().is_some_and(|r| r.is_match(captured)),
191                FilterOp::NotRe => p.regex.as_ref().is_some_and(|r| !r.is_match(captured)),
192                FilterOp::Lt | FilterOp::Le | FilterOp::Gt | FilterOp::Ge => {
193                    let rhs = p.literal.as_deref().unwrap_or("");
194                    compare(&p.op, captured, rhs)
195                }
196            };
197            if !ok {
198                return FilterMatch::NotMatched;
199            }
200        }
201        FilterMatch::Matched
202    }
203}
204
205/// Compare `lhs` against `rhs` under the given ordering operator.
206///
207/// Tries numeric comparison first (both sides parse as f64); falls back to
208/// lexicographic byte order. Numeric is intentionally lossy on integer
209/// overflow — log fields are typically small numbers (status codes, sizes,
210/// hours), and f64 covers the practical range.
211fn compare(op: &FilterOp, lhs: &str, rhs: &str) -> bool {
212    let order = match (lhs.parse::<f64>(), rhs.parse::<f64>()) {
213        (Ok(a), Ok(b)) => a.partial_cmp(&b),
214        _ => Some(lhs.cmp(rhs)),
215    };
216    let Some(order) = order else { return false; };
217    use std::cmp::Ordering::{Equal, Greater, Less};
218    matches!(
219        (op, order),
220        (FilterOp::Lt, Less)
221            | (FilterOp::Le, Less | Equal)
222            | (FilterOp::Gt, Greater)
223            | (FilterOp::Ge, Greater | Equal)
224    )
225}
226
#[cfg(test)]
mod tests {
    use super::*;

    const SAMPLE_200: &[u8] = br#"127.0.0.1 - alice [10/Oct/2023:13:55:36 +0000] "GET /index.html HTTP/1.1" 200 2326 "-" "Mozilla/5.0""#;
    const SAMPLE_500: &[u8] = br#"127.0.0.1 - alice [10/Oct/2023:13:55:36 +0000] "GET /api/data HTTP/1.1" 500 512 "-" "curl/7.0""#;
    const NON_PARSING: &[u8] = b"this line does not match the format at all";

    /// The Apache "combined" access-log format used by every evaluation test.
    fn apache_combined() -> LogFormat {
        let pattern = r#"^(?P<ip>\S+) \S+ (?P<user>\S+) \[(?P<time>[^\]]+)\] "(?P<method>\S+) (?P<url>\S+) (?P<protocol>[^"]+)" (?P<status>\d+) (?P<size>\S+) "(?P<referer>[^"]*)" "(?P<agent>[^"]*)"$"#;
        LogFormat::compile("apache-combined", pattern).unwrap()
    }

    /// Parse a spec that is expected to be valid.
    fn spec(input: &str) -> FilterSpec {
        FilterSpec::parse(input).unwrap()
    }

    /// Compile the given specs against the Apache combined format.
    fn filter_for(inputs: &[&str]) -> CompiledFilter {
        let specs = inputs.iter().map(|raw| spec(raw)).collect();
        CompiledFilter::compile(&apache_combined(), specs).unwrap()
    }

    /// Assert that `input` parses to the given operator and value.
    fn assert_op_and_value(input: &str, op: FilterOp, value: &str) {
        let parsed = spec(input);
        assert_eq!(parsed.op, op);
        assert_eq!(parsed.value, value);
    }

    // ----- Parsing -----

    #[test]
    fn parse_eq() {
        let parsed = spec("status=500");
        assert_eq!(parsed.field, "status");
        assert_eq!(parsed.op, FilterOp::Eq);
        assert_eq!(parsed.value, "500");
    }

    #[test]
    fn parse_ne_before_eq() {
        assert_op_and_value("status!=200", FilterOp::Ne, "200");
    }

    #[test]
    fn parse_re() {
        assert_op_and_value(r"ip~^10\.", FilterOp::Re, r"^10\.");
    }

    #[test]
    fn parse_not_re_before_re() {
        assert_op_and_value("agent!~bot", FilterOp::NotRe, "bot");
    }

    #[test]
    fn parse_rejects_no_operator() {
        let err = FilterSpec::parse("status").unwrap_err();
        assert!(err.contains("missing operator"), "{err}");
    }

    #[test]
    fn parse_rejects_empty_field() {
        let err = FilterSpec::parse("=500").unwrap_err();
        assert!(err.contains("empty field"), "{err}");
    }

    // ----- Compilation and evaluation -----

    #[test]
    fn compile_rejects_unknown_field() {
        let format = apache_combined();
        let err = CompiledFilter::compile(&format, vec![spec("notafield=x")]).unwrap_err();
        assert!(err.contains("not in format"), "{err}");
    }

    #[test]
    fn evaluate_eq_matches() {
        let filter = filter_for(&["status=500"]);
        assert_eq!(filter.evaluate(SAMPLE_500), FilterMatch::Matched);
        assert_eq!(filter.evaluate(SAMPLE_200), FilterMatch::NotMatched);
    }

    #[test]
    fn evaluate_re_matches_5xx() {
        let filter = filter_for(&["status~^5"]);
        assert_eq!(filter.evaluate(SAMPLE_500), FilterMatch::Matched);
        assert_eq!(filter.evaluate(SAMPLE_200), FilterMatch::NotMatched);
    }

    #[test]
    fn evaluate_ne_excludes_200() {
        let filter = filter_for(&["status!=200"]);
        assert_eq!(filter.evaluate(SAMPLE_500), FilterMatch::Matched);
        assert_eq!(filter.evaluate(SAMPLE_200), FilterMatch::NotMatched);
    }

    #[test]
    fn evaluate_multiple_filters_and() {
        let filter = filter_for(&["status~^5", r"url~/api/"]);
        assert_eq!(filter.evaluate(SAMPLE_500), FilterMatch::Matched);
        assert_eq!(filter.evaluate(SAMPLE_200), FilterMatch::NotMatched);
    }

    #[test]
    fn evaluate_unparseable_line_is_not_parsed() {
        let filter = filter_for(&["status=200"]);
        assert_eq!(filter.evaluate(NON_PARSING), FilterMatch::NotParsed);
    }

    // ----- Comparison operators -----

    #[test]
    fn parse_le_before_lt() {
        assert_op_and_value("status<=200", FilterOp::Le, "200");
    }

    #[test]
    fn parse_ge_before_gt() {
        assert_op_and_value("status>=500", FilterOp::Ge, "500");
    }

    #[test]
    fn parse_lt() {
        assert_op_and_value("size<1000", FilterOp::Lt, "1000");
    }

    #[test]
    fn parse_gt() {
        assert_op_and_value("size>0", FilterOp::Gt, "0");
    }

    #[test]
    fn evaluate_ge_numeric() {
        let filter = filter_for(&["status>=500"]);
        assert_eq!(filter.evaluate(SAMPLE_500), FilterMatch::Matched);
        assert_eq!(filter.evaluate(SAMPLE_200), FilterMatch::NotMatched);
    }

    #[test]
    fn evaluate_lt_numeric() {
        let filter = filter_for(&["status<400"]);
        assert_eq!(filter.evaluate(SAMPLE_200), FilterMatch::Matched);
        assert_eq!(filter.evaluate(SAMPLE_500), FilterMatch::NotMatched);
    }

    #[test]
    fn evaluate_lex_fallback() {
        // In CLF a `size` of "-" means the value is missing. It does not
        // parse as a number, so the comparison is lexicographic: '-' (0x2D)
        // sorts before '1' (0x31), hence "-" < "100".
        assert!(compare(&FilterOp::Lt, "-", "100"));
        assert!(!compare(&FilterOp::Gt, "-", "100"));
    }

    #[test]
    fn evaluate_lex_string_compare() {
        // e.g. `level>warn`: neither side is numeric, so both compare as text.
        assert!(compare(&FilterOp::Gt, "warning", "warn"));
        assert!(!compare(&FilterOp::Gt, "info", "warn"));
        assert!(compare(&FilterOp::Ge, "warn", "warn"));
        assert!(compare(&FilterOp::Le, "warn", "warn"));
    }

    #[test]
    fn parse_rejects_no_op_mentions_new_ops() {
        let err = FilterSpec::parse("status").unwrap_err();
        assert!(err.contains(">=") && err.contains("<="), "{err}");
    }
}