Skip to main content

tess/
filter.rs

1use regex::Regex;
2
3use crate::format::LogFormat;
4
/// The relation a filter spec asserts between a field and its value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FilterOp {
    /// Written `field=value`: the field must equal the value exactly.
    Eq,
    /// Written `field!=value`: the field must differ from the value.
    Ne,
    /// Written `field~regex`: the regex must match the field.
    Re,
    /// Written `field!~regex`: the regex must not match the field.
    NotRe,
    /// Written `field<value`: strictly less than. Like every ordering
    /// operator, compared numerically when both sides parse as `f64` and
    /// lexicographically otherwise.
    Lt,
    /// Written `field<=value`: less than or equal.
    Le,
    /// Written `field>value`: strictly greater than.
    Gt,
    /// Written `field>=value`: greater than or equal.
    Ge,
}
25
26/// A parsed filter spec, before being bound to a format.
27#[derive(Debug, Clone)]
28pub struct FilterSpec {
29    pub field: String,
30    pub op: FilterOp,
31    pub value: String,
32}
33
34impl FilterSpec {
35    /// Parse a filter spec like `status=500`, `ip~^10\.`, `status!=200`,
36    /// `agent!~bot`, `status>=500`, `hour<12`. Operator detection scans for
37    /// the longest match first so multi-char operators (`!=`, `!~`, `<=`,
38    /// `>=`) aren't confused with their single-char prefixes.
39    pub fn parse(input: &str) -> Result<Self, String> {
40        for (op, sep) in &[
41            (FilterOp::NotRe, "!~"),
42            (FilterOp::Ne, "!="),
43            (FilterOp::Le, "<="),
44            (FilterOp::Ge, ">="),
45            (FilterOp::Re, "~"),
46            (FilterOp::Eq, "="),
47            (FilterOp::Lt, "<"),
48            (FilterOp::Gt, ">"),
49        ] {
50            if let Some((field, value)) = input.split_once(sep) {
51                if field.is_empty() {
52                    return Err(format!("filter `{input}`: empty field name"));
53                }
54                return Ok(FilterSpec {
55                    field: field.to_string(),
56                    op: op.clone(),
57                    value: value.to_string(),
58                });
59            }
60        }
61        Err(format!(
62            "filter `{input}`: missing operator (expected =, !=, ~, !~, <, <=, >, or >=)"
63        ))
64    }
65}
66
67/// A single compiled predicate: an operator and (for regex ops) the compiled
68/// regex.
69#[derive(Debug)]
70struct CompiledPredicate {
71    field: String,
72    op: FilterOp,
73    /// Used for `Eq` / `Ne` (byte-exact comparison).
74    literal: Option<String>,
75    /// Used for `Re` / `NotRe`.
76    regex: Option<Regex>,
77}
78
79/// A compiled filter bound to a specific format. Evaluating a line runs the
80/// format's regex once and applies all predicates against the resulting
81/// captures. AND semantics: a line matches iff every predicate matches.
82#[derive(Debug)]
83pub struct CompiledFilter {
84    pub format_name: String,
85    format_regex: Regex,
86    predicates: Vec<CompiledPredicate>,
87}
88
/// Outcome of evaluating a filter against one line.
#[derive(Debug, PartialEq, Eq)]
pub enum FilterMatch {
    /// The line parsed and every predicate held.
    Matched,
    /// The line parsed, but one or more predicates failed.
    NotMatched,
    /// The format's regex did not match the line, so no predicate was checked.
    NotParsed,
}
98
99impl CompiledFilter {
100    /// Compile the given specs against `format`. Validates that every spec's
101    /// field is one of the format's named captures.
102    pub fn compile(format: &LogFormat, specs: Vec<FilterSpec>) -> Result<Self, String> {
103        let mut predicates = Vec::with_capacity(specs.len());
104        for spec in specs {
105            if !format.field_names.iter().any(|n| n == &spec.field) {
106                return Err(format!(
107                    "filter `{}{:?}{}`: field `{}` is not in format `{}` (available: {})",
108                    spec.field,
109                    spec.op,
110                    spec.value,
111                    spec.field,
112                    format.name,
113                    format.field_names.join(", "),
114                ));
115            }
116            let (literal, regex) = match spec.op {
117                FilterOp::Eq
118                | FilterOp::Ne
119                | FilterOp::Lt
120                | FilterOp::Le
121                | FilterOp::Gt
122                | FilterOp::Ge => (Some(spec.value.clone()), None),
123                FilterOp::Re | FilterOp::NotRe => {
124                    let r = Regex::new(&spec.value)
125                        .map_err(|e| format!("filter `{}`: invalid regex `{}`: {e}", spec.field, spec.value))?;
126                    (None, Some(r))
127                }
128            };
129            predicates.push(CompiledPredicate {
130                field: spec.field,
131                op: spec.op,
132                literal,
133                regex,
134            });
135        }
136        Ok(Self {
137            format_name: format.name.clone(),
138            format_regex: format.regex.clone(),
139            predicates,
140        })
141    }
142
143    /// Evaluate the filter against a single logical line of bytes. Decodes the
144    /// line as UTF-8 with a lossy fallback so non-UTF-8 bytes can still flow
145    /// through (they just won't match string-equal predicates).
146    pub fn evaluate(&self, line: &[u8]) -> FilterMatch {
147        let line_str = match std::str::from_utf8(line) {
148            Ok(s) => s,
149            Err(_) => return FilterMatch::NotParsed,
150        };
151        let Some(caps) = self.format_regex.captures(line_str) else {
152            return FilterMatch::NotParsed;
153        };
154        for p in &self.predicates {
155            let Some(m) = caps.name(&p.field) else {
156                return FilterMatch::NotMatched;
157            };
158            let captured = m.as_str();
159            let ok = match p.op {
160                FilterOp::Eq => p.literal.as_deref() == Some(captured),
161                FilterOp::Ne => p.literal.as_deref() != Some(captured),
162                FilterOp::Re => p.regex.as_ref().is_some_and(|r| r.is_match(captured)),
163                FilterOp::NotRe => p.regex.as_ref().is_some_and(|r| !r.is_match(captured)),
164                FilterOp::Lt | FilterOp::Le | FilterOp::Gt | FilterOp::Ge => {
165                    let rhs = p.literal.as_deref().unwrap_or("");
166                    compare(&p.op, captured, rhs)
167                }
168            };
169            if !ok {
170                return FilterMatch::NotMatched;
171            }
172        }
173        FilterMatch::Matched
174    }
175}
176
177/// Compare `lhs` against `rhs` under the given ordering operator.
178///
179/// Tries numeric comparison first (both sides parse as f64); falls back to
180/// lexicographic byte order. Numeric is intentionally lossy on integer
181/// overflow — log fields are typically small numbers (status codes, sizes,
182/// hours), and f64 covers the practical range.
183fn compare(op: &FilterOp, lhs: &str, rhs: &str) -> bool {
184    let order = match (lhs.parse::<f64>(), rhs.parse::<f64>()) {
185        (Ok(a), Ok(b)) => a.partial_cmp(&b),
186        _ => Some(lhs.cmp(rhs)),
187    };
188    let Some(order) = order else { return false; };
189    use std::cmp::Ordering::{Equal, Greater, Less};
190    matches!(
191        (op, order),
192        (FilterOp::Lt, Less)
193            | (FilterOp::Le, Less | Equal)
194            | (FilterOp::Gt, Greater)
195            | (FilterOp::Ge, Greater | Equal)
196    )
197}
198
#[cfg(test)]
mod tests {
    use super::*;

    const SAMPLE_200: &[u8] = br#"127.0.0.1 - alice [10/Oct/2023:13:55:36 +0000] "GET /index.html HTTP/1.1" 200 2326 "-" "Mozilla/5.0""#;
    const SAMPLE_500: &[u8] = br#"127.0.0.1 - alice [10/Oct/2023:13:55:36 +0000] "GET /api/data HTTP/1.1" 500 512 "-" "curl/7.0""#;
    const NON_PARSING: &[u8] = b"this line does not match the format at all";

    /// The Apache combined-log format used as the fixture for evaluation tests.
    fn apache_combined() -> LogFormat {
        let pattern = r#"^(?P<ip>\S+) \S+ (?P<user>\S+) \[(?P<time>[^\]]+)\] "(?P<method>\S+) (?P<url>\S+) (?P<protocol>[^"]+)" (?P<status>\d+) (?P<size>\S+) "(?P<referer>[^"]*)" "(?P<agent>[^"]*)"$"#;
        LogFormat::compile("apache-combined", pattern).unwrap()
    }

    /// Parses every expression and compiles the result against the fixture.
    fn filter_for(exprs: &[&str]) -> CompiledFilter {
        let specs: Vec<FilterSpec> = exprs
            .iter()
            .map(|expr| FilterSpec::parse(expr).unwrap())
            .collect();
        CompiledFilter::compile(&apache_combined(), specs).unwrap()
    }

    /// Asserts the filter accepts `accepted` and rejects `rejected`.
    fn assert_accepts_only(exprs: &[&str], accepted: &[u8], rejected: &[u8]) {
        let filter = filter_for(exprs);
        assert_eq!(filter.evaluate(accepted), FilterMatch::Matched);
        assert_eq!(filter.evaluate(rejected), FilterMatch::NotMatched);
    }

    /// Asserts `expr` parses to the given operator and value.
    fn assert_op_and_value(expr: &str, expected_op: FilterOp, expected_value: &str) {
        let spec = FilterSpec::parse(expr).unwrap();
        assert_eq!(spec.op, expected_op);
        assert_eq!(spec.value, expected_value);
    }

    // ----- Parsing -----

    #[test]
    fn parse_eq() {
        let spec = FilterSpec::parse("status=500").unwrap();
        assert_eq!(spec.field, "status");
        assert_eq!(spec.op, FilterOp::Eq);
        assert_eq!(spec.value, "500");
    }

    #[test]
    fn parse_ne_before_eq() {
        assert_op_and_value("status!=200", FilterOp::Ne, "200");
    }

    #[test]
    fn parse_re() {
        assert_op_and_value(r"ip~^10\.", FilterOp::Re, r"^10\.");
    }

    #[test]
    fn parse_not_re_before_re() {
        assert_op_and_value("agent!~bot", FilterOp::NotRe, "bot");
    }

    #[test]
    fn parse_rejects_no_operator() {
        let message = FilterSpec::parse("status").unwrap_err();
        assert!(message.contains("missing operator"), "{message}");
    }

    #[test]
    fn parse_rejects_empty_field() {
        let message = FilterSpec::parse("=500").unwrap_err();
        assert!(message.contains("empty field"), "{message}");
    }

    // ----- Compilation and evaluation -----

    #[test]
    fn compile_rejects_unknown_field() {
        let unknown = vec![FilterSpec::parse("notafield=x").unwrap()];
        let message = CompiledFilter::compile(&apache_combined(), unknown).unwrap_err();
        assert!(message.contains("not in format"), "{message}");
    }

    #[test]
    fn evaluate_eq_matches() {
        assert_accepts_only(&["status=500"], SAMPLE_500, SAMPLE_200);
    }

    #[test]
    fn evaluate_re_matches_5xx() {
        assert_accepts_only(&["status~^5"], SAMPLE_500, SAMPLE_200);
    }

    #[test]
    fn evaluate_ne_excludes_200() {
        assert_accepts_only(&["status!=200"], SAMPLE_500, SAMPLE_200);
    }

    #[test]
    fn evaluate_multiple_filters_and() {
        assert_accepts_only(&["status~^5", r"url~/api/"], SAMPLE_500, SAMPLE_200);
    }

    #[test]
    fn evaluate_unparseable_line_is_not_parsed() {
        let filter = filter_for(&["status=200"]);
        assert_eq!(filter.evaluate(NON_PARSING), FilterMatch::NotParsed);
    }

    // ----- Comparison operators -----

    #[test]
    fn parse_le_before_lt() {
        assert_op_and_value("status<=200", FilterOp::Le, "200");
    }

    #[test]
    fn parse_ge_before_gt() {
        assert_op_and_value("status>=500", FilterOp::Ge, "500");
    }

    #[test]
    fn parse_lt() {
        assert_op_and_value("size<1000", FilterOp::Lt, "1000");
    }

    #[test]
    fn parse_gt() {
        assert_op_and_value("size>0", FilterOp::Gt, "0");
    }

    #[test]
    fn evaluate_ge_numeric() {
        assert_accepts_only(&["status>=500"], SAMPLE_500, SAMPLE_200);
    }

    #[test]
    fn evaluate_lt_numeric() {
        assert_accepts_only(&["status<400"], SAMPLE_200, SAMPLE_500);
    }

    #[test]
    fn evaluate_lex_fallback() {
        // In CLF a `size` of "-" means the value is missing. It does not parse
        // as a number, so the comparison is lexicographic: '-' (0x2D) sorts
        // before '0' (0x30), hence "-" < "100".
        assert!(compare(&FilterOp::Lt, "-", "100"));
        assert!(!compare(&FilterOp::Gt, "-", "100"));
    }

    #[test]
    fn evaluate_lex_string_compare() {
        // Something like `level>warn`: neither side is numeric.
        assert!(compare(&FilterOp::Gt, "warning", "warn"));
        assert!(!compare(&FilterOp::Gt, "info", "warn"));
        assert!(compare(&FilterOp::Ge, "warn", "warn"));
        assert!(compare(&FilterOp::Le, "warn", "warn"));
    }

    #[test]
    fn parse_rejects_no_op_mentions_new_ops() {
        let message = FilterSpec::parse("status").unwrap_err();
        assert!(message.contains(">=") && message.contains("<="), "{message}");
    }
}