Skip to main content

tess/
filter.rs

1use regex::{Regex, RegexBuilder};
2
3use crate::format::LogFormat;
4
/// The comparison a filter spec applies between a field's captured text and
/// the value written in the spec.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum FilterOp {
    /// `field=value`: the captured text equals the value exactly.
    Eq,
    /// `field!=value`: the captured text differs from the value.
    Ne,
    /// `field~regex`: the regex matches the captured text.
    Re,
    /// `field!~regex`: the regex does not match the captured text.
    NotRe,
    /// `field<value`: strictly less than (compared numerically when both
    /// sides parse as numbers, lexicographically otherwise).
    Lt,
    /// `field<=value`: less than or equal, same comparison rules as `Lt`.
    Le,
    /// `field>value`: strictly greater than, same comparison rules as `Lt`.
    Gt,
    /// `field>=value`: greater than or equal, same comparison rules as `Lt`.
    Ge,
}
25
26/// A parsed filter spec, before being bound to a format.
27#[derive(Debug, Clone)]
28pub struct FilterSpec {
29    pub field: String,
30    pub op: FilterOp,
31    pub value: String,
32}
33
34impl FilterSpec {
35    /// Parse a filter spec like `status=500`, `ip~^10\.`, `status!=200`,
36    /// `agent!~bot`, `status>=500`, `hour<12`. Operator detection scans for
37    /// the longest match first so multi-char operators (`!=`, `!~`, `<=`,
38    /// `>=`) aren't confused with their single-char prefixes.
39    pub fn parse(input: &str) -> Result<Self, String> {
40        for (op, sep) in &[
41            (FilterOp::NotRe, "!~"),
42            (FilterOp::Ne, "!="),
43            (FilterOp::Le, "<="),
44            (FilterOp::Ge, ">="),
45            (FilterOp::Re, "~"),
46            (FilterOp::Eq, "="),
47            (FilterOp::Lt, "<"),
48            (FilterOp::Gt, ">"),
49        ] {
50            if let Some((field, value)) = input.split_once(sep) {
51                if field.is_empty() {
52                    return Err(format!("filter `{input}`: empty field name"));
53                }
54                return Ok(FilterSpec {
55                    field: field.to_string(),
56                    op: op.clone(),
57                    value: value.to_string(),
58                });
59            }
60        }
61        Err(format!(
62            "filter `{input}`: missing operator (expected =, !=, ~, !~, <, <=, >, or >=)"
63        ))
64    }
65}
66
67/// A single compiled predicate: an operator and (for regex ops) the compiled
68/// regex.
69#[derive(Debug)]
70struct CompiledPredicate {
71    field: String,
72    op: FilterOp,
73    /// Used for `Eq` / `Ne` (byte-exact comparison).
74    literal: Option<String>,
75    /// Used for `Re` / `NotRe`.
76    regex: Option<Regex>,
77}
78
79/// A compiled filter bound to a specific format. Evaluating a line runs the
80/// format's regex once and applies all predicates against the resulting
81/// captures. AND semantics: a line matches iff every predicate matches.
82///
83/// `format_regex_record` is a sibling compiled from the same source pattern
84/// with dotall + multi-line flags enabled. Records-mode callers use it so
85/// greedy `.` / `.+` captures span across newlines within a single record
86/// (e.g. `(?P<message>.*)$` captures the entire record body instead of
87/// failing because the original `$` only matches end-of-input).
88#[derive(Debug)]
89pub struct CompiledFilter {
90    pub format_name: String,
91    format_regex: Regex,
92    format_regex_record: Regex,
93    predicates: Vec<CompiledPredicate>,
94}
95
/// Outcome of evaluating a filter against one line or record.
#[derive(Debug, Eq, PartialEq)]
pub enum FilterMatch {
    /// The input parsed and every predicate held.
    Matched,
    /// The input parsed against the format, but one or more predicates
    /// failed.
    NotMatched,
    /// The format regex did not match the input at all.
    NotParsed,
}
105
106impl CompiledFilter {
107    /// Compile the given specs against `format`. Validates that every spec's
108    /// field is one of the format's named captures. The `case_mode` applies
109    /// to the regex operators (`~` / `!~`); literal-comparison operators
110    /// (`=`, `!=`, `<`, `<=`, `>`, `>=`) ignore it.
111    pub fn compile(
112        format: &LogFormat,
113        specs: Vec<FilterSpec>,
114        case_mode: crate::viewport::CaseMode,
115    ) -> Result<Self, String> {
116        let mut predicates = Vec::with_capacity(specs.len());
117        for spec in specs {
118            if !format.field_names.iter().any(|n| n == &spec.field) {
119                return Err(format!(
120                    "filter `{}{:?}{}`: field `{}` is not in format `{}` (available: {})",
121                    spec.field,
122                    spec.op,
123                    spec.value,
124                    spec.field,
125                    format.name,
126                    format.field_names.join(", "),
127                ));
128            }
129            let (literal, regex) = match spec.op {
130                FilterOp::Eq
131                | FilterOp::Ne
132                | FilterOp::Lt
133                | FilterOp::Le
134                | FilterOp::Gt
135                | FilterOp::Ge => (Some(spec.value.clone()), None),
136                FilterOp::Re | FilterOp::NotRe => {
137                    let compiled = case_mode.apply_to_pattern(&spec.value);
138                    let r = Regex::new(&compiled)
139                        .map_err(|e| format!("filter `{}`: invalid regex `{}`: {e}", spec.field, spec.value))?;
140                    (None, Some(r))
141                }
142            };
143            predicates.push(CompiledPredicate {
144                field: spec.field,
145                op: spec.op,
146                literal,
147                regex,
148            });
149        }
150        let format_regex_record = RegexBuilder::new(format.regex.as_str())
151            .dot_matches_new_line(true)
152            .multi_line(true)
153            .build()
154            .map_err(|e| {
155                format!("format `{}`: rebuilding regex for records mode: {e}", format.name)
156            })?;
157
158        Ok(Self {
159            format_name: format.name.clone(),
160            format_regex: format.regex.clone(),
161            format_regex_record,
162            predicates,
163        })
164    }
165
166    /// Evaluate the filter against a single logical line of bytes. Decodes the
167    /// line as UTF-8 with a lossy fallback so non-UTF-8 bytes can still flow
168    /// through (they just won't match string-equal predicates).
169    pub fn evaluate(&self, line: &[u8]) -> FilterMatch {
170        self.evaluate_with(&self.format_regex, line)
171    }
172
173    /// Records-mode evaluation: runs the format regex with dotall + multi-line
174    /// flags enabled against the full multi-line record bytes. Greedy
175    /// captures like `(?P<message>.*)$` consume the whole body of the record,
176    /// so predicates can match content on any continuation line.
177    pub fn evaluate_record(&self, record: &[u8]) -> FilterMatch {
178        self.evaluate_with(&self.format_regex_record, record)
179    }
180
181    fn evaluate_with(&self, regex: &Regex, bytes: &[u8]) -> FilterMatch {
182        let line_str = match std::str::from_utf8(bytes) {
183            Ok(s) => s,
184            Err(_) => return FilterMatch::NotParsed,
185        };
186        let Some(caps) = regex.captures(line_str) else {
187            return FilterMatch::NotParsed;
188        };
189        for p in &self.predicates {
190            let Some(m) = caps.name(&p.field) else {
191                return FilterMatch::NotMatched;
192            };
193            let captured = m.as_str();
194            let ok = match p.op {
195                FilterOp::Eq => p.literal.as_deref() == Some(captured),
196                FilterOp::Ne => p.literal.as_deref() != Some(captured),
197                FilterOp::Re => p.regex.as_ref().is_some_and(|r| r.is_match(captured)),
198                FilterOp::NotRe => p.regex.as_ref().is_some_and(|r| !r.is_match(captured)),
199                FilterOp::Lt | FilterOp::Le | FilterOp::Gt | FilterOp::Ge => {
200                    let rhs = p.literal.as_deref().unwrap_or("");
201                    compare(&p.op, captured, rhs)
202                }
203            };
204            if !ok {
205                return FilterMatch::NotMatched;
206            }
207        }
208        FilterMatch::Matched
209    }
210}
211
212/// Compare `lhs` against `rhs` under the given ordering operator.
213///
214/// Tries numeric comparison first (both sides parse as f64); falls back to
215/// lexicographic byte order. Numeric is intentionally lossy on integer
216/// overflow — log fields are typically small numbers (status codes, sizes,
217/// hours), and f64 covers the practical range.
218fn compare(op: &FilterOp, lhs: &str, rhs: &str) -> bool {
219    let order = match (lhs.parse::<f64>(), rhs.parse::<f64>()) {
220        (Ok(a), Ok(b)) => a.partial_cmp(&b),
221        _ => Some(lhs.cmp(rhs)),
222    };
223    let Some(order) = order else { return false; };
224    use std::cmp::Ordering::{Equal, Greater, Less};
225    matches!(
226        (op, order),
227        (FilterOp::Lt, Less)
228            | (FilterOp::Le, Less | Equal)
229            | (FilterOp::Gt, Greater)
230            | (FilterOp::Ge, Greater | Equal)
231    )
232}
233
#[cfg(test)]
mod tests {
    use super::*;
    use crate::viewport::CaseMode;

    const SAMPLE_200: &[u8] = br#"127.0.0.1 - alice [10/Oct/2023:13:55:36 +0000] "GET /index.html HTTP/1.1" 200 2326 "-" "Mozilla/5.0""#;
    const SAMPLE_500: &[u8] = br#"127.0.0.1 - alice [10/Oct/2023:13:55:36 +0000] "GET /api/data HTTP/1.1" 500 512 "-" "curl/7.0""#;
    const NON_PARSING: &[u8] = b"this line does not match the format at all";

    /// The Apache combined log format used as the fixture for every test.
    fn apache_combined() -> LogFormat {
        LogFormat::compile(
            "apache-combined",
            r#"^(?P<ip>\S+) \S+ (?P<user>\S+) \[(?P<time>[^\]]+)\] "(?P<method>\S+) (?P<url>\S+) (?P<protocol>[^"]+)" (?P<status>\d+) (?P<size>\S+) "(?P<referer>[^"]*)" "(?P<agent>[^"]*)"$"#,
        )
        .unwrap()
    }

    /// Parse `input` and hand back its `(field, op, value)` parts.
    fn parsed(input: &str) -> (String, FilterOp, String) {
        let FilterSpec { field, op, value } = FilterSpec::parse(input).unwrap();
        (field, op, value)
    }

    /// Parse each spec and compile them, case-sensitively, against the
    /// Apache combined format.
    fn filter_for(specs: &[&str]) -> CompiledFilter {
        let specs: Vec<FilterSpec> = specs
            .iter()
            .map(|raw| FilterSpec::parse(raw).unwrap())
            .collect();
        CompiledFilter::compile(&apache_combined(), specs, CaseMode::Sensitive).unwrap()
    }

    // ----- Parsing -----

    #[test]
    fn parse_eq() {
        let (field, op, value) = parsed("status=500");
        assert_eq!(field, "status");
        assert_eq!(op, FilterOp::Eq);
        assert_eq!(value, "500");
    }

    #[test]
    fn parse_ne_before_eq() {
        let (_, op, value) = parsed("status!=200");
        assert_eq!(op, FilterOp::Ne);
        assert_eq!(value, "200");
    }

    #[test]
    fn parse_re() {
        let (_, op, value) = parsed(r"ip~^10\.");
        assert_eq!(op, FilterOp::Re);
        assert_eq!(value, r"^10\.");
    }

    #[test]
    fn parse_not_re_before_re() {
        let (_, op, value) = parsed("agent!~bot");
        assert_eq!(op, FilterOp::NotRe);
        assert_eq!(value, "bot");
    }

    #[test]
    fn parse_rejects_no_operator() {
        let err = FilterSpec::parse("status").unwrap_err();
        assert!(err.contains("missing operator"), "{err}");
    }

    #[test]
    fn parse_rejects_empty_field() {
        let err = FilterSpec::parse("=500").unwrap_err();
        assert!(err.contains("empty field"), "{err}");
    }

    // ----- Compiling and evaluating -----

    #[test]
    fn compile_rejects_unknown_field() {
        let specs = vec![FilterSpec::parse("notafield=x").unwrap()];
        let err =
            CompiledFilter::compile(&apache_combined(), specs, CaseMode::Sensitive).unwrap_err();
        assert!(err.contains("not in format"), "{err}");
    }

    #[test]
    fn evaluate_eq_matches() {
        let f = filter_for(&["status=500"]);
        assert_eq!(f.evaluate(SAMPLE_500), FilterMatch::Matched);
        assert_eq!(f.evaluate(SAMPLE_200), FilterMatch::NotMatched);
    }

    #[test]
    fn evaluate_re_matches_5xx() {
        let f = filter_for(&["status~^5"]);
        assert_eq!(f.evaluate(SAMPLE_500), FilterMatch::Matched);
        assert_eq!(f.evaluate(SAMPLE_200), FilterMatch::NotMatched);
    }

    #[test]
    fn evaluate_ne_excludes_200() {
        let f = filter_for(&["status!=200"]);
        assert_eq!(f.evaluate(SAMPLE_500), FilterMatch::Matched);
        assert_eq!(f.evaluate(SAMPLE_200), FilterMatch::NotMatched);
    }

    #[test]
    fn evaluate_multiple_filters_and() {
        let f = filter_for(&["status~^5", r"url~/api/"]);
        assert_eq!(f.evaluate(SAMPLE_500), FilterMatch::Matched);
        assert_eq!(f.evaluate(SAMPLE_200), FilterMatch::NotMatched);
    }

    #[test]
    fn evaluate_unparseable_line_is_not_parsed() {
        let f = filter_for(&["status=200"]);
        assert_eq!(f.evaluate(NON_PARSING), FilterMatch::NotParsed);
    }

    // ----- Comparison operators -----

    #[test]
    fn parse_le_before_lt() {
        let (_, op, value) = parsed("status<=200");
        assert_eq!(op, FilterOp::Le);
        assert_eq!(value, "200");
    }

    #[test]
    fn parse_ge_before_gt() {
        let (_, op, value) = parsed("status>=500");
        assert_eq!(op, FilterOp::Ge);
        assert_eq!(value, "500");
    }

    #[test]
    fn parse_lt() {
        let (_, op, value) = parsed("size<1000");
        assert_eq!(op, FilterOp::Lt);
        assert_eq!(value, "1000");
    }

    #[test]
    fn parse_gt() {
        let (_, op, value) = parsed("size>0");
        assert_eq!(op, FilterOp::Gt);
        assert_eq!(value, "0");
    }

    #[test]
    fn evaluate_ge_numeric() {
        let f = filter_for(&["status>=500"]);
        assert_eq!(f.evaluate(SAMPLE_500), FilterMatch::Matched);
        assert_eq!(f.evaluate(SAMPLE_200), FilterMatch::NotMatched);
    }

    #[test]
    fn evaluate_lt_numeric() {
        let f = filter_for(&["status<400"]);
        assert_eq!(f.evaluate(SAMPLE_200), FilterMatch::Matched);
        assert_eq!(f.evaluate(SAMPLE_500), FilterMatch::NotMatched);
    }

    #[test]
    fn evaluate_lex_fallback() {
        // A `size` of "-" means "missing" in CLF. It is not a number, so the
        // comparison is lexicographic: "-" vs "100".
        // ASCII: '-' (0x2D) < '0' (0x30), hence "-" < "100".
        assert!(compare(&FilterOp::Lt, "-", "100"));
        assert!(!compare(&FilterOp::Gt, "-", "100"));
    }

    #[test]
    fn evaluate_lex_string_compare() {
        // `level>warn`: neither side is numeric, so both compare as strings.
        assert!(compare(&FilterOp::Gt, "warning", "warn"));
        assert!(!compare(&FilterOp::Gt, "info", "warn"));
        assert!(compare(&FilterOp::Ge, "warn", "warn"));
        assert!(compare(&FilterOp::Le, "warn", "warn"));
    }

    #[test]
    fn parse_rejects_no_op_mentions_new_ops() {
        let err = FilterSpec::parse("status").unwrap_err();
        assert!(err.contains(">=") && err.contains("<="), "{err}");
    }
}