Skip to main content

anomalyx_normalize/parsers/
cef.rs

1//! CEF and LEEF parsers — ArcSight / QRadar SIEM event formats.
2//!
3//! Both are pipe-delimited headers followed by a `key=value` extension. One row
4//! per event; the header's category fields (`signatureId`/`name` for CEF,
5//! `eventId` for LEEF) and `severity` are exactly what `dist.chi2` reads as a
6//! signature/category mix shift, and a value never seen in the baseline surfaces
7//! as a new category automatically.
8//!
9//! - **CEF** (`CEF:Version|Vendor|Product|Version|SignatureID|Name|Severity|ext`):
10//!   7 header fields (with `\|` / `\\` escaping) then a space-separated extension
11//!   whose values may contain spaces — split at ` key=` boundaries, with
12//!   `\=` / `\\` / `\n` value escaping.
13//! - **LEEF** (`LEEF:Version|Vendor|Product|Version|EventID|[Delimiter|]ext`):
14//!   5 header fields; LEEF 2.0 adds an explicit delimiter field (a char or `xHH`
15//!   hex), LEEF 1.0 uses a tab. The extension is plain `key=value` pairs.
16
17use crate::infer;
18use crate::parser::{Confidence, FormatParser, STRONG};
19use crate::table::TableBuilder;
20use ax_core::{AxError, Column, Value};
21use std::collections::BTreeMap;
22
/// Reports whether `c` is an ASCII letter or digit — the only characters
/// accepted inside a CEF extension key by this parser.
fn is_ident(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9')
}
26
/// Expands the SIEM backslash escapes in `s`.
///
/// `\n`, `\r` and `\t` become the matching whitespace character; a backslash
/// in front of any other character (e.g. `\|`, `\=`, `\\`) yields that
/// character itself. A lone backslash at the very end is kept as-is.
fn unescape(s: &str) -> String {
    let mut decoded = String::new();
    // True while the previous character was an (unconsumed) backslash.
    let mut escaped = false;
    for ch in s.chars() {
        if escaped {
            decoded.push(match ch {
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                literal => literal,
            });
            escaped = false;
        } else if ch == '\\' {
            escaped = true;
        } else {
            decoded.push(ch);
        }
    }
    if escaped {
        // Dangling backslash with nothing to escape: emit it literally.
        decoded.push('\\');
    }
    decoded
}
47
/// Splits `s` on every `|` that is not backslash-escaped, producing at most
/// `max` fields; once the limit is reached the last field absorbs any
/// remaining pipes verbatim.
///
/// Escape pairs (a backslash plus the character after it) are copied through
/// unchanged so the caller can decode them later with [`unescape`].
///
/// Returns an empty vector when `max` is 0 (mirroring `str::splitn(0, ..)`);
/// otherwise always returns at least one field, even for an empty `s`.
fn split_unescaped_pipe(s: &str, max: usize) -> Vec<String> {
    if max == 0 {
        // "At most zero fields" — and avoids any `max - 1` underflow.
        return Vec::new();
    }
    let mut fields = Vec::new();
    let mut cur = String::new();
    let mut chars = s.chars();
    while let Some(c) = chars.next() {
        match c {
            '\\' => {
                // Keep the escape pair intact; the escaped char can never split.
                cur.push('\\');
                if let Some(next) = chars.next() {
                    cur.push(next);
                }
            }
            // Only split while there is still room for another field after this one.
            '|' if fields.len() + 1 < max => fields.push(std::mem::take(&mut cur)),
            _ => cur.push(c),
        }
    }
    fields.push(cur);
    fields
}
69
70/// Parses a CEF extension (`key=value key2=value2 ...`) where a value may itself
71/// contain spaces — a new key begins only at a space-preceded `ident=`. Values
72/// are unescaped.
73fn parse_cef_extension(ext: &str) -> Vec<(String, String)> {
74    let chars: Vec<char> = ext.chars().collect();
75    let n = chars.len();
76    // Locate each key: an identifier run, ending in `=`, at start or after a space.
77    let mut keys: Vec<(usize, usize)> = Vec::new(); // (key_start, eq_index)
78    let mut i = 0;
79    while i < n {
80        if (i == 0 || chars[i - 1] == ' ') && is_ident(chars[i]) {
81            let mut j = i;
82            while j < n && is_ident(chars[j]) {
83                j += 1;
84            }
85            if j < n && chars[j] == '=' {
86                keys.push((i, j));
87                i = j + 1;
88                continue;
89            }
90        }
91        i += 1;
92    }
93    let mut pairs = Vec::new();
94    for (idx, &(key_start, eq)) in keys.iter().enumerate() {
95        let key: String = chars[key_start..eq].iter().collect();
96        let value_end = keys.get(idx + 1).map_or(n, |&(next_start, _)| next_start);
97        let raw: String = chars[eq + 1..value_end].iter().collect();
98        pairs.push((key, unescape(raw.trim_end())));
99    }
100    pairs
101}
102
/// Resolves a LEEF 2.0 delimiter spec to the character that separates the
/// extension's `key=value` pairs.
///
/// Accepted forms:
/// - a hexadecimal code point prefixed with `0x`, `x` or `\x` (e.g. `0x09`,
///   `x5E`, `\x09`); values wider than one byte such as `x20AC` are supported;
/// - otherwise the first character of `spec` is taken literally.
///
/// An empty spec, or a hex form that does not decode to a valid character with
/// no literal first character to fall back on, yields tab (the LEEF 1.0
/// separator).
fn leef_delimiter(spec: &str) -> char {
    // `0x` must be tried before the bare `x` prefix, which it would not match anyway,
    // but keeping the longest form first makes the intent explicit.
    let hex_digits = ["0x", "\\x", "x"]
        .iter()
        .find_map(|&prefix| spec.strip_prefix(prefix));
    hex_digits
        .and_then(|hex| u32::from_str_radix(hex, 16).ok())
        .and_then(char::from_u32)
        // Not a usable hex form: treat the spec as a literal delimiter character.
        .unwrap_or_else(|| spec.chars().next().unwrap_or('\t'))
}
113
114// ----------------------------------------------------------------- CEF --------
115
/// Parser for CEF (ArcSight) event lines. Carries no state.
#[derive(Clone, Debug, Default)]
pub struct CefParser;
118
/// Column names assigned to the seven pipe-delimited CEF header fields, in the
/// order they appear on the wire.
const CEF_HEADER: [&str; 7] = [
    "cefVersion",
    "deviceVendor",
    "deviceProduct",
    "deviceVersion",
    "signatureId",
    "name",
    "severity",
];
128
129impl CefParser {
130    fn err(&self, msg: impl std::fmt::Display) -> AxError {
131        AxError::Parse {
132            format: self.id().to_string(),
133            message: msg.to_string(),
134        }
135    }
136}
137
138impl FormatParser for CefParser {
139    fn id(&self) -> &'static str {
140        "cef"
141    }
142    fn extensions(&self) -> &'static [&'static str] {
143        &["cef"]
144    }
145    fn sniff(&self, bytes: &[u8]) -> Option<Confidence> {
146        let text = std::str::from_utf8(bytes).ok()?;
147        let line = text.lines().find(|l| !l.trim().is_empty())?;
148        line.starts_with("CEF:").then_some(STRONG)
149    }
150    fn parse(&self, _source: &str, bytes: &[u8]) -> Result<Vec<Column>, AxError> {
151        let text = std::str::from_utf8(bytes).map_err(|e| self.err(e))?;
152        let mut builder = TableBuilder::new();
153        for line in text.lines() {
154            if line.trim().is_empty() {
155                continue;
156            }
157            let rest = line
158                .strip_prefix("CEF:")
159                .ok_or_else(|| self.err("not a CEF line: missing 'CEF:' prefix"))?;
160            let fields = split_unescaped_pipe(rest, 8);
161            if fields.len() < CEF_HEADER.len() {
162                return Err(self.err("CEF header requires 7 pipe-delimited fields"));
163            }
164            let mut row: BTreeMap<String, Value> = BTreeMap::new();
165            for (name, raw) in CEF_HEADER.iter().zip(&fields) {
166                let decoded = unescape(raw);
167                // Severity is the analyzable numeric (or a named level); the rest
168                // are categorical identifiers kept verbatim.
169                let cell = if *name == "severity" {
170                    infer::infer_scalar(&decoded)
171                } else {
172                    Value::Str(decoded)
173                };
174                row.insert((*name).to_string(), cell);
175            }
176            if let Some(ext) = fields.get(CEF_HEADER.len()) {
177                for (key, value) in parse_cef_extension(ext) {
178                    row.insert(key, infer::infer_scalar(&value));
179                }
180            }
181            builder.push_row(row);
182        }
183        Ok(builder.finish())
184    }
185}
186
187// ---------------------------------------------------------------- LEEF --------
188
/// Parser for LEEF (QRadar) event lines. Carries no state.
#[derive(Clone, Debug, Default)]
pub struct LeefParser;
191
/// Column names assigned to the five fixed pipe-delimited LEEF header fields,
/// in the order they appear on the wire.
const LEEF_HEADER: [&str; 5] = [
    "leefVersion",
    "vendor",
    "product",
    "productVersion",
    "eventId",
];
199
200impl LeefParser {
201    fn err(&self, msg: impl std::fmt::Display) -> AxError {
202        AxError::Parse {
203            format: self.id().to_string(),
204            message: msg.to_string(),
205        }
206    }
207}
208
209impl FormatParser for LeefParser {
210    fn id(&self) -> &'static str {
211        "leef"
212    }
213    fn extensions(&self) -> &'static [&'static str] {
214        &["leef"]
215    }
216    fn sniff(&self, bytes: &[u8]) -> Option<Confidence> {
217        let text = std::str::from_utf8(bytes).ok()?;
218        let line = text.lines().find(|l| !l.trim().is_empty())?;
219        line.starts_with("LEEF:").then_some(STRONG)
220    }
221    fn parse(&self, _source: &str, bytes: &[u8]) -> Result<Vec<Column>, AxError> {
222        let text = std::str::from_utf8(bytes).map_err(|e| self.err(e))?;
223        let mut builder = TableBuilder::new();
224        for line in text.lines() {
225            if line.trim().is_empty() {
226                continue;
227            }
228            let rest = line
229                .strip_prefix("LEEF:")
230                .ok_or_else(|| self.err("not a LEEF line: missing 'LEEF:' prefix"))?;
231            // LEEF 2.0 inserts a delimiter field between the header and extension.
232            let version = rest.split('|').next().unwrap_or("");
233            let is_v2 = version.starts_with('2');
234            let header_count = LEEF_HEADER.len() + usize::from(is_v2);
235            let parts: Vec<&str> = rest.splitn(header_count + 1, '|').collect();
236            if parts.len() < LEEF_HEADER.len() {
237                return Err(self.err("LEEF header requires at least 5 fields"));
238            }
239            let mut row: BTreeMap<String, Value> = BTreeMap::new();
240            for (name, value) in LEEF_HEADER.iter().zip(&parts) {
241                row.insert((*name).to_string(), Value::Str((*value).to_string()));
242            }
243            let delimiter = if is_v2 {
244                parts
245                    .get(LEEF_HEADER.len())
246                    .map_or('\t', |s| leef_delimiter(s))
247            } else {
248                '\t'
249            };
250            if let Some(ext) = parts.get(header_count) {
251                for token in ext.split(delimiter) {
252                    if let Some((key, value)) = token.split_once('=') {
253                        if !key.is_empty() {
254                            row.insert(key.to_string(), infer::infer_scalar(value));
255                        }
256                    }
257                }
258            }
259            builder.push_row(row);
260        }
261        Ok(builder.finish())
262    }
263}
264
#[cfg(test)]
mod tests {
    use super::*;
    use ax_core::ColType;

    /// Finds a column by name, panicking with the name when it is absent.
    fn col<'a>(cols: &'a [Column], name: &str) -> &'a Column {
        match cols.iter().find(|c| c.name == name) {
            Some(found) => found,
            None => panic!("missing column {name}"),
        }
    }

    /// Shorthand for the cell at `row` of the column called `name`.
    fn cell<'a>(cols: &'a [Column], name: &str, row: usize) -> &'a Value {
        &col(cols, name).cells[row]
    }

    /// Shorthand for a string-valued cell.
    fn text(s: &str) -> Value {
        Value::Str(s.to_string())
    }

    /// Turns borrowed `(key, value)` pairs into the owned form the parser returns.
    fn owned(pairs: &[(&str, &str)]) -> Vec<(String, String)> {
        pairs
            .iter()
            .map(|&(k, v)| (k.to_string(), v.to_string()))
            .collect()
    }

    // -------------------------------------------------------- helpers --------

    #[test]
    fn unescape_decodes_siem_escapes() {
        let cases = [
            (r"a\|b", "a|b"),
            (r"a\=b", "a=b"),
            (r"a\\b", r"a\b"),
            (r"a\nb", "a\nb"),
            ("plain", "plain"),
        ];
        for (input, expected) in cases {
            assert_eq!(unescape(input), expected);
        }
    }

    #[test]
    fn split_unescaped_pipe_keeps_escaped_and_extra() {
        // Nine segments with a limit of 8: the escaped pipe stays inside the
        // first field and the eighth field swallows the surplus ("i|j").
        let fields = split_unescaped_pipe(r"a\|b|c|d|e|f|g|h|i|j", 8);
        assert_eq!(fields.len(), 8);
        assert_eq!(fields[0], r"a\|b", "escaped pipe is not a separator");
        assert_eq!(fields[7], "i|j", "extension field absorbs extra pipes");
    }

    #[test]
    fn parse_cef_extension_handles_spaces_and_escapes() {
        let parsed =
            parse_cef_extension(r"src=10.0.0.1 msg=worm was stopped spt=1232 note=a\=b");
        let expected = owned(&[
            ("src", "10.0.0.1"),
            ("msg", "worm was stopped"), // value with spaces
            ("spt", "1232"),
            ("note", "a=b"), // escaped '='
        ]);
        assert_eq!(parsed, expected);
    }

    #[test]
    fn parse_cef_extension_only_breaks_at_space_preceded_keys() {
        // An `=` with no space before its key belongs to the current value,
        // and one-character keys must still advance the scanner correctly.
        assert_eq!(parse_cef_extension("k=ab=cd"), owned(&[("k", "ab=cd")]));
        assert_eq!(
            parse_cef_extension("a=1 b=2"),
            owned(&[("a", "1"), ("b", "2")])
        );
    }

    #[test]
    fn leef_delimiter_resolves_char_hex_and_default() {
        let cases = [
            ("^", '^'),
            ("x09", '\t'),
            (r"\x09", '\t'),
            ("", '\t'), // empty spec falls back to tab
        ];
        for (spec, expected) in cases {
            assert_eq!(leef_delimiter(spec), expected);
        }
    }

    // ----------------------------------------------------------- CEF --------

    const CEF: &str = "CEF:0|Security|threatmanager|1.0|100|worm stopped|10|src=10.0.0.1 spt=1232 msg=took action\n\
                       CEF:0|Security|threatmanager|1.0|200|port scan|3|src=10.0.0.9 dst=2.1.2.2\n";

    /// Parses `s` as CEF, panicking on any parse error.
    fn cef(s: &str) -> Vec<Column> {
        CefParser.parse("-", s.as_bytes()).unwrap()
    }

    #[test]
    fn cef_header_fields() {
        let cols = cef(CEF);
        assert_eq!(cell(&cols, "deviceProduct", 0), &text("threatmanager"));
        assert_eq!(cell(&cols, "signatureId", 0), &text("100"));
        assert_eq!(cell(&cols, "name", 1), &text("port scan"));
        let severity = col(&cols, "severity");
        assert_eq!(
            severity.ty,
            ColType::Int,
            "severity is the analyzable numeric"
        );
        assert_eq!(severity.cells, vec![Value::Int(10), Value::Int(3)]);
    }

    #[test]
    fn cef_extension_fields_typed_and_padded() {
        let cols = cef(CEF);
        assert_eq!(cell(&cols, "src", 0), &text("10.0.0.1"));
        assert_eq!(cell(&cols, "spt", 0), &Value::Int(1232)); // port is typed as int
        assert_eq!(cell(&cols, "msg", 0), &text("took action"));
        // `dst` exists only on the second event, `spt`/`msg` only on the first;
        // the missing cells are padded with Null.
        assert_eq!(cell(&cols, "dst", 0), &Value::Null);
        assert_eq!(cell(&cols, "spt", 1), &Value::Null);
    }

    #[test]
    fn cef_escaped_pipe_in_header() {
        let cols = cef(r"CEF:0|Sec\|ops|prod|1|1|n|5|");
        assert_eq!(cell(&cols, "deviceVendor", 0), &text("Sec|ops"));
    }

    #[test]
    fn cef_without_extension() {
        // Seven header fields and nothing after them.
        let cols = cef("CEF:0|v|p|1.0|42|evt|7\n");
        assert_eq!(cell(&cols, "signatureId", 0), &text("42"));
        assert_eq!(cell(&cols, "severity", 0), &Value::Int(7));
    }

    #[test]
    fn cef_malformed_too_few_fields_errors() {
        let bad_inputs: [&[u8]; 2] = [b"CEF:0|only|three\n", b"not a cef line\n"];
        for input in bad_inputs {
            assert!(matches!(
                CefParser.parse("-", input),
                Err(AxError::Parse { .. })
            ));
        }
    }

    #[test]
    fn cef_sniff_and_resolution() {
        assert_eq!(CefParser.sniff(CEF.as_bytes()), Some(STRONG));
        assert_eq!(CefParser.sniff(b"LEEF:1.0|v|p|1|x|"), None);
        assert_eq!(CefParser.sniff(b"a,b,c\n1,2,3"), None);
        assert_eq!(CefParser.extensions(), &["cef"]);
        let registry = crate::parser::ParserRegistry::default();
        assert_eq!(registry.resolve("e.cef", b"x").unwrap().id(), "cef");
        assert_eq!(registry.resolve("-", CEF.as_bytes()).unwrap().id(), "cef");
    }

    // ---------------------------------------------------------- LEEF --------

    /// Parses `s` as LEEF, panicking on any parse error.
    fn leef(s: &str) -> Vec<Column> {
        LeefParser.parse("-", s.as_bytes()).unwrap()
    }

    #[test]
    fn leef_v1_tab_extension() {
        let cols = leef(
            "LEEF:1.0|Lancope|StealthWatch|1.0|41|src=192.0.2.0\tdst=172.50.123.1\tsev=5\n",
        );
        assert_eq!(cell(&cols, "leefVersion", 0), &text("1.0"));
        assert_eq!(cell(&cols, "vendor", 0), &text("Lancope"));
        assert_eq!(cell(&cols, "eventId", 0), &text("41"));
        assert_eq!(cell(&cols, "src", 0), &text("192.0.2.0"));
        assert_eq!(cell(&cols, "sev", 0), &Value::Int(5));
    }

    #[test]
    fn leef_header_only_no_extension() {
        // A bare five-field header is valid LEEF 1.0; this pins the `< 5`
        // boundary so a header with exactly five fields is not rejected.
        let cols = leef("LEEF:1.0|Acme|Tool|2|77");
        assert_eq!(cell(&cols, "eventId", 0), &text("77"));
        assert_eq!(cell(&cols, "vendor", 0), &text("Acme"));
    }

    #[test]
    fn leef_v2_explicit_delimiter() {
        // LEEF 2.0 carrying a '^' delimiter field between header and extension.
        let cols =
            leef("LEEF:2.0|Vendor|Product|2.5|1001|^|src=10.0.0.1^dst=10.0.0.2^spt=22\n");
        assert_eq!(cell(&cols, "eventId", 0), &text("1001"));
        assert_eq!(cell(&cols, "src", 0), &text("10.0.0.1"));
        assert_eq!(cell(&cols, "spt", 0), &Value::Int(22));
        // The delimiter field must not show up as a data column.
        assert!(cols.iter().all(|c| c.name != "^"));
    }

    #[test]
    fn leef_v2_hex_delimiter() {
        // x09 selects tab as the delimiter.
        let cols = leef("LEEF:2.0|V|P|1|99|x09|a=1\tb=2\n");
        assert_eq!(cell(&cols, "a", 0), &Value::Int(1));
        assert_eq!(cell(&cols, "b", 0), &Value::Int(2));
    }

    #[test]
    fn leef_malformed_and_sniff() {
        let bad_inputs: [&[u8]; 2] = [b"LEEF:1.0|onlytwo\n", b"not leef\n"];
        for input in bad_inputs {
            assert!(matches!(
                LeefParser.parse("-", input),
                Err(AxError::Parse { .. })
            ));
        }
        assert_eq!(LeefParser.sniff(b"LEEF:1.0|v|p|1|x|a=1"), Some(STRONG));
        assert_eq!(LeefParser.sniff(b"CEF:0|v|p|1|1|n|5|"), None);
        assert_eq!(LeefParser.extensions(), &["leef"]);
        let registry = crate::parser::ParserRegistry::default();
        assert_eq!(registry.resolve("e.leef", b"x").unwrap().id(), "leef");
        assert_eq!(
            registry
                .resolve("-", b"LEEF:1.0|v|p|1|x|a=1\n")
                .unwrap()
                .id(),
            "leef"
        );
    }
}