Skip to main content

rsigma_runtime/parse/
cef.rs

1//! Zero-dependency CEF (Common Event Format) parser.
2//!
3//! Parses [ArcSight CEF](https://www.microfocus.com/documentation/arcsight/arcsight-smartconnectors-8.4/cef-implementation-standard/)
4//! log lines into a structured [`CefRecord`].
5//!
6//! # Format
7//!
8//! ```text
9//! CEF:Version|Device Vendor|Device Product|Device Version|Signature ID|Name|Severity|Extensions
10//! ```
11//!
12//! The header contains 7 pipe-delimited fields. Pipes in header values are
13//! escaped as `\|` and backslashes as `\\`.
14//!
15//! Extensions are space-separated `key=value` pairs where values may contain
16//! spaces. The boundary between one value and the next key is determined by
17//! looking back from each unescaped `=` to find the key name. In extension
18//! values, `\=` is a literal `=`, `\\` is a literal `\`, `\n` is a newline,
19//! and `\r` is a carriage return.
20//!
21//! # Syslog wrapping
22//!
//! This parser does not interpret syslog envelopes. [`parse`] ignores any
//! text that precedes the first `"CEF:"` marker, so a syslog-wrapped line can
//! be passed as-is. The [`find_cef_start`] helper returns the marker's byte
//! offset for callers that prefer to strip the prefix themselves.
26//!
27//! # Example
28//!
29//! ```
30//! use rsigma_runtime::parse::cef::parse;
31//!
32//! let record = parse(
33//!     "CEF:0|Security|IDS|1.0|100|Attack detected|9|src=10.0.0.1 dst=192.168.1.1 msg=Intrusion attempt"
34//! ).unwrap();
35//!
36//! assert_eq!(record.device_vendor, "Security");
37//! assert_eq!(record.severity, "9");
38//! assert_eq!(record.extensions.len(), 3);
39//! assert_eq!(record.extensions[2].0, "msg");
40//! assert_eq!(record.extensions[2].1, "Intrusion attempt");
41//! ```
42
43use std::fmt;
44
/// A parsed CEF record.
///
/// The six named header fields are stored as strings with header escapes
/// (`\|`, `\\`) already resolved; only the version is converted to a number.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CefRecord {
    /// CEF version (typically 0).
    pub version: u32,
    /// Header field 2: the device vendor.
    pub device_vendor: String,
    /// Header field 3: the device product.
    pub device_product: String,
    /// Header field 4: the device version.
    pub device_version: String,
    /// Header field 5: the signature (event class) identifier.
    pub signature_id: String,
    /// Header field 6: the event name.
    pub name: String,
    /// Header field 7: the severity, kept verbatim as text.
    pub severity: String,
    /// Extension key-value pairs, in the order they appeared.
    pub extensions: Vec<(String, String)>,
}
59
/// Errors from CEF parsing.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CefError {
    /// Input does not contain the `CEF:` marker.
    NotCef,
    /// Fewer than 7 unescaped `|` separators follow the `CEF:` marker, so
    /// the header is incomplete.
    IncompleteHeader,
    /// The version field is not a valid integer.
    InvalidVersion,
}
70
71impl fmt::Display for CefError {
72    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
73        match self {
74            CefError::NotCef => write!(f, "input does not contain a CEF header"),
75            CefError::IncompleteHeader => {
76                write!(f, "CEF header requires 7 pipe-delimited fields")
77            }
78            CefError::InvalidVersion => write!(f, "CEF version is not a valid integer"),
79        }
80    }
81}
82
// Marker impl: the trait's default methods are used as-is, which lets
// `CefError` be used wherever a `std::error::Error` is expected.
impl std::error::Error for CefError {}
84
/// Locate the first occurrence of the `"CEF:"` marker in `input`.
///
/// Returns the byte offset of the marker's first character, or `None` when
/// the marker does not occur. Handy for slicing off a syslog prefix before
/// handing the remainder to [`parse`].
pub fn find_cef_start(input: &str) -> Option<usize> {
    input
        .match_indices("CEF:")
        .next()
        .map(|(offset, _)| offset)
}
91
92/// Parse a CEF line into a [`CefRecord`].
93///
94/// Expects input starting at `CEF:` (use [`find_cef_start`] to locate it
95/// within a syslog-wrapped line).
96pub fn parse(input: &str) -> Result<CefRecord, CefError> {
97    let input = input.trim();
98
99    // Locate the CEF header start.
100    let cef_start = find_cef_start(input).ok_or(CefError::NotCef)?;
101    let after_marker = &input[cef_start + 4..]; // skip "CEF:"
102
103    // Split header fields on unescaped `|`. We need exactly 7 separators
104    // (version + 6 named fields), with the rest being the extension.
105    let header_fields = split_header(after_marker);
106    if header_fields.len() < 8 {
107        return Err(CefError::IncompleteHeader);
108    }
109
110    let version: u32 = header_fields[0]
111        .trim()
112        .parse()
113        .map_err(|_| CefError::InvalidVersion)?;
114
115    let extensions = if header_fields.len() > 7 {
116        parse_extensions(header_fields[7])
117    } else {
118        Vec::new()
119    };
120
121    Ok(CefRecord {
122        version,
123        device_vendor: unescape_header(header_fields[1]),
124        device_product: unescape_header(header_fields[2]),
125        device_version: unescape_header(header_fields[3]),
126        signature_id: unescape_header(header_fields[4]),
127        name: unescape_header(header_fields[5]),
128        severity: unescape_header(header_fields[6]),
129        extensions,
130    })
131}
132
/// Break the text following `CEF:` into header segments at unescaped `|`.
///
/// A backslash escapes the byte that follows it, so `\|` never splits. At
/// most 8 segments are produced: the version, the 6 named header fields, and
/// the extension tail (everything after the 7th separator, taken verbatim).
/// With fewer than 7 separators the final segment is whatever text remains.
fn split_header(input: &str) -> Vec<&str> {
    const HEADER_PIPES: usize = 7;

    let mut parts = Vec::with_capacity(HEADER_PIPES + 1);
    let mut field_start = 0;
    let mut escaped = false;

    for (pos, byte) in input.bytes().enumerate() {
        if escaped {
            // This byte was escaped by the preceding backslash; ignore it.
            escaped = false;
            continue;
        }
        match byte {
            b'\\' => escaped = true,
            b'|' => {
                parts.push(&input[field_start..pos]);
                field_start = pos + 1;
                if parts.len() == HEADER_PIPES {
                    // Later pipes belong to the extension tail.
                    break;
                }
            }
            _ => {}
        }
    }

    // Either the extension tail (after the 7th pipe) or the trailing remainder.
    parts.push(&input[field_start..]);
    parts
}
170
/// Unescape a CEF header field value (`\|` → `|`, `\\` → `\`).
///
/// A backslash followed by any other character, or a trailing backslash, is
/// kept literally. The input is walked by `char` rather than by byte so that
/// non-ASCII text (multi-byte UTF-8 sequences) is copied through intact.
fn unescape_header(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '\\' {
            out.push(ch);
            continue;
        }
        // Consume the next char only when it forms a recognised escape;
        // otherwise emit the backslash and let the next char be handled
        // normally on the following iteration.
        match chars.next_if(|&next| matches!(next, '|' | '\\')) {
            Some(escaped) => out.push(escaped),
            None => out.push('\\'),
        }
    }

    out
}
202
203/// Parse CEF extension key=value pairs.
204///
205/// Uses the "split by unescaped `=`, then look-back" algorithm:
206/// 1. Split the extension string on every unescaped `=`.
207/// 2. For each pair of consecutive segments, the **last word** of the left
208///    segment is the key, and everything in the right segment (up to its own
209///    last word, which is the *next* key) is the value.
210/// 3. The very last segment is the value for the final key (no look-ahead).
211fn parse_extensions(input: &str) -> Vec<(String, String)> {
212    let input = input.trim();
213    if input.is_empty() {
214        return Vec::new();
215    }
216
217    let segments = split_on_unescaped_eq(input);
218    if segments.len() < 2 {
219        return Vec::new();
220    }
221
222    let mut pairs = Vec::new();
223    let n = segments.len();
224    let mut current_key = extract_last_word(segments[0]);
225
226    for (i, segment) in segments.iter().enumerate().skip(1) {
227        let key = std::mem::take(&mut current_key);
228        if i < n - 1 {
229            // Intermediate segment: its last word is the next key; everything
230            // before it is the value for the current key.
231            match segment.rsplit_once(' ') {
232                Some((value_part, next_key)) => {
233                    pairs.push((key, unescape_extension(value_part.trim())));
234                    current_key = next_key.to_string();
235                }
236                None => {
237                    // No space — entire segment is the value (degenerate case).
238                    pairs.push((key, unescape_extension(segment.trim())));
239                }
240            }
241        } else {
242            // Final segment: the entire content is the value for the current key.
243            pairs.push((key, unescape_extension(segment.trim())));
244        }
245    }
246
247    pairs
248}
249
/// Cut `input` at every `=` that is not preceded by an escaping backslash.
///
/// A backslash escapes the single byte after it, so `\=` stays inside its
/// piece and `\\=` still splits. The separators themselves are dropped; the
/// result always has at least one element (the whole input when there is
/// nothing to split on).
fn split_on_unescaped_eq(input: &str) -> Vec<&str> {
    let mut pieces = Vec::new();
    let mut piece_start = 0;
    let mut skip_next = false;

    for (pos, byte) in input.bytes().enumerate() {
        if skip_next {
            // Escaped byte: never treated as a separator or a new escape.
            skip_next = false;
            continue;
        }
        match byte {
            b'\\' => skip_next = true,
            b'=' => {
                pieces.push(&input[piece_start..pos]);
                piece_start = pos + 1;
            }
            _ => {}
        }
    }

    pieces.push(&input[piece_start..]);
    pieces
}
272
/// Return the text after the final ASCII space in `s` as an owned string.
///
/// When `s` contains no space the whole string is returned; when it ends
/// with a space the result is empty. Only `' '` counts as a delimiter.
fn extract_last_word(s: &str) -> String {
    let tail = match s.rfind(' ') {
        Some(space_at) => &s[space_at + 1..],
        None => s,
    };
    tail.to_owned()
}
280
/// Unescape a CEF extension value (`\=` → `=`, `\\` → `\`, `\n` → newline, `\r` → CR).
///
/// A backslash followed by any other character, or a trailing backslash, is
/// kept literally. The input is walked by `char` rather than by byte so that
/// non-ASCII text (multi-byte UTF-8 sequences) is copied through intact.
fn unescape_extension(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '\\' {
            out.push(ch);
            continue;
        }

        // Look at the char after the backslash without consuming it, so an
        // unrecognised escape leaves that char for the next iteration.
        let decoded = match chars.peek().copied() {
            Some('=') => Some('='),
            Some('\\') => Some('\\'),
            Some('n') => Some('\n'),
            Some('r') => Some('\r'),
            _ => None,
        };

        match decoded {
            Some(replacement) => {
                out.push(replacement);
                chars.next();
            }
            None => out.push('\\'),
        }
    }

    out
}
320
// Unit tests for the CEF parser. They exercise the public entry points
// (`parse`, `find_cef_start`) only; the private helpers are covered
// indirectly through them.
#[cfg(test)]
mod tests {
    use super::*;

    // -- Header parsing -------------------------------------------------------

    // Header only: the 7th pipe is present and nothing follows it.
    #[test]
    fn minimal_cef() {
        let r = parse("CEF:0|Vendor|Product|1.0|100|Name|5|").unwrap();
        assert_eq!(r.version, 0);
        assert_eq!(r.device_vendor, "Vendor");
        assert_eq!(r.device_product, "Product");
        assert_eq!(r.device_version, "1.0");
        assert_eq!(r.signature_id, "100");
        assert_eq!(r.name, "Name");
        assert_eq!(r.severity, "5");
        assert!(r.extensions.is_empty());
    }

    // NOTE(review): despite the name, this input does contain the 7th pipe;
    // the test checks that the extensions directly after it are parsed.
    #[test]
    fn header_without_trailing_pipe_extensions() {
        let r = parse("CEF:0|Vendor|Product|1.0|100|Name|5|src=10.0.0.1 dst=192.168.1.1").unwrap();
        assert_eq!(r.extensions.len(), 2);
        assert_eq!(r.extensions[0], ("src".into(), "10.0.0.1".into()));
        assert_eq!(r.extensions[1], ("dst".into(), "192.168.1.1".into()));
    }

    // `\|` inside a header field must not split the field and is unescaped.
    #[test]
    fn escaped_pipe_in_header() {
        let r = parse(r"CEF:0|Ven\|dor|Product|1.0|100|Na\|me|5|").unwrap();
        assert_eq!(r.device_vendor, "Ven|dor");
        assert_eq!(r.name, "Na|me");
    }

    // `\\` inside a header field collapses to a single backslash.
    #[test]
    fn escaped_backslash_in_header() {
        let r = parse(r"CEF:0|Ven\\dor|Product|1.0|100|Name|5|").unwrap();
        assert_eq!(r.device_vendor, r"Ven\dor");
    }

    // No `CEF:` marker anywhere in the input.
    #[test]
    fn not_cef() {
        assert_eq!(parse("not a CEF line"), Err(CefError::NotCef));
    }

    // Only two pipes follow the marker, so the header is rejected.
    #[test]
    fn incomplete_header() {
        assert_eq!(
            parse("CEF:0|Vendor|Product"),
            Err(CefError::IncompleteHeader)
        );
    }

    // A complete header whose version field is not numeric.
    #[test]
    fn invalid_version() {
        assert_eq!(
            parse("CEF:abc|Vendor|Product|1.0|100|Name|5|"),
            Err(CefError::InvalidVersion)
        );
    }

    // -- Extension parsing ----------------------------------------------------

    #[test]
    fn single_extension() {
        let r = parse("CEF:0|V|P|1|1|N|1|src=10.0.0.1").unwrap();
        assert_eq!(r.extensions, vec![("src".into(), "10.0.0.1".into())]);
    }

    // Pairs are returned in input order.
    #[test]
    fn multiple_extensions() {
        let r = parse("CEF:0|V|P|1|1|N|1|src=10.0.0.1 dst=192.168.1.1 dpt=443").unwrap();
        assert_eq!(r.extensions.len(), 3);
        assert_eq!(r.extensions[0], ("src".into(), "10.0.0.1".into()));
        assert_eq!(r.extensions[1], ("dst".into(), "192.168.1.1".into()));
        assert_eq!(r.extensions[2], ("dpt".into(), "443".into()));
    }

    // A non-final value may contain spaces; only its last word is the next key.
    #[test]
    fn extension_value_with_spaces() {
        let r = parse("CEF:0|V|P|1|1|N|1|msg=User signed in from 10.0.0.1 src=10.0.0.1").unwrap();
        assert_eq!(r.extensions.len(), 2);
        assert_eq!(
            r.extensions[0],
            ("msg".into(), "User signed in from 10.0.0.1".into())
        );
        assert_eq!(r.extensions[1], ("src".into(), "10.0.0.1".into()));
    }

    // `\=` inside a value is a literal `=`, not a key/value separator.
    #[test]
    fn extension_escaped_equals() {
        let r =
            parse(r"CEF:0|V|P|1|1|N|1|request=https://example.com?foo\=bar src=10.0.0.1").unwrap();
        assert_eq!(r.extensions.len(), 2);
        assert_eq!(
            r.extensions[0],
            ("request".into(), "https://example.com?foo=bar".into())
        );
    }

    // `\\` inside a value collapses to a single backslash.
    #[test]
    fn extension_escaped_backslash() {
        let r = parse(r"CEF:0|V|P|1|1|N|1|path=C:\\Windows\\System32").unwrap();
        assert_eq!(
            r.extensions[0],
            ("path".into(), r"C:\Windows\System32".into())
        );
    }

    // The two-character sequence `\n` becomes a real newline.
    #[test]
    fn extension_escaped_newline() {
        let r = parse(r"CEF:0|V|P|1|1|N|1|msg=line1\nline2").unwrap();
        assert_eq!(r.extensions[0], ("msg".into(), "line1\nline2".into()));
    }

    // The two-character sequence `\r` becomes a real carriage return.
    #[test]
    fn extension_escaped_cr() {
        let r = parse(r"CEF:0|V|P|1|1|N|1|msg=line1\rline2").unwrap();
        assert_eq!(r.extensions[0], ("msg".into(), "line1\rline2".into()));
    }

    #[test]
    fn empty_extensions() {
        let r = parse("CEF:0|V|P|1|1|N|1|").unwrap();
        assert!(r.extensions.is_empty());
    }

    // Trailing whitespace after the 7th pipe yields no extensions.
    #[test]
    fn whitespace_only_extensions() {
        let r = parse("CEF:0|V|P|1|1|N|1|   ").unwrap();
        assert!(r.extensions.is_empty());
    }

    // -- find_cef_start -------------------------------------------------------

    // The returned offset is used to slice off the syslog prefix before parsing.
    #[test]
    fn find_cef_in_syslog() {
        let line = "<134>2022-02-14T03:17:30-08:00 host CEF:0|V|P|1|1|N|1|src=10.0.0.1";
        let offset = find_cef_start(line).unwrap();
        let r = parse(&line[offset..]).unwrap();
        assert_eq!(r.device_vendor, "V");
        assert_eq!(r.extensions[0], ("src".into(), "10.0.0.1".into()));
    }

    #[test]
    fn find_cef_no_match() {
        assert_eq!(find_cef_start("just a regular log line"), None);
    }

    // -- Real-world samples ---------------------------------------------------

    // The final `msg` value contains spaces and runs to the end of the line.
    #[test]
    fn real_world_arcsight() {
        let line = "CEF:0|ArcSight|ArcSight|7.0.0|agent:030|Agent Started|1|deviceExternalId=001 rt=1644800250000 cat=agent msg=ArcSight agent started successfully";
        let r = parse(line).unwrap();
        assert_eq!(r.device_vendor, "ArcSight");
        assert_eq!(r.name, "Agent Started");
        assert_eq!(r.extensions.len(), 4);
        assert_eq!(r.extensions[0], ("deviceExternalId".into(), "001".into()));
        assert_eq!(r.extensions[1], ("rt".into(), "1644800250000".into()));
        assert_eq!(r.extensions[2], ("cat".into(), "agent".into()));
        assert_eq!(
            r.extensions[3],
            ("msg".into(), "ArcSight agent started successfully".into())
        );
    }

    // NOTE(review): the sample has six plain key=value pairs and no `*Label`
    // keys, so the "labels" in the name is not reflected in the input.
    #[test]
    fn real_world_with_labels() {
        let line = "CEF:0|Vendor|Firewall|2.0|100|Connection Blocked|8|src=10.0.0.1 dst=192.168.1.100 spt=12345 dpt=443 proto=TCP act=blocked";
        let r = parse(line).unwrap();
        assert_eq!(r.extensions.len(), 6);
        assert_eq!(r.extensions[0], ("src".into(), "10.0.0.1".into()));
        assert_eq!(r.extensions[1], ("dst".into(), "192.168.1.100".into()));
        assert_eq!(r.extensions[2], ("spt".into(), "12345".into()));
        assert_eq!(r.extensions[3], ("dpt".into(), "443".into()));
        assert_eq!(r.extensions[4], ("proto".into(), "TCP".into()));
        assert_eq!(r.extensions[5], ("act".into(), "blocked".into()));
    }

    // Header fields may contain spaces ("Palo Alto"); the prefix is sliced
    // off via `find_cef_start` before parsing.
    #[test]
    fn real_world_syslog_wrapped_cef() {
        let line = "<134>Feb 14 19:04:54 firewall01 CEF:0|Palo Alto|PAN-OS|10.1|THREAT|threat|7|src=172.16.0.5 dst=10.10.10.1 msg=Malware detected in file transfer";
        let offset = find_cef_start(line).unwrap();
        let r = parse(&line[offset..]).unwrap();
        assert_eq!(r.device_vendor, "Palo Alto");
        assert_eq!(r.device_product, "PAN-OS");
        assert_eq!(r.extensions.len(), 3);
        assert_eq!(
            r.extensions[2],
            ("msg".into(), "Malware detected in file transfer".into())
        );
    }

    // NOTE(review): same input as `single_extension` above.
    #[test]
    fn extension_single_value_no_spaces() {
        let r = parse("CEF:0|V|P|1|1|N|1|src=10.0.0.1").unwrap();
        assert_eq!(r.extensions.len(), 1);
        assert_eq!(r.extensions[0], ("src".into(), "10.0.0.1".into()));
    }

    // The last value keeps all of its words; there is no next key to peel off.
    #[test]
    fn extension_last_value_has_spaces() {
        let r = parse("CEF:0|V|P|1|1|N|1|src=10.0.0.1 msg=This is the final message").unwrap();
        assert_eq!(r.extensions.len(), 2);
        assert_eq!(r.extensions[0], ("src".into(), "10.0.0.1".into()));
        assert_eq!(
            r.extensions[1],
            ("msg".into(), "This is the final message".into())
        );
    }

    // Versions other than 0 are accepted as long as they parse as integers.
    #[test]
    fn version_1() {
        let r = parse("CEF:1|V|P|1|1|N|1|src=10.0.0.1").unwrap();
        assert_eq!(r.version, 1);
    }
}