Skip to main content

anomalyx_normalize/parsers/
syslog.rs

1//! Syslog parser — RFC 3164 (BSD) and RFC 5424 wire messages.
2//!
3//! Both variants begin with a `<PRI>` priority header (`PRI = facility*8 +
4//! severity`, 0–191). We derive the numeric `facility`/`severity` ourselves from
5//! that header (clean and deterministic) and delegate the harder dual-RFC field
6//! parsing — BSD vs ISO timestamps, app/host/proc IDs, RFC 5424 structured data
7//! — to `syslog_loose`. One row per message, with columns the detectors want:
8//! `severity`/`facility` for event-rate `dist` drift, `hostname` for rare-host
9//! `structural`/`dist`, and rows-as-an-ordered-series for off-hours `contextual`
10//! (`--period 24`).
11//!
12//! Determinism: `syslog_loose`'s default entry point fills a year-less RFC 3164
13//! timestamp from the wall clock and the local time zone. We instead pin a fixed
14//! year and UTC, so the same bytes always normalize identically (the real
15//! month/day/time are preserved; only the absent RFC 3164 year is a sentinel).
16//!
17//! Detected by the `<PRI>` header **or** the PRI-less file format that
18//! rsyslog/syslog-ng actually write (ISO-8601 or BSD timestamp, then host and
19//! tag) — recognized by `syslog_loose` extracting a timestamp + host + app.
20//! `facility`/`severity` exist only when a `<PRI>` is present. Claims `.syslog`
21//! (a plain `.log` is too generic). A line that is neither is a clean parse error.
22
23use crate::parser::{Confidence, FormatParser, STRONG};
24use crate::table::TableBuilder;
25use ax_core::{AxError, Column, Value};
26use chrono::Utc;
27use std::collections::BTreeMap;
28use syslog_loose::{parse_message_with_year_tz, ProcId, Protocol, Variant};
29
/// Syslog format parser. A field-less unit struct: all behavior lives in its
/// `FormatParser` implementation (format id `"syslog"`), so any two values are
/// interchangeable.
#[derive(Debug, Default, Clone)]
pub struct SyslogParser;
32
/// Stand-in year for RFC 3164 timestamps, whose year is unknowable from the
/// wire. It is handed to `syslog_loose` as the year resolver so the parse is
/// deterministic (the month/day/time carry the real information).
const SENTINEL_YEAR: i32 = 1970;
36
37/// Whether a line is syslog: either a `<PRI>` wire header, or the PRI-less file
38/// format (what rsyslog/syslog-ng actually write to `/var/log/syslog`) — which
39/// we recognize by `syslog_loose` extracting a timestamp **and** a hostname
40/// **and** an appname. Requiring all three keeps a timestamp-leading CSV row
41/// (no space-delimited host/app after the time) from being mistaken for syslog.
42fn looks_like_syslog(line: &str) -> bool {
43    if parse_pri(line).is_some() {
44        return true;
45    }
46    let m = parse_message_with_year_tz(line, |_| SENTINEL_YEAR, Some(Utc), Variant::Either);
47    m.timestamp.is_some() && m.hostname.is_some() && m.appname.is_some()
48}
49
/// Parses the leading `<PRI>` header into `(facility, severity)`.
///
/// `PRI = facility * 8 + severity` and must lie in 0–191. The text between the
/// angle brackets must be one to three ASCII digits (RFC 5424's
/// `PRIVAL = 1*3DIGIT`). This is checked explicitly because `str::parse` on an
/// unsigned integer also accepts a leading `+` and arbitrarily many leading
/// zeros, so `<+34>` or `<0000034>` would otherwise pass as a priority.
///
/// Returns `None` when the line does not start with `<`, has no closing `>`,
/// the bracketed text is not 1–3 digits, or the value exceeds 191.
fn parse_pri(line: &str) -> Option<(i64, i64)> {
    let (digits, _) = line.strip_prefix('<')?.split_once('>')?;
    let well_formed =
        (1..=3).contains(&digits.len()) && digits.bytes().all(|b| b.is_ascii_digit());
    if !well_formed {
        return None;
    }
    let pri: u16 = digits.parse().ok()?;
    (pri <= 191).then_some((i64::from(pri / 8), i64::from(pri % 8)))
}
58
59impl SyslogParser {
60    fn err(&self, msg: impl std::fmt::Display) -> AxError {
61        AxError::Parse {
62            format: self.id().to_string(),
63            message: msg.to_string(),
64        }
65    }
66}
67
68impl FormatParser for SyslogParser {
69    fn id(&self) -> &'static str {
70        "syslog"
71    }
72    fn extensions(&self) -> &'static [&'static str] {
73        &["syslog"]
74    }
75    fn sniff(&self, bytes: &[u8]) -> Option<Confidence> {
76        let text = std::str::from_utf8(bytes).ok()?;
77        let line = text.lines().find(|l| !l.trim().is_empty())?;
78        looks_like_syslog(line).then_some(STRONG)
79    }
80    fn parse(&self, _source: &str, bytes: &[u8]) -> Result<Vec<Column>, AxError> {
81        let text = std::str::from_utf8(bytes).map_err(|e| self.err(e))?;
82        let mut builder = TableBuilder::new();
83        for line in text.lines() {
84            if line.trim().is_empty() {
85                continue;
86            }
87            let pri = parse_pri(line);
88            let msg =
89                parse_message_with_year_tz(line, |_| SENTINEL_YEAR, Some(Utc), Variant::Either);
90            // Accept a line with a `<PRI>` header (wire format) OR a recognizable
91            // timestamp (the PRI-less file format rsyslog/syslog-ng write).
92            // `syslog_loose` only yields a timestamp once it has also parsed the
93            // host/tag that follow it, so the timestamp alone is a sufficient gate.
94            if pri.is_none() && msg.timestamp.is_none() {
95                return Err(
96                    self.err("not a syslog line: no <PRI> header and no recognizable timestamp")
97                );
98            }
99
100            let mut row: BTreeMap<String, Value> = BTreeMap::new();
101            // facility/severity come only from the `<PRI>` header; a file-format
102            // line has none, so those columns are simply absent for it.
103            if let Some((facility, severity)) = pri {
104                row.insert("facility".into(), Value::Int(facility));
105                row.insert("severity".into(), Value::Int(severity));
106            }
107            row.insert(
108                "protocol".into(),
109                Value::Str(
110                    match msg.protocol {
111                        Protocol::RFC3164 => "RFC3164",
112                        Protocol::RFC5424(_) => "RFC5424",
113                    }
114                    .to_string(),
115                ),
116            );
117            if let Some(ts) = msg.timestamp {
118                row.insert("timestamp".into(), Value::Str(ts.to_string()));
119            }
120            if let Some(host) = msg.hostname {
121                row.insert("hostname".into(), Value::Str(host.to_string()));
122            }
123            if let Some(app) = msg.appname {
124                row.insert("appname".into(), Value::Str(app.to_string()));
125            }
126            if let Some(procid) = msg.procid {
127                let v = match procid {
128                    ProcId::PID(pid) => Value::Int(pid as i64),
129                    ProcId::Name(name) => Value::Str(name.to_string()),
130                };
131                row.insert("procid".into(), v);
132            }
133            if let Some(msgid) = msg.msgid {
134                row.insert("msgid".into(), Value::Str(msgid.to_string()));
135            }
136            for element in &msg.structured_data {
137                for (key, value) in &element.params {
138                    row.insert(
139                        format!("sd.{}.{}", element.id, key),
140                        Value::Str(value.to_string()),
141                    );
142                }
143            }
144            row.insert("message".into(), Value::Str(msg.msg.to_string()));
145            builder.push_row(row);
146        }
147        Ok(builder.finish())
148    }
149}
150
// Unit tests for the syslog parser: priority decoding, per-RFC field
// extraction, determinism of the sentinel year, sniffing, and registry
// resolution.
#[cfg(test)]
mod tests {
    use super::*;
    use ax_core::ColType;

    /// Two-line fixture: a version-1 (RFC 5424) message with PRI 165 and
    /// structured data, followed by a BSD-style (RFC 3164) message with PRI 34.
    const SYSLOG: &str = concat!(
        r#"<165>1 2003-10-11T22:14:15.003Z mymachine.example.com evntslog 1234 ID47 [exampleSDID@32473 iut="3" eventID="1011"] App event log entry"#,
        "\n",
        "<34>Oct 11 22:14:15 mymachine su[567]: 'su root' failed for lonvick\n",
    );

    /// Parses `s` with `SyslogParser`, panicking if the parse fails.
    fn parse(s: &str) -> Vec<Column> {
        SyslogParser.parse("-", s.as_bytes()).unwrap()
    }
    /// Finds the column called `name`, panicking with that name if it is absent.
    fn col<'a>(cols: &'a [Column], name: &str) -> &'a Column {
        cols.iter()
            .find(|c| c.name == name)
            .unwrap_or_else(|| panic!("missing column {name}"))
    }

    #[test]
    fn priority_decodes_to_facility_and_severity() {
        let cols = parse(SYSLOG);
        let fac = col(&cols, "facility");
        let sev = col(&cols, "severity");
        assert_eq!(fac.ty, ColType::Int);
        assert_eq!(sev.ty, ColType::Int);
        assert_eq!(fac.cells, vec![Value::Int(20), Value::Int(4)]); // 165/8, 34/8
        assert_eq!(sev.cells, vec![Value::Int(5), Value::Int(2)]); // 165%8, 34%8
    }

    #[test]
    fn both_rfc_variants_parse_their_fields() {
        let cols = parse(SYSLOG);
        assert_eq!(
            col(&cols, "protocol").cells,
            vec![Value::Str("RFC5424".into()), Value::Str("RFC3164".into())]
        );
        assert_eq!(
            col(&cols, "hostname").cells,
            vec![
                Value::Str("mymachine.example.com".into()),
                Value::Str("mymachine".into())
            ]
        );
        assert_eq!(
            col(&cols, "appname").cells,
            vec![Value::Str("evntslog".into()), Value::Str("su".into())]
        );
        assert_eq!(
            col(&cols, "procid").cells,
            vec![Value::Int(1234), Value::Int(567)]
        );
    }

    #[test]
    fn rfc5424_only_fields_pad_with_null() {
        let cols = parse(SYSLOG);
        // msgid and structured data exist only on the RFC 5424 row.
        assert_eq!(col(&cols, "msgid").cells[0], Value::Str("ID47".into()));
        assert_eq!(col(&cols, "msgid").cells[1], Value::Null);
        let sd = col(&cols, "sd.exampleSDID@32473.iut");
        assert_eq!(sd.cells[0], Value::Str("3".into()));
        assert_eq!(sd.cells[1], Value::Null);
        assert_eq!(
            col(&cols, "sd.exampleSDID@32473.eventID").cells[0],
            Value::Str("1011".into())
        );
    }

    #[test]
    fn message_body_is_captured() {
        let cols = parse(SYSLOG);
        let msg = col(&cols, "message");
        assert_eq!(msg.cells[0], Value::Str("App event log entry".into()));
        assert_eq!(
            msg.cells[1],
            Value::Str("'su root' failed for lonvick".into())
        );
    }

    #[test]
    fn deterministic_across_calls() {
        // Same bytes → byte-identical columns, despite RFC 3164's missing year
        // (pinned to a sentinel, never the wall clock).
        assert_eq!(
            format!("{:?}", parse(SYSLOG)),
            format!("{:?}", parse(SYSLOG))
        );
        // The RFC 3164 timestamp uses the sentinel year, deterministically.
        let cols = parse(SYSLOG);
        let ts = col(&cols, "timestamp");
        match &ts.cells[1] {
            Value::Str(s) => assert!(s.starts_with("1970-"), "sentinel year, got {s}"),
            other => panic!("expected Str timestamp, got {other:?}"),
        }
    }

    #[test]
    fn parse_pri_units() {
        assert_eq!(parse_pri("<0>x"), Some((0, 0)));
        assert_eq!(parse_pri("<34>x"), Some((4, 2)));
        assert_eq!(parse_pri("<165>x"), Some((20, 5)));
        assert_eq!(parse_pri("<191>x"), Some((23, 7))); // max valid
        assert_eq!(parse_pri("<192>x"), None); // out of range
        assert_eq!(parse_pri("<abc>x"), None); // not a number
        assert_eq!(parse_pri("<34"), None); // unterminated
        assert_eq!(parse_pri("no bracket"), None);
    }

    #[test]
    fn malformed_lines_error() {
        // Neither input has a valid `<PRI>` header or a timestamp, so both
        // must surface as a parse error rather than a row.
        assert!(matches!(
            SyslogParser.parse("-", b"this is not syslog\n"),
            Err(AxError::Parse { .. })
        ));
        assert!(matches!(
            SyslogParser.parse("-", b"<192>priority out of range\n"),
            Err(AxError::Parse { .. })
        ));
    }

    /// The PRI-less file formats that rsyslog/syslog-ng actually write to disk.
    const ISO_FILE: &[u8] =
        b"2026-06-01T09:14:57.403686-07:00 4ubox NetworkManager[3524]: dhcp4 beginning\n";
    const BSD_FILE: &[u8] = b"Jun  1 09:14:57 4ubox NetworkManager[3524]: dhcp4 beginning\n";

    #[test]
    fn sniff_keys_on_pri_header() {
        assert_eq!(SyslogParser.sniff(SYSLOG.as_bytes()), Some(STRONG));
        assert_eq!(
            SyslogParser.sniff(b"<13>Feb  5 17:32:18 host app: msg\n"),
            Some(STRONG)
        );
        assert_eq!(SyslogParser.sniff(b"<999>bad pri\n"), None); // > 191
        assert_eq!(SyslogParser.sniff(b"<?xml version=\"1.0\"?>"), None); // XML, not PRI
        assert_eq!(SyslogParser.sniff(b"plain text line\n"), None);
        assert_eq!(SyslogParser.sniff(b"{\"a\":1}"), None);
        assert_eq!(SyslogParser.sniff(b"a,b,c\n1,2,3"), None);
    }

    #[test]
    fn sniff_recognizes_pri_less_file_format() {
        // The real /var/log/syslog format (no <PRI>): ISO-8601 and BSD timestamps.
        assert_eq!(SyslogParser.sniff(ISO_FILE), Some(STRONG));
        assert_eq!(SyslogParser.sniff(BSD_FILE), Some(STRONG));
        // But a timestamp-leading CSV (no space-delimited host/app) is NOT syslog.
        assert_eq!(SyslogParser.sniff(b"2026-06-01T09:14:57,42,foo\n"), None);
        // And it wins over the greedy `ini` sniff through the registry.
        let reg = crate::parser::ParserRegistry::default();
        assert_eq!(reg.resolve("-", ISO_FILE).unwrap().id(), "syslog");
    }

    #[test]
    fn pri_less_file_line_parses_without_facility_severity() {
        let cols = SyslogParser.parse("-", ISO_FILE).unwrap();
        // The fields syslog_loose recovers are present...
        assert_eq!(col(&cols, "hostname").cells[0], Value::Str("4ubox".into()));
        assert_eq!(
            col(&cols, "appname").cells[0],
            Value::Str("NetworkManager".into())
        );
        assert_eq!(col(&cols, "procid").cells[0], Value::Int(3524));
        assert!(
            matches!(&col(&cols, "timestamp").cells[0], Value::Str(s) if s.starts_with("2026-06-01"))
        );
        // ...but facility/severity columns don't exist (no <PRI> to derive them).
        assert!(cols
            .iter()
            .all(|c| c.name != "facility" && c.name != "severity"));
    }

    #[test]
    fn claims_syslog_extension() {
        assert_eq!(SyslogParser.extensions(), &["syslog"]);
    }

    #[test]
    fn resolves_by_extension_and_content() {
        // First by the `.syslog` extension, then by content alone ("-" source).
        let reg = crate::parser::ParserRegistry::default();
        assert_eq!(
            reg.resolve("app.syslog", b"<34>Oct 11 22:14:15 h a: m")
                .unwrap()
                .id(),
            "syslog"
        );
        assert_eq!(reg.resolve("-", SYSLOG.as_bytes()).unwrap().id(), "syslog");
    }
}
339}