Skip to main content

anomalyx_normalize/parsers/
accesslog.rs

1//! Combined / Common Log Format parser — nginx & Apache access logs.
2//!
3//! The NCSA Common Log Format is positional:
4//! `host ident user [time] "request" status bytes`, and the Combined Format adds
5//! `"referer" "user-agent"`. The bracketed time and the quoted request/referer/
6//! user-agent are single fields even though they contain spaces, so we tokenize
7//! with `[...]` and `"..."` treated as one token each (honoring `\"` escapes).
8//!
9//! The conventional `-` placeholder becomes `Null` (honest absence, never a fake
10//! `0`); `status`/`bytes` are typed numeric; the request line is split into
11//! `method`/`path`/`protocol`. Detected by its unmistakable
12//! `[time] "request" <status> <bytes>` shape — it claims only the explicit
13//! `.accesslog` extension (real access logs are generically named `*.log`).
14
15use crate::infer;
16use crate::parser::{Confidence, FormatParser, STRONG};
17use crate::table::TableBuilder;
18use ax_core::{AxError, Column, Value};
19use std::collections::BTreeMap;
20
/// Stateless parser for NCSA Common / Combined Log Format access logs.
///
/// The type carries no data; all behaviour lives in its inherent helpers and
/// its `FormatParser` implementation.
#[derive(Clone, Debug, Default)]
pub struct AccessLogParser;
23
/// Splits one access-log line into positional fields.
///
/// Fields are separated by one or more ASCII spaces, with two grouping forms
/// that may contain spaces and still count as a single field:
///
/// * `[...]` — everything up to the next `]` (delimiters stripped);
/// * `"..."` — everything up to the next unescaped `"` (delimiters stripped).
///
/// Inside a quoted field only `\"` and `\\` are unescaped (to `"` and `\`).
/// Any other backslash sequence — e.g. the `\x22` or `\t` forms that servers
/// emit for special bytes — is kept verbatim, backslash included, so the
/// field's content is never silently altered. A lone trailing backslash is
/// likewise preserved.
///
/// An unterminated `[` or `"` group extends to the end of the line.
fn tokenize(line: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    let mut chars = line.chars().peekable();
    loop {
        // Consume the run of separator spaces before the next field.
        while chars.next_if_eq(&' ').is_some() {}

        let Some(&first) = chars.peek() else {
            break;
        };

        let token: String = match first {
            '[' => {
                chars.next(); // opening bracket
                // `take_while` also consumes the closing `]`.
                chars.by_ref().take_while(|&c| c != ']').collect()
            }
            '"' => {
                chars.next(); // opening quote
                let mut field = String::new();
                while let Some(c) = chars.next() {
                    match c {
                        '"' => break,
                        '\\' => match chars.next() {
                            // The only two escapes we decode.
                            Some(escaped @ ('"' | '\\')) => field.push(escaped),
                            // Unknown escape: keep it exactly as written.
                            Some(other) => {
                                field.push('\\');
                                field.push(other);
                            }
                            // Backslash was the last character of the line.
                            None => field.push('\\'),
                        },
                        _ => field.push(c),
                    }
                }
                field
            }
            _ => {
                let mut field = String::new();
                while let Some(c) = chars.next_if(|&c| c != ' ') {
                    field.push(c);
                }
                field
            }
        };
        tokens.push(token);
    }
    tokens
}
77
78/// A `-` placeholder is honest absence; otherwise the raw string.
79fn text_field(s: &str) -> Value {
80    if s == "-" {
81        Value::Null
82    } else {
83        Value::Str(s.to_string())
84    }
85}
86
87/// A `-` placeholder is `Null`; otherwise type-inferred (so `status`/`bytes` are
88/// numeric).
89fn num_field(s: &str) -> Value {
90    if s == "-" {
91        Value::Null
92    } else {
93        infer::infer_scalar(s)
94    }
95}
96
97impl AccessLogParser {
98    fn err(&self, msg: impl std::fmt::Display) -> AxError {
99        AxError::Parse {
100            format: self.id().to_string(),
101            message: msg.to_string(),
102        }
103    }
104
105    /// Maps positional tokens to a named row. `tokens` is guaranteed `len >= 7`.
106    fn row(tokens: &[String]) -> BTreeMap<String, Value> {
107        let mut row = BTreeMap::new();
108        row.insert("host".into(), text_field(&tokens[0]));
109        row.insert("ident".into(), text_field(&tokens[1]));
110        row.insert("user".into(), text_field(&tokens[2]));
111        row.insert("time".into(), text_field(&tokens[3]));
112
113        // Request line: "METHOD PATH PROTOCOL".
114        let mut req = tokens[4].splitn(3, ' ');
115        row.insert("method".into(), text_field(req.next().unwrap_or("-")));
116        row.insert("path".into(), text_field(req.next().unwrap_or("-")));
117        row.insert("protocol".into(), text_field(req.next().unwrap_or("-")));
118
119        row.insert("status".into(), num_field(&tokens[5]));
120        row.insert("bytes".into(), num_field(&tokens[6]));
121
122        // Combined format adds referer and user-agent.
123        if let Some(referer) = tokens.get(7) {
124            row.insert("referer".into(), text_field(referer));
125        }
126        if let Some(ua) = tokens.get(8) {
127            row.insert("user_agent".into(), text_field(ua));
128        }
129        row
130    }
131}
132
133impl FormatParser for AccessLogParser {
134    fn id(&self) -> &'static str {
135        "accesslog"
136    }
137    fn extensions(&self) -> &'static [&'static str] {
138        &["accesslog"]
139    }
140    fn sniff(&self, bytes: &[u8]) -> Option<Confidence> {
141        let text = std::str::from_utf8(bytes).ok()?;
142        let line = text.lines().find(|l| !l.trim().is_empty())?;
143        // The bracketed time and quoted request are the signature; without them
144        // a 7-token line is just whitespace-separated text, not an access log.
145        if !line.contains('[') || !line.contains('"') {
146            return None;
147        }
148        let tokens = tokenize(line);
149        if tokens.len() < 7 {
150            return None;
151        }
152        // A valid HTTP status sits at a fixed position only when the time and
153        // request tokenized as single fields — so this also validates the shape.
154        let status_ok = tokens[5]
155            .parse::<u16>()
156            .is_ok_and(|s| (100..=599).contains(&s));
157        let bytes_ok = tokens[6] == "-" || tokens[6].parse::<u64>().is_ok();
158        (status_ok && bytes_ok).then_some(STRONG)
159    }
160    fn parse(&self, _source: &str, bytes: &[u8]) -> Result<Vec<Column>, AxError> {
161        let text = std::str::from_utf8(bytes).map_err(|e| self.err(e))?;
162        let mut builder = TableBuilder::new();
163        for line in text.lines() {
164            if line.trim().is_empty() {
165                continue;
166            }
167            let tokens = tokenize(line);
168            if tokens.len() < 7 {
169                return Err(self.err(format!(
170                    "malformed access-log line: expected >= 7 fields, got {}",
171                    tokens.len()
172                )));
173            }
174            builder.push_row(Self::row(&tokens));
175        }
176        Ok(builder.finish())
177    }
178}
179
#[cfg(test)]
mod tests {
    use super::*;
    use ax_core::ColType;

    /// One Combined Log Format line (Common fields plus referer and user-agent).
    const COMBINED: &str = concat!(
        "127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] ",
        "\"GET /apache_pb.gif HTTP/1.0\" 200 2326 ",
        "\"http://example.com/start.html\" \"Mozilla/4.08 [en] (Win98)\"\n"
    );

    /// One Common Log Format line whose ident, user and bytes are all `-`.
    const COMMON: &str =
        "192.168.0.1 - - [10/Oct/2000:13:55:40 -0700] \"POST /login HTTP/1.1\" 302 -\n";

    /// Parses `s` as an access log, panicking on any error.
    fn parse(s: &str) -> Vec<Column> {
        let outcome = AccessLogParser.parse("-", s.as_bytes());
        outcome.unwrap()
    }

    /// Finds a column by name, panicking when it does not exist.
    fn col<'a>(cols: &'a [Column], name: &str) -> &'a Column {
        cols.iter().find(|c| c.name == name).unwrap()
    }

    /// The `Value` expected for a text cell holding `s`.
    fn text(s: &str) -> Value {
        Value::Str(s.into())
    }

    #[test]
    fn parses_combined_fields() {
        let cols = parse(COMBINED);

        let expected_text = [
            ("host", "127.0.0.1"),
            ("user", "frank"),
            ("time", "10/Oct/2000:13:55:36 -0700"),
            ("method", "GET"),
            ("path", "/apache_pb.gif"),
            ("protocol", "HTTP/1.0"),
            ("referer", "http://example.com/start.html"),
            // The user-agent keeps its embedded brackets — they were inside quotes.
            ("user_agent", "Mozilla/4.08 [en] (Win98)"),
        ];
        for &(name, want) in &expected_text {
            assert_eq!(col(&cols, name).cells[0], text(want));
        }

        let status = col(&cols, "status");
        assert_eq!(status.ty, ColType::Int);
        assert_eq!(status.cells[0], Value::Int(200));
        assert_eq!(col(&cols, "bytes").cells[0], Value::Int(2326));
    }

    #[test]
    fn dash_placeholders_are_null() {
        let cols = parse(COMMON);
        // `bytes` is `-` on this line too.
        for &name in &["ident", "user", "bytes"] {
            assert_eq!(col(&cols, name).cells[0], Value::Null);
        }
        assert_eq!(col(&cols, "status").cells[0], Value::Int(302));
    }

    #[test]
    fn common_format_has_no_referer_or_ua_column() {
        let cols = parse(COMMON);
        for &absent in &["referer", "user_agent"] {
            assert!(!cols.iter().any(|c| c.name == absent));
        }
    }

    #[test]
    fn mixed_common_and_combined_pads_with_null() {
        // Combined first, then common: referer/ua exist but are null on row 1.
        let input = [COMBINED, COMMON].concat();
        let cols = parse(&input);
        let referer = col(&cols, "referer");
        assert_eq!(referer.cells.len(), 2);
        assert_eq!(referer.cells[1], Value::Null);
    }

    #[test]
    fn malformed_line_errors() {
        let outcome = AccessLogParser.parse("-", b"this is not an access log\n");
        assert!(matches!(outcome, Err(AxError::Parse { .. })));
    }

    #[test]
    fn tokenize_groups_brackets_and_quotes() {
        let tokens = tokenize(r#"a [x y] "q \"r\" s" b"#);
        assert_eq!(tokens, vec!["a", "x y", r#"q "r" s"#, "b"]);
    }

    #[test]
    fn sniff_recognizes_access_logs() {
        for &accepted in &[COMBINED, COMMON] {
            assert_eq!(AccessLogParser.sniff(accepted.as_bytes()), Some(STRONG));
        }

        // No bracket/quote signature → not an access log even with 7 tokens.
        assert_eq!(AccessLogParser.sniff("a b c d e 200 1024".as_bytes()), None);

        // BOTH a bracket and a quote are required: one alone is not the
        // signature, even when status/bytes are otherwise valid.
        let half_signatures = [
            (
                "1.1.1.1 - - [t i] GET 200 10",
                "bracket present but no quote",
            ),
            (
                "1.1.1.1 - - t \"GET / HTTP/1.1\" 200 10",
                "quote present but no bracket",
            ),
        ];
        for &(input, why) in &half_signatures {
            assert_eq!(AccessLogParser.sniff(input.as_bytes()), None, "{}", why);
        }

        // CSV, then logfmt-ish.
        for &other_format in &["a,b,c\n1,2,3", "k=1 v=2"] {
            assert_eq!(AccessLogParser.sniff(other_format.as_bytes()), None);
        }
    }

    #[test]
    fn claims_the_accesslog_extension() {
        let claimed = AccessLogParser.extensions();
        assert_eq!(claimed, &["accesslog"]);
    }

    #[test]
    fn sniff_rejects_out_of_range_status_and_bad_bytes() {
        let line = |tail: &str| format!("1.1.1.1 - - [t i] \"GET / HTTP/1.1\" {tail}\n");

        // status 99 (< 100) and 600 (> 599) are not HTTP statuses; `abc` is
        // not a byte count.
        for &tail in &["99 10", "600 10", "200 abc"] {
            assert_eq!(AccessLogParser.sniff(line(tail).as_bytes()), None);
        }

        // Boundaries 100 and 599 are valid.
        for &tail in &["100 10", "599 10"] {
            assert_eq!(AccessLogParser.sniff(line(tail).as_bytes()), Some(STRONG));
        }
    }

    #[test]
    fn resolves_by_extension_and_content() {
        let reg = crate::parser::ParserRegistry::default();
        let cases: [(&str, &[u8], &str); 3] = [
            ("x.accesslog", COMMON.as_bytes(), "accesslog"),
            // A `.log` file with access-log content routes by sniff.
            ("access.log", COMBINED.as_bytes(), "accesslog"),
            // A non-access `.log` is not hijacked.
            ("app.log", "a,b\n1,2".as_bytes(), "csv"),
        ];
        for &(file, content, want) in &cases {
            assert_eq!(reg.resolve(file, content).unwrap().id(), want);
        }
    }
}