anomalyx_normalize/parsers/
accesslog.rs1use crate::infer;
16use crate::parser::{Confidence, FormatParser, STRONG};
17use crate::table::TableBuilder;
18use ax_core::{AxError, Column, Value};
19use std::collections::BTreeMap;
20
/// Stateless parser for web-server access logs.
///
/// The type carries no configuration; all behaviour lives in its
/// `FormatParser` implementation and the private helpers of this module.
#[derive(Clone, Debug, Default)]
pub struct AccessLogParser;
23
/// Splits one access-log line into its fields.
///
/// Fields are separated by runs of ASCII spaces. Two kinds of grouping keep
/// embedded spaces inside a single token:
///
/// * `[...]` — everything up to the next `]` (or the end of the line) is one
///   token, returned without the brackets.
/// * `"..."` — everything up to the next unescaped `"` (or the end of the
///   line) is one token, returned without the quotes. `\"` and `\\` are
///   unescaped to `"` and `\`. Every other backslash sequence (for example
///   the `\xhh`, `\t` and `\n` forms that servers write for non-printable
///   bytes) is kept verbatim, backslash included, so it cannot be mistaken
///   for the literal text that follows the backslash.
///
/// Unterminated groups are tolerated: they simply run to the end of the line.
/// An empty or all-space line yields an empty vector.
fn tokenize(line: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    let mut chars = line.chars().peekable();
    loop {
        // Skip the separator run in front of the next token.
        while chars.next_if_eq(&' ').is_some() {}
        let Some(&first) = chars.peek() else { break };

        let mut token = String::new();
        match first {
            '[' => {
                chars.next(); // opening bracket
                // `take_while` also consumes the closing bracket it stops on.
                token.extend(chars.by_ref().take_while(|&c| c != ']'));
            }
            '"' => {
                chars.next(); // opening quote
                while let Some(c) = chars.next() {
                    match c {
                        '"' => break,
                        '\\' => match chars.next() {
                            // Only these two escapes have an unambiguous
                            // single-character meaning.
                            Some(esc @ ('"' | '\\')) => token.push(esc),
                            // Anything else stays as written so no
                            // information is lost.
                            Some(other) => {
                                token.push('\\');
                                token.push(other);
                            }
                            // Backslash was the last character of the line.
                            None => token.push('\\'),
                        },
                        _ => token.push(c),
                    }
                }
            }
            _ => {
                // Bare token: runs until the next space or the end of the line.
                while let Some(c) = chars.next_if(|&c| c != ' ') {
                    token.push(c);
                }
            }
        }
        tokens.push(token);
    }
    tokens
}
77
78fn text_field(s: &str) -> Value {
80 if s == "-" {
81 Value::Null
82 } else {
83 Value::Str(s.to_string())
84 }
85}
86
87fn num_field(s: &str) -> Value {
90 if s == "-" {
91 Value::Null
92 } else {
93 infer::infer_scalar(s)
94 }
95}
96
97impl AccessLogParser {
98 fn err(&self, msg: impl std::fmt::Display) -> AxError {
99 AxError::Parse {
100 format: self.id().to_string(),
101 message: msg.to_string(),
102 }
103 }
104
105 fn row(tokens: &[String]) -> BTreeMap<String, Value> {
107 let mut row = BTreeMap::new();
108 row.insert("host".into(), text_field(&tokens[0]));
109 row.insert("ident".into(), text_field(&tokens[1]));
110 row.insert("user".into(), text_field(&tokens[2]));
111 row.insert("time".into(), text_field(&tokens[3]));
112
113 let mut req = tokens[4].splitn(3, ' ');
115 row.insert("method".into(), text_field(req.next().unwrap_or("-")));
116 row.insert("path".into(), text_field(req.next().unwrap_or("-")));
117 row.insert("protocol".into(), text_field(req.next().unwrap_or("-")));
118
119 row.insert("status".into(), num_field(&tokens[5]));
120 row.insert("bytes".into(), num_field(&tokens[6]));
121
122 if let Some(referer) = tokens.get(7) {
124 row.insert("referer".into(), text_field(referer));
125 }
126 if let Some(ua) = tokens.get(8) {
127 row.insert("user_agent".into(), text_field(ua));
128 }
129 row
130 }
131}
132
133impl FormatParser for AccessLogParser {
134 fn id(&self) -> &'static str {
135 "accesslog"
136 }
137 fn extensions(&self) -> &'static [&'static str] {
138 &["accesslog"]
139 }
140 fn sniff(&self, bytes: &[u8]) -> Option<Confidence> {
141 let text = std::str::from_utf8(bytes).ok()?;
142 let line = text.lines().find(|l| !l.trim().is_empty())?;
143 if !line.contains('[') || !line.contains('"') {
146 return None;
147 }
148 let tokens = tokenize(line);
149 if tokens.len() < 7 {
150 return None;
151 }
152 let status_ok = tokens[5]
155 .parse::<u16>()
156 .is_ok_and(|s| (100..=599).contains(&s));
157 let bytes_ok = tokens[6] == "-" || tokens[6].parse::<u64>().is_ok();
158 (status_ok && bytes_ok).then_some(STRONG)
159 }
160 fn parse(&self, _source: &str, bytes: &[u8]) -> Result<Vec<Column>, AxError> {
161 let text = std::str::from_utf8(bytes).map_err(|e| self.err(e))?;
162 let mut builder = TableBuilder::new();
163 for line in text.lines() {
164 if line.trim().is_empty() {
165 continue;
166 }
167 let tokens = tokenize(line);
168 if tokens.len() < 7 {
169 return Err(self.err(format!(
170 "malformed access-log line: expected >= 7 fields, got {}",
171 tokens.len()
172 )));
173 }
174 builder.push_row(Self::row(&tokens));
175 }
176 Ok(builder.finish())
177 }
178}
179
#[cfg(test)]
mod tests {
    use super::*;
    use ax_core::ColType;

    /// One line that carries the optional referer and user-agent fields.
    const COMBINED: &str = "127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] \
        \"GET /apache_pb.gif HTTP/1.0\" 200 2326 \
        \"http://example.com/start.html\" \"Mozilla/4.08 [en] (Win98)\"\n";

    /// One line that stops after the bytes field and uses `-` placeholders.
    const COMMON: &str =
        "192.168.0.1 - - [10/Oct/2000:13:55:40 -0700] \"POST /login HTTP/1.1\" 302 -\n";

    /// Parses `s` with the access-log parser, panicking on failure.
    fn columns_of(s: &str) -> Vec<Column> {
        AccessLogParser.parse("-", s.as_bytes()).unwrap()
    }

    /// Looks up the column called `name`, panicking when it is absent.
    fn named<'a>(cols: &'a [Column], name: &str) -> &'a Column {
        cols.iter().find(|c| c.name == name).unwrap()
    }

    #[test]
    fn parses_combined_fields() {
        let cols = columns_of(COMBINED);

        let leading_text = [
            ("host", "127.0.0.1"),
            ("user", "frank"),
            ("time", "10/Oct/2000:13:55:36 -0700"),
            ("method", "GET"),
            ("path", "/apache_pb.gif"),
            ("protocol", "HTTP/1.0"),
        ];
        for (name, want) in leading_text {
            assert_eq!(named(&cols, name).cells[0], Value::Str(want.into()));
        }

        let status = named(&cols, "status");
        assert_eq!(status.ty, ColType::Int);
        assert_eq!(status.cells[0], Value::Int(200));
        assert_eq!(named(&cols, "bytes").cells[0], Value::Int(2326));

        let trailing_text = [
            ("referer", "http://example.com/start.html"),
            ("user_agent", "Mozilla/4.08 [en] (Win98)"),
        ];
        for (name, want) in trailing_text {
            assert_eq!(named(&cols, name).cells[0], Value::Str(want.into()));
        }
    }

    #[test]
    fn dash_placeholders_are_null() {
        let cols = columns_of(COMMON);
        for name in ["ident", "user", "bytes"] {
            assert_eq!(named(&cols, name).cells[0], Value::Null);
        }
        assert_eq!(named(&cols, "status").cells[0], Value::Int(302));
    }

    #[test]
    fn common_format_has_no_referer_or_ua_column() {
        let cols = columns_of(COMMON);
        for absent in ["referer", "user_agent"] {
            assert!(!cols.iter().any(|c| c.name == absent));
        }
    }

    #[test]
    fn mixed_common_and_combined_pads_with_null() {
        let input = format!("{COMBINED}{COMMON}");
        let cols = columns_of(&input);
        let referer = named(&cols, "referer");
        assert_eq!(referer.cells.len(), 2);
        assert_eq!(referer.cells[1], Value::Null);
    }

    #[test]
    fn malformed_line_errors() {
        let outcome = AccessLogParser.parse("-", b"this is not an access log\n");
        assert!(matches!(outcome, Err(AxError::Parse { .. })));
    }

    #[test]
    fn tokenize_groups_brackets_and_quotes() {
        let pieces = tokenize("a [x y] \"q \\\"r\\\" s\" b");
        assert_eq!(pieces, vec!["a", "x y", "q \"r\" s", "b"]);
    }

    #[test]
    fn sniff_recognizes_access_logs() {
        for sample in [COMBINED, COMMON] {
            assert_eq!(AccessLogParser.sniff(sample.as_bytes()), Some(STRONG));
        }
        assert_eq!(AccessLogParser.sniff(b"a b c d e 200 1024"), None);
        assert_eq!(
            AccessLogParser.sniff(b"1.1.1.1 - - [t i] GET 200 10"),
            None,
            "bracket present but no quote"
        );
        assert_eq!(
            AccessLogParser.sniff(b"1.1.1.1 - - t \"GET / HTTP/1.1\" 200 10"),
            None,
            "quote present but no bracket"
        );
        for other_format in ["a,b,c\n1,2,3", "k=1 v=2"] {
            assert_eq!(AccessLogParser.sniff(other_format.as_bytes()), None);
        }
    }

    #[test]
    fn claims_the_accesslog_extension() {
        let claimed = AccessLogParser.extensions();
        assert_eq!(claimed, &["accesslog"]);
    }

    #[test]
    fn sniff_rejects_out_of_range_status_and_bad_bytes() {
        // Status below range, status above range, non-numeric bytes.
        let rejected = [
            "1.1.1.1 - - [t i] \"GET / HTTP/1.1\" 99 10\n",
            "1.1.1.1 - - [t i] \"GET / HTTP/1.1\" 600 10\n",
            "1.1.1.1 - - [t i] \"GET / HTTP/1.1\" 200 abc\n",
        ];
        for line in rejected {
            assert_eq!(AccessLogParser.sniff(line.as_bytes()), None);
        }

        // Both ends of the accepted status range.
        let accepted = [
            "1.1.1.1 - - [t i] \"GET / HTTP/1.1\" 100 10\n",
            "1.1.1.1 - - [t i] \"GET / HTTP/1.1\" 599 10\n",
        ];
        for line in accepted {
            assert_eq!(AccessLogParser.sniff(line.as_bytes()), Some(STRONG));
        }
    }

    #[test]
    fn resolves_by_extension_and_content() {
        let reg = crate::parser::ParserRegistry::default();

        // Claimed extension.
        assert_eq!(
            reg.resolve("x.accesslog", COMMON.as_bytes()).unwrap().id(),
            "accesslog"
        );
        // Unclaimed extension, access-log content.
        assert_eq!(
            reg.resolve("access.log", COMBINED.as_bytes()).unwrap().id(),
            "accesslog"
        );
        // Same extension, CSV content.
        assert_eq!(reg.resolve("app.log", b"a,b\n1,2").unwrap().id(), "csv");
    }
}