grok_rs/
lib.rs

1//! grok-rs is a Rust implementation of the [Grok](https://www.elastic.co/guide/en/elasticsearch/reference/current/grok-processor.html)
2//! pattern matching library.
3//!
4//! The captured group can be renamed based on the alias, and the value can be converted to the specified type.
5//! The supported types are:
6//! - int
7//! - long
8//! - float
9//! - double
10//! - bool
11//! - boolean
12//!
13//! If the type is not specified, then the value will be kept as string.
14//!
15//! # Usage
16//!
17//! Initiate a Grok instance which includes the default patterns, or add custom patterns,
18//! then compile your whole pattern, and parse the input string based on the pattern.
19//!
20//! ```
21//! use std::collections::HashMap;
22//! use grok_rs::{Grok, Value};
23//!
24//! let mut grok = Grok::default();
25//! grok.add_pattern("NAME", r"[A-z0-9._-]+");
26//! let pattern = grok.compile("%{NAME}", false).unwrap();
27//! let expected = HashMap::from([("NAME".to_string(), Value::String("admin".into()))]);
28//!
29//! assert_eq!(expected, pattern.parse("admin").unwrap());
30//! assert_eq!(expected, pattern.parse("admin user").unwrap());
31//! ```
32use std::{
33    collections::HashMap,
34    fs::File,
35    io::{BufRead, BufReader},
36};
37
38use regex::Regex;
39
// Upper bound on the number of expansion rounds `Grok::compile` performs
// (one round per `%{...}` reference it resolves) before it gives up with a
// "max recursion" error.
const MAX_RECURSION: i32 = 1024;

// Capture-group indices into `GROK_PATTERN`, in the order the groups open:
// 1 = everything between `%{` and `}`, 2 = the pattern name,
// 3 = the optional alias, 4 = the optional target type.
const NAME_INDEX: usize = 1;
const PATTERN_INDEX: usize = 2;
const ALIAS_INDEX: usize = 3;
const TYPE_INDEX: usize = 4;

// Regex that recognises a single grok reference of the form `%{PATTERN}`,
// `%{PATTERN:alias}` or `%{PATTERN:alias:type}`. It is written in verbose
// mode (`(?x)`), so the whitespace and line breaks below are not part of the
// expression. The type is restricted to int, long, float, double, bool and
// boolean.
const GROK_PATTERN: &str = r"(?x)
%\{
    (?<name>
        (?<pattern>[[:word:]]+)
        (?:
            :(?<alias>[[[:word:]]@.-]+)
            (?:
                :(?<type>int|long|float|double|bool(?:ean)?)
            )?
        )?
    )
\}";
59
60fn load_patterns() -> HashMap<String, String> {
61    let mut patterns = HashMap::new();
62
63    for line in glob::glob("src/patterns/*")
64        .unwrap()
65        .map(|e| File::open(e.unwrap()).unwrap())
66        .flat_map(|f| BufReader::new(f).lines())
67        .map(|line| line.unwrap())
68        .filter(|line| !line.starts_with('#') && !line.is_empty())
69    {
70        let (key, value) = line.split_at(line.find(' ').unwrap());
71        patterns.insert(key.to_string(), value.trim().to_string());
72    }
73
74    patterns.insert("BOOL".into(), "true|false".into());
75
76    patterns
77}
78
lazy_static::lazy_static! {
    // Compiled form of `GROK_PATTERN`, used to locate `%{...}` references.
    static ref GROK_REGEX: Regex = Regex::new(GROK_PATTERN).unwrap();
    // Built-in pattern definitions produced by `load_patterns`; consulted
    // when a name is not found among a `Grok`'s own patterns.
    static ref DEFAULT_PATTERNS: HashMap<String, String> = load_patterns();
}
83
/// A captured value produced by [`Pattern::parse`], after the optional type
/// conversion requested in the grok reference has been applied.
#[derive(Debug, Clone, PartialEq)]
pub enum Value {
    /// Produced for captures typed `int` or `long`.
    Int(i64),
    /// Produced for captures typed `float` or `double`.
    Float(f64),
    /// Produced for captures typed `bool` or `boolean`.
    Bool(bool),
    /// Produced for captures without a type; holds the matched text as-is.
    String(String),
}
91
92type AliasType = (String, Option<String>);
93
94#[derive(Debug)]
95pub struct Pattern {
96    regex: Regex,
97    alias: HashMap<String, AliasType>,
98}
99
100impl Pattern {
101    fn new(regex: Regex, alias: HashMap<String, AliasType>) -> Self {
102        Self { regex, alias }
103    }
104
105    /// parse the input string based on the pattern, and rename the captured group based on alias.
106    ///  - if type is specified, then the value will be converted to the specified type.
107    ///  - if the type is not supported, then the value will be kept as string.
108    ///  - if the value can't be converted to the specified type, then an error will be returned.
109    ///  - if the value can't be captured, then an empty map will be returned.
110    ///
111    /// # Example
112    /// ```
113    /// use std::collections::HashMap;
114    /// use grok_rs::{Grok, Value};
115    ///
116    /// let grok = Grok::default();
117    /// let pattern = grok.compile("%{USERNAME}", false).unwrap();
118    /// let result = pattern.parse("admin admin@example.com").unwrap();
119    /// let expected = HashMap::from([("USERNAME".to_string(), Value::String("admin".into()))]);
120    /// assert_eq!(expected, result);
121    /// ```
122    pub fn parse(&self, s: &str) -> Result<HashMap<String, Value>, String> {
123        let mut map = HashMap::new();
124        let names = self.regex.capture_names().flatten().collect::<Vec<_>>();
125
126        let caps = match self.regex.captures(s) {
127            Some(caps) => caps,
128            None => return Ok(map),
129        };
130
131        for name in names {
132            if let Some(m) = caps.name(name) {
133                let value = m.as_str().to_string();
134                match self.alias.get(name) {
135                    Some((alias, type_)) => {
136                        let value = match type_ {
137                            Some(type_) if type_.eq("int") || type_.eq("long") => Value::Int(
138                                value.parse::<i64>().map_err(|e| format!("{e}: {value}"))?,
139                            ),
140                            Some(type_) if type_.eq("float") || type_.eq("double") => Value::Float(
141                                value.parse::<f64>().map_err(|e| format!("{e}: {value}"))?,
142                            ),
143                            Some(type_) if type_.eq("bool") || type_.eq("boolean") => Value::Bool(
144                                value.parse::<bool>().map_err(|e| format!("{e}: {value}"))?,
145                            ),
146                            _ => Value::String(value),
147                        };
148                        map.insert(alias.clone(), value);
149                    }
150                    None => {
151                        map.insert(name.to_string(), Value::String(value));
152                    }
153                }
154            }
155        }
156
157        Ok(map)
158    }
159}
160
161#[derive(Default, Debug)]
162pub struct Grok {
163    patterns: HashMap<String, String>,
164}
165
166impl Grok {
167    /// add a custom pattern, if the pattern is already defined, then it will be overwritten.
168    /// # Example
169    /// ```
170    /// use grok_rs::Grok;
171    ///
172    /// let mut grok = Grok::default();
173    /// grok.add_pattern("NAME", r"[A-z0-9._-]+");
174    /// ```
175    pub fn add_pattern<T: Into<String>>(&mut self, name: T, pattern: T) {
176        self.patterns.insert(name.into(), pattern.into());
177    }
178
179    /// Compile the pattern, and return a Pattern.
180    /// - if `named_capture_only` is true, then the unnamed capture group will be ignored.
181    /// - if the pattern is invalid or not found , then an error will be returned.
182    ///
183    /// Due to the compile process is heavy, it's recommended compile the pattern once and reuse it.
184    ///
185    /// # Example
186    ///
187    /// the USERNAME will be ignored because `named_capture_only` is true.
188    ///
189    /// ```
190    /// use grok_rs::Grok;
191    /// let grok = Grok::default();
192    /// let pattern = grok.compile("%{USERNAME} %{EMAILADDRESS:email}", true).unwrap();
193    /// ```
194    pub fn compile(&self, s: &str, named_capture_only: bool) -> Result<Pattern, String> {
195        let mut alias_map = HashMap::new();
196        let mut haystack = s.to_string();
197        let mut index = 0;
198        let mut iter_left = MAX_RECURSION;
199
200        while let Some(caps) = GROK_REGEX.captures(haystack.clone().as_str()) {
201            if iter_left <= 0 {
202                return Err(format!("max recursion {MAX_RECURSION} reached"));
203            }
204            iter_left -= 1;
205
206            let name = caps.get(NAME_INDEX).ok_or("name not found")?.as_str();
207            let pattern = caps.get(PATTERN_INDEX).ok_or("pattern not found")?.as_str();
208
209            let pattern_regex = self
210                .patterns
211                .get(pattern)
212                .or(DEFAULT_PATTERNS.get(pattern))
213                .ok_or(format!("pattern: {pattern}  not found"))?;
214
215            let to_replace = format!("%{{{name}}}");
216
217            while haystack.matches(&to_replace).count() > 0 {
218                let replacement = match caps.get(ALIAS_INDEX) {
219                    None if named_capture_only => {
220                        format!("(?:{pattern_regex})")
221                    }
222                    _ => {
223                        let new_name = format!("name{index}");
224                        let origin_alias =
225                            caps.get(ALIAS_INDEX).map(|m| m.as_str()).unwrap_or(pattern);
226                        let type_ = caps.get(TYPE_INDEX).map(|m| m.as_str().to_string());
227                        alias_map.insert(new_name.clone(), (origin_alias.to_string(), type_));
228                        format!("(?<{new_name}>{pattern_regex})")
229                    }
230                };
231
232                haystack = haystack.replacen(&to_replace, &replacement, 1);
233                index += 1;
234            }
235        }
236
237        let re = Regex::new(haystack.as_str()).map_err(|e| e.to_string())?;
238        Ok(Pattern::new(re, alias_map))
239    }
240}
241
242impl<T: Into<String>> FromIterator<(T, T)> for Grok {
243    fn from_iter<I: IntoIterator<Item = (T, T)>>(iter: I) -> Self {
244        let mut grok = Grok::default();
245        for (k, v) in iter {
246            grok.add_pattern(k, v);
247        }
248        grok
249    }
250}
251
252impl<S: Into<String>, const N: usize> From<[(S, S); N]> for Grok {
253    fn from(arr: [(S, S); N]) -> Self {
254        Self::from_iter(arr)
255    }
256}
257
258#[cfg(test)]
259mod tests {
260    use super::*;
261
262    struct Case<'a> {
263        patterns: Vec<(&'a str, &'a str)>,
264        pattern: &'a str,
265        input: &'a str,
266        expected: HashMap<String, Value>,
267        named_capture_only: bool,
268    }
269
270    fn assert(c: Case<'_>) {
271        let grok = Grok::from_iter(c.patterns);
272        let pattern = grok.compile(c.pattern, c.named_capture_only).unwrap();
273        assert_eq!(c.expected, pattern.parse(c.input).unwrap());
274    }
275
276    fn asserts(cases: Vec<Case<'_>>) {
277        for c in cases {
278            assert(c);
279        }
280    }
281
282    #[test]
283    fn test_simple_add_pattern() {
284        let mut grok = Grok::default();
285        grok.add_pattern("NAME", r"[A-z0-9._-]+");
286        let pattern = grok.compile("%{NAME}", false).unwrap();
287        let expected: HashMap<String, Value> = [("NAME", "admin")]
288            .into_iter()
289            .map(|(k, v)| (k.to_string(), Value::String(v.to_string())))
290            .collect();
291
292        assert_eq!(expected, pattern.parse("admin").unwrap());
293        assert_eq!(expected, pattern.parse("admin user").unwrap());
294    }
295
296    #[test]
297    fn test_named_capture_only() {
298        let grok = Grok::default();
299        let pattern = grok
300            // USERNAME and EMAILADDRESS are defined in grok-patterns
301            .compile("%{USERNAME} %{EMAILADDRESS:email}", true)
302            .unwrap();
303
304        let expected = [("email", "admin@example.com")]
305            .into_iter()
306            .map(|(k, v)| (k.to_string(), Value::String(v.to_string())))
307            .collect::<HashMap<String, Value>>();
308
309        assert_eq!(expected, pattern.parse("admin admin@example.com").unwrap());
310    }
311
312    #[test]
313    fn test_from() {
314        let expected = [("NAME", "admin")]
315            .into_iter()
316            .map(|(k, v)| (k.to_string(), Value::String(v.to_string())))
317            .collect::<HashMap<String, Value>>();
318
319        {
320            let grok = Grok::from_iter([("NAME", r"[A-z0-9._-]+")]);
321            let pattern = grok.compile("%{NAME}", false).unwrap();
322            assert_eq!(expected, pattern.parse("admin").unwrap());
323        }
324        {
325            let grok = Grok::from([("NAME", r"[A-z0-9._-]+")]);
326            let pattern = grok.compile("%{NAME}", false).unwrap();
327            assert_eq!(expected, pattern.parse("admin").unwrap());
328        }
329    }
330
331    #[test]
332    fn test_pattern_parse_no_captures() {
333        let grok = Grok::default();
334        let pattern = grok.compile("%{USERNAME}", false).unwrap();
335
336        assert!(pattern.parse("$#@").unwrap().is_empty());
337        assert!(pattern.parse("").unwrap().is_empty());
338        assert!(pattern.parse("โœ…๐Ÿš€๐ŸŒ").unwrap().is_empty());
339        assert!(pattern.parse("     ").unwrap().is_empty());
340    }
341
342    #[test]
343    fn test_composite_or_pattern() {
344        let mut grok = Grok::default();
345        grok.add_pattern("MAC", r"(?:%{CISCOMAC}|%{WINDOWSMAC}|%{COMMONMAC})");
346        grok.add_pattern("CISCOMAC", r"(?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4})");
347        grok.add_pattern("WINDOWSMAC", r"(?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})");
348        grok.add_pattern("COMMONMAC", r"(?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})");
349
350        let pattern = grok.compile("%{MAC}", false).unwrap();
351        let expected = [
352            ("MAC", "5E:FF:56:A2:AF:15"),
353            ("COMMONMAC", "5E:FF:56:A2:AF:15"),
354        ]
355        .into_iter()
356        .map(|(k, v)| (k.to_string(), Value::String(v.to_string())))
357        .collect::<HashMap<String, Value>>();
358
359        assert_eq!(expected, pattern.parse("5E:FF:56:A2:AF:15").unwrap());
360        assert_eq!(
361            expected,
362            pattern.parse("127.0.0.1 5E:FF:56:A2:AF:15").unwrap()
363        );
364    }
365
366    #[test]
367    fn test_multiple_patterns() {
368        let mut grok = Grok::default();
369        grok.add_pattern("YEAR", r"(\d\d){1,2}");
370        grok.add_pattern("MONTH", r"\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b");
371        grok.add_pattern("DAY", r"(?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?)");
372        let pattern = grok.compile("%{DAY} %{MONTH} %{YEAR}", false).unwrap();
373
374        let expected = [("DAY", "Monday"), ("MONTH", "March"), ("YEAR", "2012")]
375            .into_iter()
376            .map(|(k, v)| (k.to_string(), Value::String(v.to_string())))
377            .collect::<HashMap<String, Value>>();
378        assert_eq!(expected, pattern.parse("Monday March 2012").unwrap());
379    }
380
381    #[test]
382    fn test_adhoc_pattern() {
383        let grok = Grok::default();
384        let pattern = grok.compile(r"\[(?<threadname>[^\]]+)\]", false).unwrap();
385        let expected = [("threadname", "thread1")]
386            .into_iter()
387            .map(|(k, v)| (k.to_string(), Value::String(v.to_string())))
388            .collect::<HashMap<String, Value>>();
389        assert_eq!(expected, pattern.parse("[thread1]").unwrap());
390    }
391
392    #[test]
393    fn test_type() {
394        let mut grok = Grok::default();
395        grok.add_pattern("NUMBER", r"\d+");
396
397        // int
398        {
399            let pattern = grok.compile("%{NUMBER:digit:int}", false).unwrap();
400            let expected = [("digit", Value::Int(123))]
401                .into_iter()
402                .map(|(k, v)| (k.to_string(), v))
403                .collect::<HashMap<String, Value>>();
404            assert_eq!(expected, pattern.parse("hello 123").unwrap());
405        }
406
407        // float
408        {
409            let pattern = grok.compile("%{NUMBER:digit:float}", false).unwrap();
410            let expected = [("digit", Value::Float(123.0))]
411                .into_iter()
412                .map(|(k, v)| (k.to_string(), v))
413                .collect::<HashMap<String, Value>>();
414            assert_eq!(expected, pattern.parse("hello 123.0").unwrap());
415        }
416
417        // wrong type
418        {
419            let pattern = grok.compile("%{NUMBER:digit:wrong}", false);
420            assert!(pattern.is_err());
421        }
422
423        {
424            // wrong value
425            let pattern = grok.compile("%{USERNAME:digit:float}", false).unwrap();
426            assert_eq!(
427                Err("invalid float literal: grok".to_string()),
428                pattern.parse("grok")
429            );
430        }
431    }
432
433    #[test]
434    fn test_more_patterns() {
435        let cases: Vec<Case> = [(
436            vec![
437                (
438                    "NGINX_HOST",
439                    r#"(?:%{IP:destination.ip}|%{NGINX_NOTSEPARATOR:destination.domain})(:%{NUMBER:destination.port})?"#,
440                ),
441                ("IP", r#"(?:\[%{IPV6}\]|%{IPV6}|%{IPV4})"#),
442                ("NGINX_NOTSEPARATOR", r#"[^\t ,:]+"#),
443                ("NUMBER", r#"\d+"#),
444                (
445                    "IPV6",
446                    r#"((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?"#,
447                ),
448                (
449                    "IPV4",
450                    r#"\b(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])\.(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])\.(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])\.(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])\b"#,
451                ),
452            ],
453            "%{NGINX_HOST}",
454            "127.0.0.1:1234",
455            vec![
456                ("destination.ip", Value::String("127.0.0.1".to_string())),
457                ("destination.port", Value::String("1234".to_string())),
458            ],
459            true,
460        ),
461        (
462            vec![
463                (
464                    "NGINX_HOST",
465                    r#"(?:%{IP:destination.ip}|%{NGINX_NOTSEPARATOR:destination.domain})(:%{NUMBER:destination.port})?"#,
466                ),
467                ("IP", r#"(?:\[%{IPV6}\]|%{IPV6}|%{IPV4})"#),
468                ("NGINX_NOTSEPARATOR", r#"[^\t ,:]+"#),
469                ("NUMBER", r#"\d+"#),
470                (
471                    "IPV6",
472                    r#"((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?"#,
473                ),
474                (
475                    "IPV4",
476                    r#"\b(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])\.(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])\.(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])\.(?:[0-1]?[0-9]{1,2}|2[0-4][0-9]|25[0-5])\b"#,
477                ),
478            ],
479            "%{NGINX_HOST}",
480            "127.0.0.1:1234",
481            vec![
482                ("destination.ip", Value::String("127.0.0.1".to_string())),
483                ("destination.port", Value::String("1234".to_string())),
484                ("NGINX_HOST", Value::String("127.0.0.1:1234".to_string())),
485                ("IPV4", Value::String("127.0.0.1".to_string())),
486            ],
487            false,
488        )
489        ].into_iter().map(|(patterns, pattern, input, expected, named_capture_only)| Case {
490            patterns: patterns.into_iter().collect(),
491            pattern,
492            input,
493            expected: expected.into_iter().map(|(k, v)| (k.to_string(), v)).collect(),
494            named_capture_only,
495        }).collect();
496
497        asserts(cases);
498    }
499
500    #[test]
501    fn test_default_patterns() {
502        let cases: Vec<Case> = [
503            (
504                vec![
505                    ("NGINX_HOST",         r"(?:%{IP:destination.ip}|%{NGINX_NOTSEPARATOR:destination.domain})(:%{NUMBER:destination.port})?"),
506                    ("NGINX_NOTSEPARATOR", r#"[^\t ,:]+"#),
507                ],
508                "%{NGINX_HOST}",
509                "127.0.0.1:1234",
510                vec![
511                    ("destination.ip", Value::String("127.0.0.1".to_string())),
512                    ("destination.port", Value::String("1234".to_string())),
513                ],
514                true,
515            ),
516            (
517                vec![
518                    ("NGINX_HOST",         r"(?:%{IP:destination.ip}|%{NGINX_NOTSEPARATOR:destination.domain})(:%{NUMBER:destination.port})?"),
519                    ("NGINX_NOTSEPARATOR", r#"[^\t ,:]+"#),
520                ],
521                "%{NGINX_HOST}",
522                "127.0.0.1:1234",
523                vec![
524                    ("destination.ip", Value::String("127.0.0.1".to_string())),
525                    ("destination.port", Value::String("1234".to_string())),
526                    ("BASE10NUM", Value::String("1234".to_string())),
527                    ("NGINX_HOST", Value::String("127.0.0.1:1234".to_string())),
528                    ("IPV4", Value::String("127.0.0.1".to_string())),
529                ],
530                false,
531            ),
532        ]
533        .into_iter()
534        .map(
535            |(patterns, pattern, input, expected, named_capture_only)| Case {
536                patterns: patterns.into_iter().collect(),
537                pattern,
538                input,
539                expected: expected
540                    .into_iter()
541                    .map(|(k, v)| (k.to_string(), v))
542                    .collect(),
543                named_capture_only,
544            },
545        )
546        .collect();
547
548        asserts(cases);
549    }
550
551    #[test]
552    fn test_default_patterns_with_type() {
553        let cases: Vec<Case> = [
554            (
555                vec![
556                    ("NGINX_HOST",         r"(?:%{IP:destination.ip}|%{NGINX_NOTSEPARATOR:destination.domain})(:%{NUMBER:destination.port})?"),
557                    ("NGINX_NOTSEPARATOR", r#"[^\t ,:]+"#),
558                ],
559                "%{NGINX_HOST}",
560                "127.0.0.1:1234",
561                vec![
562                    ("destination.ip", Value::String("127.0.0.1".to_string())),
563                    ("destination.port", Value::String("1234".to_string())),
564                    ("BASE10NUM", Value::String("1234".to_string())),
565                    ("NGINX_HOST", Value::String("127.0.0.1:1234".to_string())),
566                    ("IPV4", Value::String("127.0.0.1".to_string())),
567                ],
568                false,
569            ),
570            (
571                vec![
572                    ("NGINX_HOST",         r#"(?:%{IP:destination.ip}|%{NGINX_NOTSEPARATOR:destination.domain})(:%{NUMBER:destination.port:int})?"#),
573                    ("NGINX_NOTSEPARATOR", r#"[^\t ,:]+"#),
574                    ("BOOL", r#"true|false"#),
575                ],
576                "%{NGINX_HOST} %{BOOL:destination.boolean:boolean}",
577                "127.0.0.1:1234 true",
578                vec![
579                    ("destination.ip", Value::String("127.0.0.1".to_string())),
580                    ("destination.port", Value::Int(1234)),
581                    ("destination.boolean", Value::Bool(true)),
582                ],
583                true,
584            ),
585        ]
586        .into_iter()
587        .map(
588            |(patterns, pattern, input, expected, named_capture_only)| Case {
589                patterns: patterns.into_iter().collect(),
590                pattern,
591                input,
592                expected: expected
593                    .into_iter()
594                    .map(|(k, v)| (k.to_string(), v))
595                    .collect(),
596                named_capture_only,
597            },
598        )
599        .collect();
600
601        asserts(cases);
602    }
603
604    #[test]
605    fn test_more_default_patterns() {
606        let cases = [
607            ("WORD", vec!["hello", "world123", "test_data"]),
608            ("NOTSPACE", vec!["example", "text-with-dashes", "12345"]),
609            ("SPACE", vec![" ", "\t", "  "]),
610            // types
611            ("INT", vec!["123", "-456", "+789"]),
612            ("NUMBER", vec!["123", "456.789", "-0.123"]),
613            ("BOOL", vec!["true", "false", "true"]),
614            ("BASE10NUM", vec!["123", "-123.456", "0.789"]),
615            ("BASE16NUM", vec!["1a2b", "0x1A2B", "-0x1a2b3c"]),
616            ("BASE16FLOAT", vec!["0x1.a2b3", "-0x1A2B3C.D", "0x123.abc"]),
617            ("POSINT", vec!["123", "456", "789"]),
618            ("NONNEGINT", vec!["0", "123", "456"]),
619            (
620                "GREEDYDATA",
621                vec!["anything goes", "literally anything", "123 #@!"],
622            ),
623            (
624                "QUOTEDSTRING",
625                vec!["\"This is a quote\"", "'single quoted'"],
626            ),
627            (
628                "UUID",
629                vec![
630                    "123e4567-e89b-12d3-a456-426614174000",
631                    "123e4567-e89b-12d3-a456-426614174001",
632                    "123e4567-e89b-12d3-a456-426614174002",
633                ],
634            ),
635            (
636                "URN",
637                vec![
638                    "urn:isbn:0451450523",
639                    "urn:ietf:rfc:2648",
640                    "urn:mpeg:mpeg7:schema:2001",
641                ],
642            ),
643            // network
644            (
645                "IP",
646                vec![
647                    "192.168.1.1",
648                    "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
649                    "172.16.254.1",
650                ],
651            ),
652            (
653                "IPV6",
654                vec![
655                    "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
656                    "::1",
657                    "fe80::1ff:fe23:4567:890a",
658                ],
659            ),
660            ("IPV4", vec!["192.168.1.1", "10.0.0.1", "172.16.254.1"]),
661            (
662                "IPORHOST",
663                vec!["example.com", "192.168.1.1", "fe80::1ff:fe23:4567:890a"],
664            ),
665            (
666                "HOSTNAME",
667                vec!["example.com", "sub.domain.co.uk", "localhost"],
668            ),
669            ("EMAILLOCALPART", vec!["john.doe", "alice123", "bob-smith"]),
670            (
671                "EMAILADDRESS",
672                vec![
673                    "john.doe@example.com",
674                    "alice123@domain.co.uk",
675                    "bob-smith@localhost",
676                ],
677            ),
678            ("USERNAME", vec!["user1", "john.doe", "alice_123"]),
679            ("USER", vec!["user1", "john.doe", "alice_123"]),
680            (
681                "MAC",
682                vec!["00:1A:2B:3C:4D:5E", "001A.2B3C.4D5E", "00-1A-2B-3C-4D-5E"],
683            ),
684            (
685                "CISCOMAC",
686                vec!["001A.2B3C.4D5E", "001B.2C3D.4E5F", "001C.2D3E.4F5A"],
687            ),
688            (
689                "WINDOWSMAC",
690                vec![
691                    "00-1A-2B-3C-4D-5E",
692                    "00-1B-2C-3D-4E-5F",
693                    "00-1C-2D-3E-4F-5A",
694                ],
695            ),
696            (
697                "COMMONMAC",
698                vec![
699                    "00:1A:2B:3C:4D:5E",
700                    "00:1B:2C:3D:4E:5F",
701                    "00:1C:2D:3E:4F:5A",
702                ],
703            ),
704            ("HOSTPORT", vec!["example.com:80", "192.168.1.1:8080"]),
705            // paths
706            (
707                "UNIXPATH",
708                vec!["/home/user", "/var/log/syslog", "/tmp/abc_123"],
709            ),
710            ("TTY", vec!["/dev/pts/1", "/dev/tty0", "/dev/ttyS0"]),
711            (
712                "WINPATH",
713                vec![
714                    "C:\\Program Files\\App",
715                    "D:\\Work\\project\\file.txt",
716                    "E:\\New Folder\\test",
717                ],
718            ),
719            ("URIPROTO", vec!["http", "https", "ftp"]),
720            ("URIHOST", vec!["example.com", "192.168.1.1:8080"]),
721            (
722                "URIPATH",
723                vec!["/path/to/resource", "/another/path", "/root"],
724            ),
725            (
726                "URIQUERY",
727                vec!["key=value", "name=John&Doe", "search=query&active=true"],
728            ),
729            (
730                "URIPARAM",
731                vec!["?key=value", "?name=John&Doe", "?search=query&active=true"],
732            ),
733            (
734                "URIPATHPARAM",
735                vec![
736                    "/path?query=1",
737                    "/resource?name=John",
738                    "/folder/path?valid=true",
739                ],
740            ),
741            (
742                "URI",
743                vec![
744                    "http://user:password@example.com:80/path?query=string",
745                    "https://example.com",
746                    "ftp://192.168.1.1/upload",
747                ],
748            ),
749            (
750                "PATH",
751                vec![
752                    "/home/user/documents",
753                    "C:\\Windows\\system32",
754                    "/var/log/syslog",
755                ],
756            ),
757            // dates
758            (
759                "MONTH",
760                vec![
761                    "January",
762                    "Feb",
763                    "March",
764                    "Apr",
765                    "May",
766                    "Jun",
767                    "Jul",
768                    "August",
769                    "September",
770                    "October",
771                    "Nov",
772                    "December",
773                ],
774            ),
775            // Months: January, Feb, 3, 03, 12, December "MONTH": `\b(?:[Jj]an(?:uary|uar)?|[Ff]eb(?:ruary|ruar)?|[Mm](?:a|รค)?r(?:ch|z)?|[Aa]pr(?:il)?|[Mm]a(?:y|i)?|[Jj]un(?:e|i)?|[Jj]ul(?:y|i)?|[Aa]ug(?:ust)?|[Ss]ep(?:tember)?|[Oo](?:c|k)?t(?:ober)?|[Nn]ov(?:ember)?|[Dd]e(?:c|z)(?:ember)?)\b`,
776            (
777                "MONTHNUM2",
778                vec![
779                    "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12",
780                ],
781            ),
782            // Days Monday, Tue, Thu, etc
783            (
784                "DAY",
785                vec![
786                    "Monday",
787                    "Tuesday",
788                    "Wednesday",
789                    "Thursday",
790                    "Friday",
791                    "Saturday",
792                    "Sunday",
793                ],
794            ),
795            // Years?
796            ("YEAR", vec!["1999", "2000", "2021"]),
797            ("HOUR", vec!["00", "12", "23"]),
798            ("MINUTE", vec!["00", "30", "59"]),
799            // '60' is a leap second in most time standards and thus is valid.
800            ("SECOND", vec!["00", "30", "60"]),
801            ("TIME", vec!["14:30", "23:59:59", "12:00:00", "12:00:60"]),
802            // datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it)
803            ("DATE_US", vec!["04/21/2022", "12-25-2020", "07/04/1999"]),
804            ("DATE_EU", vec!["21.04.2022", "25/12/2020", "04-07-1999"]),
805            ("ISO8601_TIMEZONE", vec!["Z", "+02:00", "-05:00"]),
806            ("ISO8601_SECOND", vec!["59", "30", "60.123"]),
807            (
808                "TIMESTAMP_ISO8601",
809                vec![
810                    "2022-04-21T14:30:00Z",
811                    "2020-12-25T23:59:59+02:00",
812                    "1999-07-04T12:00:00-05:00",
813                ],
814            ),
815            ("DATE", vec!["04/21/2022", "21.04.2022", "12-25-2020"]),
816            (
817                "DATESTAMP",
818                vec!["04/21/2022 14:30", "21.04.2022 23:59", "12-25-2020 12:00"],
819            ),
820            ("TZ", vec!["EST", "CET", "PDT"]),
821            ("DATESTAMP_RFC822", vec!["Wed Jan 12 2024 14:33 EST"]),
822            (
823                "DATESTAMP_RFC2822",
824                vec![
825                    "Tue, 12 Jan 2022 14:30 +0200",
826                    "Fri, 25 Dec 2020 23:59 -0500",
827                    "Sun, 04 Jul 1999 12:00 Z",
828                ],
829            ),
830            (
831                "DATESTAMP_OTHER",
832                vec![
833                    "Tue Jan 12 14:30 EST 2022",
834                    "Fri Dec 25 23:59 CET 2020",
835                    "Sun Jul 04 12:00 PDT 1999",
836                ],
837            ),
838            (
839                "DATESTAMP_EVENTLOG",
840                vec!["20220421143000", "20201225235959", "19990704120000"],
841            ),
842            // Syslog Dates: Month Day HH:MM:SS	"MONTH":         `\b(?:Jan(?:uary|uar)?|Feb(?:ruary|ruar)?|Mar(?:ch|z)?|Apr(?:il)?|May|i|Jun(?:e|i)?|Jul(?:y|i)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b`,
843            (
844                "SYSLOGTIMESTAMP",
845                vec!["Jan  1 00:00:00", "Mar 15 12:34:56", "Dec 31 23:59:59"],
846            ),
847            ("PROG", vec!["sshd", "kernel", "cron"]),
848            ("SYSLOGPROG", vec!["sshd[1234]", "kernel", "cron[5678]"]),
849            (
850                "SYSLOGHOST",
851                vec!["example.com", "192.168.1.1", "localhost"],
852            ),
853            ("SYSLOGFACILITY", vec!["<1.2>", "<12345.13456>"]),
854            ("HTTPDATE", vec!["25/Dec/2024:14:33 4"]),
855        ];
856
857        for (pattern, values) in cases {
858            let grok = Grok::default();
859            let p = grok
860                .compile(&format!("%{{{pattern}:result}}"), true)
861                .unwrap();
862
863            for value in values {
864                let m = p.parse(value).unwrap();
865                let result = m.get("result").unwrap();
866                assert_eq!(&Value::String(value.to_string()), result);
867            }
868        }
869    }
870
871    #[test]
872    fn test_elastic_docs() {
873        let cases = [(
874            "%{IP:client} %{WORD:method} %{URIPATHPARAM:request} %{NUMBER:bytes:int} %{NUMBER:duration:double}",
875            "55.3.244.1 GET /index.html 15824 0.043",
876            vec![
877                ("duration", Value::Float(0.043)),
878                ("request", Value::String("/index.html".to_string())),
879                ("method", Value::String("GET".to_string())),
880                ("bytes", Value::Int(15824)),
881                ("client", Value::String("55.3.244.1".to_string())),
882            ],
883        )];
884
885        for c in cases {
886            let grok = Grok::default();
887            let pattern = grok.compile(c.0, true).unwrap();
888            let expected =
889                c.2.into_iter()
890                    .map(|(k, v)| (k.to_string(), v))
891                    .collect::<HashMap<String, Value>>();
892            assert_eq!(expected, pattern.parse(c.1).unwrap());
893        }
894    }
895}