vrl 0.32.0

Vector Remap Language
Documentation
use crate::compiler::prelude::*;
use chrono::{NaiveDateTime, Utc, offset::TimeZone};
use regex::Regex;
use std::collections::BTreeMap;
use std::sync::LazyLock;

fn parse_glog(bytes: Value) -> Resolved {
    let bytes = bytes.try_bytes()?;
    let message = String::from_utf8_lossy(&bytes);
    let mut log = ObjectMap::new();
    let captures = REGEX_GLOG
        .captures(&message)
        .ok_or("failed parsing glog message")?;
    if let Some(level) = captures.name("level").map(|capture| capture.as_str()) {
        let level = match level {
            "I" => Ok("info"),
            "W" => Ok("warning"),
            "E" => Ok("error"),
            "F" => Ok("fatal"),
            _ => Err(format!(r#"unrecognized log level "{level}""#)),
        }?;

        log.insert("level".into(), Value::Bytes(level.to_owned().into()));
    }
    if let Some(timestamp) = captures.name("timestamp").map(|capture| capture.as_str()) {
        match NaiveDateTime::parse_from_str(timestamp, "%Y%m%d %H:%M:%S%.f") {
            Ok(naive_dt) => {
                let utc_dt = Utc.from_utc_datetime(&naive_dt);
                log.insert("timestamp".into(), Value::Timestamp(utc_dt));
            }
            Err(e) => return Err(format!("failed parsing timestamp {timestamp}: {e}").into()),
        }
    }
    if let Some(id) = captures.name("id").map(|capture| capture.as_str()) {
        log.insert(
            "id".into(),
            Value::Integer(id.parse().map_err(|_| "failed parsing id")?),
        );
    }
    if let Some(file) = captures.name("file").map(|capture| capture.as_str()) {
        log.insert("file".into(), Value::Bytes(file.to_owned().into()));
    }
    if let Some(line) = captures.name("line").map(|capture| capture.as_str()) {
        log.insert(
            "line".into(),
            Value::Integer(line.parse().map_err(|_| "failed parsing line")?),
        );
    }
    if let Some(message) = captures.name("message").map(|capture| capture.as_str()) {
        log.insert("message".into(), Value::Bytes(message.to_owned().into()));
    }
    Ok(log.into())
}

static REGEX_GLOG: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"(?x)                                                     # Ignore whitespace and comments in the regex expression.
        ^\s*                                                        # Start with any number of whitespaces.
        (?P<level>\w)                                               # Match one word character (expecting `I`,`W`,`E` or `F`).
        (?P<timestamp>\d{4}\d{2}\d{2}\s\d{2}:\d{2}:\d{2}\.\d{6})    # Match YYYYMMDD hh:mm:ss.ffffff.
        \s+                                                         # Match one or more whitespace.
        (?P<id>\d+)                                                 # Match at least one digit.
        \s                                                          # Match one whitespace.
        (?P<file>.+):(?P<line>\d+)                                  # Match any character (greedily), ended by `:` and at least one digit.
        \]\s                                                        # Match `]` and one whitespace.
        (?P<message>.*?)                                            # Match any characters (non-greedily).
        \s*$                                                        # Match any number of whitespaces to be stripped from the end.
    ")
                                                 .expect("failed compiling regex for glog")
});

#[derive(Clone, Copy, Debug)]
pub struct ParseGlog;

impl Function for ParseGlog {
    fn identifier(&self) -> &'static str {
        "parse_glog"
    }

    fn usage(&self) -> &'static str {
        "Parses the `value` using the [glog (Google Logging Library)](https://github.com/google/glog) format."
    }

    fn category(&self) -> &'static str {
        Category::Parse.as_ref()
    }

    fn internal_failure_reasons(&self) -> &'static [&'static str] {
        &["`value` does not match the `glog` format."]
    }

    fn return_kind(&self) -> u16 {
        kind::OBJECT
    }

    fn examples(&self) -> &'static [Example] {
        &[example! {
            title: "Parse using glog",
            source: r#"parse_glog!("I20210131 14:48:54.411655 15520 main.c++:9] Hello world!")"#,
            result: Ok(indoc! { r#"{
                "file": "main.c++",
                "id": 15520,
                "level": "info",
                "line": 9,
                "message": "Hello world!",
                "timestamp": "2021-01-31T14:48:54.411655Z"
            }"#}),
        }]
    }

    fn compile(
        &self,
        _state: &state::TypeState,
        _ctx: &mut FunctionCompileContext,
        arguments: ArgumentList,
    ) -> Compiled {
        let value = arguments.required("value");

        Ok(ParseGlogFn { value }.as_expr())
    }

    fn parameters(&self) -> &'static [Parameter] {
        const PARAMETERS: &[Parameter] = &[Parameter::required(
            "value",
            kind::BYTES,
            "The string to parse.",
        )];
        PARAMETERS
    }
}

#[derive(Debug, Clone)]
struct ParseGlogFn {
    value: Box<dyn Expression>,
}

impl FunctionExpression for ParseGlogFn {
    fn resolve(&self, ctx: &mut Context) -> Resolved {
        let bytes = self.value.resolve(ctx)?;
        parse_glog(bytes)
    }

    fn type_def(&self, _: &state::TypeState) -> TypeDef {
        TypeDef::object(inner_kind()).fallible()
    }
}

fn inner_kind() -> BTreeMap<Field, Kind> {
    BTreeMap::from([
        ("level".into(), Kind::bytes()),
        ("timestamp".into(), Kind::timestamp()),
        ("id".into(), Kind::integer()),
        ("file".into(), Kind::bytes()),
        ("line".into(), Kind::integer()),
        ("message".into(), Kind::bytes()),
    ])
}

#[cfg(test)]
mod tests {
    use crate::btreemap;
    use chrono::DateTime;

    use super::*;

    test_function![
        parse_glog => ParseGlog;

        log_line_valid {
            args: func_args![value: "I20210131 14:48:54.411655 15520 main.c++:9] Hello world!"],
            want: Ok(btreemap! {
                "level" => "info",
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-01-31T14:48:54.411655Z").unwrap().into()),
                "id" => 15520,
                "file" => "main.c++",
                "line" => 9,
                "message" => "Hello world!",
            }),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        log_line_valid_strip_whitespace {
            args: func_args![value: "\n  I20210131 14:48:54.411655 15520 main.c++:9] Hello world!  \n"],
            want: Ok(btreemap! {
                "level" => "info",
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-01-31T14:48:54.411655Z").unwrap().into()),
                "id" => 15520,
                "file" => "main.c++",
                "line" => 9,
                "message" => "Hello world!",
            }),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        log_line_padded_threadid {
            args: func_args![value: "I20210131 14:48:54.411655    20 main.c++:9] Hello world!"],
            want: Ok(btreemap! {
                "level" => "info",
                "timestamp" => Value::Timestamp(DateTime::parse_from_rfc3339("2021-01-31T14:48:54.411655Z").unwrap().into()),
                "id" => 20,
                "file" => "main.c++",
                "line" => 9,
                "message" => "Hello world!",
            }),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        log_line_invalid {
            args: func_args![value: "not a glog line"],
            want: Err("failed parsing glog message"),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        log_line_invalid_log_level {
            args: func_args![value: "X20210131 14:48:54.411655 15520 main.c++:9] Hello world!"],
            want: Err(r#"unrecognized log level "X""#),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        log_line_invalid_timestamp {
            args: func_args![value: "I20210000 14:48:54.411655 15520 main.c++:9] Hello world!"],
            want: Err("failed parsing timestamp 20210000 14:48:54.411655: input is out of range"),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }

        log_line_invalid_id {
            args: func_args![value: "I20210131 14:48:54.411655 99999999999999999999999999999 main.c++:9] Hello world!"],
            want: Err("failed parsing id"),
            tdef: TypeDef::object(inner_kind()).fallible(),
        }
    ];
}