1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
use crate::{COMMENT_U8, TABS, TAB_U8};

/// Checks whether a raw log line looks like a data record.
///
/// A line is accepted when all of the following hold:
/// * it contains at least one byte,
/// * its first byte is not the comment marker (`COMMENT_U8`),
/// * it contains exactly `TABS` occurrences of the field separator (`TAB_U8`).
///
/// The checks run in that order and the first failing one decides the error.
///
/// # Errors
///
/// Returns a static message describing the failed check:
/// * `"Invalid log line (empty)"`
/// * `"Invalid log line (comment)"`
/// * `"Invalid log line (field count)"`
///
/// # Examples
///
/// ```rust
/// use cloudfront_logs::validate_line;
///
/// let okay_line = "2019-12-04\t21:02:31\tLAX1\t392\t192.0.2.100\tGET\td111111abcdef8.cloudfront.net\t/index.html\t200\t-\tMozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/78.0.3904.108%20Safari/537.36\t-\t-\tHit\tSOX4xwn4XV6Q4rgb7XiVGOHms_BGlTAC4KyHmureZmBNrjGdRLiNIQ==\td111111abcdef8.cloudfront.net\thttps\t23\t0.001\t-\tTLSv1.2\tECDHE-RSA-AES128-GCM-SHA256\tHit\tHTTP/2.0\t-\t-\t11040\t0.001\tHit\ttext/html\t78\t-\t-";
/// let broken_line = "2019-12-04\t21:02:31\tLAX1\t392\t192.0.2.100\tGET\td111111abcdef8.cloudfront.net\t/index.html\t200\t-\tMozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/78.0.3904.108%20Safari/537.36\t-\t-\tHit\tSOX4xwn4XV6Q4rgb7XiVGOHms_BGlTAC4KyHmureZmBNrjGdRLiNIQ==\td111111abcdef8.cloudfront.net\thttps\t23\t0.001\t-\tTLSv1.2\tECDHE-RSA-AES128-GCM-SHA256\tHit\tHTTP/2.0\t-\t-";
/// let empty_line = "";
/// let comment_line = "#Version: 1.0";
///
/// assert!(validate_line(okay_line).is_ok());
/// assert!(validate_line(broken_line).is_err());
/// assert!(validate_line(empty_line).is_err());
/// assert!(validate_line(comment_line).is_err());
/// ```
#[inline]
pub fn validate_line(line: &str) -> Result<(), &'static str> {
    let raw = line.as_bytes();
    match raw.first() {
        // Nothing to inspect at all.
        None => Err("Invalid log line (empty)"),
        // Header/comment lines are recognised by their very first byte.
        Some(&first) if first == COMMENT_U8 => Err("Invalid log line (comment)"),
        Some(_) => {
            // Counting separators is enough to know the number of fields
            // without actually slicing the line.
            let separators = memchr::memchr_iter(TAB_U8, raw).count();
            if separators == TABS {
                Ok(())
            } else {
                Err("Invalid log line (field count)")
            }
        }
    }
}

/// Returns an iterator over the fields of `line` that are delimited by the
/// `TAB_U8` byte.
///
/// This is a thin wrapper around [`MemchrTabSplitter::new`]; the returned
/// iterator borrows from `line` and yields `&str` slices of it.
#[inline]
pub(crate) fn split(line: &str) -> MemchrTabSplitter<'_> {
    MemchrTabSplitter::new(line)
}

/// Iterator state for splitting a string at every `TAB_U8` byte, using
/// `memchr` to locate the separators.
///
/// Fields are yielded as sub-slices of `data`; nothing is copied.
#[derive(Debug, Clone)]
pub(crate) struct MemchrTabSplitter<'a> {
    /// The complete input string that is being split.
    pub(crate) data: &'a str,
    /// Byte offset in `data` at which the next field starts
    /// (one past the most recently consumed separator, initially 0).
    pub(crate) prev: usize,
    /// Length of `data` in bytes, captured at construction time.
    pub(crate) end: usize,
    /// Underlying `memchr` iterator producing the byte offsets of the
    /// separators in `data`.
    pub(crate) iter: memchr::Memchr<'a>,
}

impl<'a> MemchrTabSplitter<'a> {
    /// Builds a splitter positioned at the beginning of `data`.
    ///
    /// The cursor (`prev`) starts at offset 0, `end` is set to the byte
    /// length of `data`, and the separator search is set up over the bytes
    /// of `data` for the `TAB_U8` byte.
    pub(crate) fn new(data: &'a str) -> Self {
        Self {
            data,
            prev: 0,
            end: data.len(),
            iter: memchr::memchr_iter(TAB_U8, data.as_bytes()),
        }
    }
}

impl<'a> Iterator for MemchrTabSplitter<'a> {
    type Item = &'a str;

    /// Yields the next field of the input.
    ///
    /// While separators remain, the slice between the current cursor and the
    /// next separator is returned and the cursor moves just past that
    /// separator. Once the separators are exhausted, the remaining tail is
    /// returned once, provided it is non-empty; afterwards `None` is returned.
    /// As a consequence an empty input yields nothing, and an empty field
    /// after a trailing separator is not yielded.
    ///
    /// # Panics
    ///
    /// Panics if a separator is found at byte offset 0, i.e. the input
    /// starts with the separator.
    fn next(&mut self) -> Option<Self::Item> {
        let start = self.prev;
        match self.iter.next() {
            Some(tab_pos) => {
                assert!(tab_pos > 0, "Found tab stop at index 0 (invalid log line)");

                // Skip over the separator itself for the following field.
                self.prev = tab_pos + 1;
                Some(&self.data[start..tab_pos])
            }
            // No separator left: hand out whatever follows the last one.
            None if start < self.end => {
                // Mark the input as fully consumed so the tail is returned only once.
                self.prev = self.end;
                Some(&self.data[start..])
            }
            None => None,
        }
    }
}

/// Parses a field that may hold the placeholder `"-"`.
///
/// * `"-"` is treated as "no value" and gives `Ok(None)`.
/// * Any other input is parsed via [`std::str::FromStr`]; success gives
///   `Ok(Some(value))`, failure gives the parser's error.
///
/// Typical call sites:
/// * `parse_as_option(iter.next().unwrap()).map_err(|_e| "…")?`
/// * `parse_as_option(str_input).map_err(|_e| "…")?`
pub(crate) fn parse_as_option<T: std::str::FromStr>(s: &str) -> Result<Option<T>, T::Err> {
    match s {
        "-" => Ok(None),
        raw => raw.parse().map(Some),
    }
}

/// Chain-friendly variant of the optional-field parser.
///
/// * `"-"` is treated as "no value" and gives `None`.
/// * Any other input gives `Some(result)`, where `result` is the outcome of
///   parsing the input via [`std::str::FromStr`].
///
/// Designed to be combined with `Option::and_then` and `Option::transpose`:
/// `iter.next().and_then(as_optional_t).transpose().map_err(|_e| "…")?`
pub(crate) fn as_optional_t<T: std::str::FromStr>(s: &str) -> Option<Result<T, T::Err>> {
    match s {
        "-" => None,
        raw => Some(raw.parse()),
    }
}

/// Extension trait producing an owned, optional string from a field value.
///
/// The placeholder `"-"` maps to `None`; every other value maps to
/// `Some(String)` holding a copy of the value.
pub(crate) trait ToOptionalString {
    /// Returns `None` for `"-"`, otherwise an owned copy of the value.
    fn to_optional_string(&self) -> Option<String>;
}

impl ToOptionalString for &str {
    fn to_optional_string(&self) -> Option<String> {
        match *self {
            "-" => None,
            text => Some(text.to_string()),
        }
    }
}

/// Extension trait producing a borrowed, optional string slice from a field
/// value.
///
/// The placeholder `"-"` maps to `None`; every other value maps to
/// `Some(&str)` borrowing the value itself. Only compiled when the
/// `parquet` feature is enabled.
#[cfg(feature = "parquet")]
pub(crate) trait AsOptionalStr {
    /// Returns `None` for `"-"`, otherwise the value itself.
    fn as_optional_str(&self) -> Option<&str>;
}

#[cfg(feature = "parquet")]
impl AsOptionalStr for str {
    fn as_optional_str(&self) -> Option<&str> {
        match self {
            "-" => None,
            text => Some(text),
        }
    }
}