vrl/datadog/grok/
lexer.rs

1use std::{iter::Peekable, str::CharIndices};
2
3use ordered_float::NotNan;
4
5pub type Tok<'input> = Token<&'input str>;
6pub type SpannedResult<'input, Loc> = Result<Spanned<'input, Loc>, Error>;
7pub type Spanned<'input, Loc> = (Loc, Tok<'input>, Loc);
8
9#[derive(Clone, PartialEq, Eq, Debug)]
10pub enum Token<S> {
11    LRule,
12    RRule,
13    LBracket,
14    RBracket,
15    Colon,
16    LParen,
17    RParen,
18    Dot,
19    Comma,
20    Null,
21    True,
22    False,
23
24    Sign(S),
25
26    IntegerLiteral(i64),
27    FloatLiteral(NotNan<f64>),
28    StringLiteral(String),
29    Identifier(S),
30    ExtendedIdentifier(S),
31    Invalid(char),
32}
33
34#[derive(thiserror::Error, Clone, Debug, PartialEq, Eq)]
35pub enum Error {
36    #[error("invalid literal")]
37    Literal { start: usize },
38
39    #[error("invalid numeric literal '{}'", .0)]
40    NumericLiteral(String),
41
42    #[error("invalid escape literal '{}'", .0)]
43    InvalidEscape(String),
44}
45
/// Streaming lexer over a borrowed input string.
///
/// Iterating over it yields one spanned token (or error) at a time.
pub struct Lexer<'input> {
    /// The complete source text; token payloads are slices of it.
    input: &'input str,
    /// Cursor over `input` as `(byte offset, char)` pairs with one-item lookahead.
    chars: Peekable<CharIndices<'input>>,
}
50
51// -----------------------------------------------------------------------------
52// lexing iterator
53// -----------------------------------------------------------------------------
54
55impl<'input> Iterator for Lexer<'input> {
56    type Item = SpannedResult<'input, usize>;
57
58    fn next(&mut self) -> Option<Self::Item> {
59        use Token::*;
60
61        loop {
62            if let Some((start, ch)) = self.bump() {
63                let result = match ch {
64                    '%' if self.test_peek(|ch| ch == '{') => {
65                        self.bump();
66                        Some(Ok(self.token(start, LRule)))
67                    }
68                    '}' => Some(Ok(self.token(start, RRule))),
69                    '[' => Some(Ok(self.token(start, LBracket))),
70                    ']' => Some(Ok(self.token(start, RBracket))),
71                    '(' => Some(Ok(self.token(start, LParen))),
72                    ')' => Some(Ok(self.token(start, RParen))),
73
74                    '.' if self.test_peek(is_digit) => Some(self.numeric_literal(start)),
75                    '.' => Some(Ok(self.token(start, Dot))),
76                    ':' => Some(Ok(self.token(start, Colon))),
77                    ',' => Some(Ok(self.token(start, Comma))),
78
79                    '"' => Some(self.string_literal(start)),
80
81                    '+' => Some(Ok(self.token(start, Sign("+")))),
82                    '-' => Some(Ok(self.token(start, Sign("-")))),
83                    ch if is_ident_start(ch) => Some(Ok(self.identifier(start))),
84                    ch if is_digit(ch) => Some(self.numeric_literal(start)),
85
86                    ch if ch.is_whitespace() => continue,
87
88                    ch => Some(Ok(self.token(start, Invalid(ch)))),
89                };
90
91                return result;
92            } else {
93                return None;
94            }
95        }
96    }
97}
98
99// -----------------------------------------------------------------------------
100// lexing helpers
101// -----------------------------------------------------------------------------
102
103impl<'input> Lexer<'input> {
104    pub fn new(input: &'input str) -> Lexer<'input> {
105        Self {
106            input,
107            chars: input.char_indices().peekable(),
108        }
109    }
110
111    fn bump(&mut self) -> Option<(usize, char)> {
112        self.chars.next()
113    }
114
115    fn peek(&mut self) -> Option<(usize, char)> {
116        self.chars.peek().copied()
117    }
118
119    fn take_while<F>(&mut self, start: usize, mut keep_going: F) -> (usize, &'input str)
120    where
121        F: FnMut(char) -> bool,
122    {
123        self.take_until(start, |c| !keep_going(c))
124    }
125
126    fn take_until<F>(&mut self, start: usize, mut terminate: F) -> (usize, &'input str)
127    where
128        F: FnMut(char) -> bool,
129    {
130        while let Some((end, ch)) = self.peek() {
131            if terminate(ch) {
132                return (end, self.slice(start, end));
133            } else {
134                self.bump();
135            }
136        }
137
138        let loc = self.next_index();
139
140        (loc, self.slice(start, loc))
141    }
142
143    fn test_peek<F>(&mut self, mut test: F) -> bool
144    where
145        F: FnMut(char) -> bool,
146    {
147        self.peek().is_some_and(|(_, ch)| test(ch))
148    }
149
150    fn slice(&self, start: usize, end: usize) -> &'input str {
151        &self.input[start..end]
152    }
153
154    fn next_index(&mut self) -> usize {
155        self.peek().as_ref().map_or(self.input.len(), |l| l.0)
156    }
157
158    fn token(&mut self, start: usize, token: Token<&'input str>) -> Spanned<'input, usize> {
159        let end = self.next_index();
160        self.token2(start, end, token)
161    }
162
163    fn token2(
164        &mut self,
165        start: usize,
166        end: usize,
167        token: Token<&'input str>,
168    ) -> Spanned<'input, usize> {
169        (start, token, end)
170    }
171
172    fn string_literal(&mut self, start: usize) -> SpannedResult<'input, usize> {
173        let content_start = self.next_index();
174
175        loop {
176            let scan_start = self.next_index();
177            self.take_until(scan_start, |c| c == '"' || c == '\\');
178
179            match self.bump() {
180                Some((_, '\\')) => self.bump(),
181                Some((end, '\"')) => {
182                    let content = unescape_string_literal(self.slice(content_start, end))?;
183                    let end = self.next_index();
184
185                    return Ok((start, Token::StringLiteral(content), end));
186                }
187                _ => break,
188            };
189        }
190
191        Err(Error::Literal { start })
192    }
193
194    fn identifier(&mut self, start: usize) -> Spanned<'input, usize> {
195        use Token::*;
196
197        let (end, ident) = self.take_while(start, is_ident_continue);
198
199        let token = match ident {
200            "true" => True,
201            "false" => False,
202            "null" => Null,
203
204            _ if ident.contains('@') || ident.contains('-') => ExtendedIdentifier(ident),
205            _ => Identifier(ident),
206        };
207
208        (start, token, end)
209    }
210
211    fn numeric_literal(&mut self, start: usize) -> SpannedResult<'input, usize> {
212        let mut is_float = false;
213        let (end, num) = self.take_while(start, |ch| {
214            is_digit(ch) || {
215                let is_float_symbol = is_float_literal_symbol(ch);
216                if is_float_symbol {
217                    is_float = true;
218                }
219                is_float_symbol
220            }
221        });
222
223        if is_float || num.starts_with('.') {
224            num.parse()
225                .map_err(|_| Error::NumericLiteral(num.to_string()))
226                .map(|n| (start, Token::FloatLiteral(n), end))
227        } else {
228            num.parse()
229                .map_err(|_| Error::NumericLiteral(num.to_string()))
230                .map(|n| (start, Token::IntegerLiteral(n), end))
231        }
232    }
233}
234
/// Returns `true` for the non-digit characters allowed inside a numeric
/// literal: exponent markers, signs and the decimal point.
fn is_float_literal_symbol(ch: char) -> bool {
    ['e', 'E', '-', '+', '.'].contains(&ch)
}
238
/// Returns `true` if `ch` may begin an identifier: an ASCII letter, `$`,
/// `@` or `_`.
fn is_ident_start(ch: char) -> bool {
    ch.is_ascii_alphabetic() || ['$', '@', '_'].contains(&ch)
}
242
243fn is_ident_continue(ch: char) -> bool {
244    match ch {
245        '0'..='9' => true,
246        '-' => true,
247        ch => is_ident_start(ch),
248    }
249}
250
/// Returns `true` for the ASCII decimal digits `0` through `9`.
fn is_digit(ch: char) -> bool {
    matches!(ch, '0'..='9')
}
254
255fn unescape_string_literal(mut s: &str) -> Result<String, Error> {
256    let mut string = String::with_capacity(s.len());
257    while let Some(i) = s.bytes().position(|b| b == b'\\') {
258        if s.len() > i + 2 {
259            let c = match s.as_bytes()[i..i + 3] {
260                // convert to bytes since there might be multibyte(unicode) characters
261                [b'\\', b'\\', b'n'] => '\n',
262                [b'\\', b'\\', b'r'] => '\r',
263                [b'\\', b'\\', b't'] => '\t',
264                _ => '\0',
265            };
266            if c != '\0' {
267                string.push_str(&s[..i]);
268                string.push(c);
269                s = &s[i + 3..];
270                continue;
271            }
272        }
273        if s.len() > i + 1 {
274            let c = match s.as_bytes()[i + 1] {
275                b'\'' => '\'',
276                b'"' => '"',
277                b'\\' => '\\',
278                _ => return Err(Error::InvalidEscape(s.to_owned())),
279            };
280            string.push_str(&s[..i]);
281            string.push(c);
282            s = &s[i + 2..];
283        }
284    }
285
286    string.push_str(s);
287    Ok(string)
288}
289
290pub struct FloatingPointLiteral<'input> {
291    pub integral: Option<&'input str>,
292    pub fraction: Option<&'input str>,
293    pub exponent: Option<Exponent<'input>>,
294}
295
/// The exponent part of a floating point literal.
pub struct Exponent<'input> {
    /// Optional sign text placed between the `e` and the exponent digits.
    pub sign: Option<&'input str>,
    /// The exponent digits.
    pub value: &'input str,
}
300
301#[allow(dead_code)] // used by generated lalrpop parser
302impl FloatingPointLiteral<'_> {
303    pub fn parse(&self) -> f64 {
304        let mut fp = String::new();
305        fp.push_str(self.integral.unwrap_or_default());
306        if let Some(f) = &self.fraction {
307            fp.push('.');
308            fp.push_str(f);
309        }
310
311        if let Some(exp) = &self.exponent {
312            fp.push('e');
313            fp.push_str(exp.sign.unwrap_or_default());
314            fp.push_str(exp.value);
315        }
316
317        fp.parse().map_err(|_| Error::NumericLiteral(fp)).unwrap()
318    }
319}