jaq_core/load/
lex.rs

1//! Lexing.
2
3use alloc::vec::Vec;
4
/// Component of a string potentially containing escape sequences.
///
/// `S` is a type of strings (without escape sequences), and
/// `T` is a type of interpolated terms.
#[derive(Debug)]
pub enum StrPart<S, T> {
    /// string without escape sequences
    Str(S),
    /// interpolated term (`\(...)`)
    ///
    /// Here, the contained term `T` must be of the shape
    /// `Token(s, Tok::Block(...))` such that the first character of `s` is '('.
    Term(T),
    /// escaped character (e.g. `\n`, `\t`, `\u0041`)
    Char(char),
}
21
/// Token (tree) generic over string type `S`.
///
/// The first field is the source text that the token spans, and
/// the second field is the type of the token.
///
/// If the contained `Tok` is of the shape:
/// * `Tok::Block(...)`, then `S` must start with `'('`, `'['`, or `'{'`.
/// * `Tok::Var`, then `S` must start with `'$'`.
#[derive(Debug)]
pub struct Token<S>(pub S, pub Tok<S>);
29
/// Type of token, generic over string type `S`.
///
/// This data structure should normally not be constructed manually.
/// It is exposed mostly for fuzzing.
#[derive(Debug)]
pub enum Tok<S> {
    /// keywords such as `def`, but also identifiers such as `map`, `f::g`
    Word,
    /// variables such as `$x`
    Var,
    /// formatters such as `@csv`
    Fmt,
    /// number
    Num,
    /// (interpolated) string, surrounded by opening and closing '"'
    ///
    /// The parts of the string are stored in order of appearance.
    Str(Vec<StrPart<S, Token<S>>>),
    /// symbol such as `.`, `;`, `-`, `|`, or `+=`
    Sym,
    /// delimited tokens, e.g. `(...)` or `[...]`
    ///
    /// When the lexer finds the closing delimiter of a block,
    /// it stores the delimiter as last token of the block, with type `Tok::Sym`.
    Block(Vec<Token<S>>),
}
51
/// Type of character that we expected.
///
/// Each variant is annotated with jq programs that trigger it.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub enum Expect<S> {
    /// `0e`, `0.`
    Digit,
    /// `$`, `@`
    Ident,
    /// `(`, `[`, `{`, `"`
    ///
    /// The contained string is the *opening* delimiter that was never closed.
    Delim(S),
    /// `"\a"`
    Escape,
    /// `"\ux"`, `"\uD800"`
    Unicode,
    /// `&`, `§`, `💣`
    Token,
}
71
72impl Expect<&str> {
73    /// Return human-readable description of what we expected.
74    pub fn as_str(&self) -> &'static str {
75        match self {
76            Self::Digit => "digit",
77            Self::Ident => "identifier",
78            Self::Delim("(") => "closing parenthesis",
79            Self::Delim("[") => "closing bracket",
80            Self::Delim("{") => "closing brace",
81            Self::Delim("\"") => "closing quote",
82            Self::Delim(_) => panic!(),
83            Self::Escape => "string escape sequence",
84            Self::Unicode => "4-digit hexadecimal UTF-8 code point",
85            Self::Token => "token",
86        }
87    }
88}
89
/// Lexer error, storing what we expected and what we got instead.
///
/// The lexer in this file stores as second component
/// the input that remained at the position where the error was detected.
pub type Error<S> = (Expect<S>, S);
92
/// Lexer for jq files.
pub struct Lexer<S> {
    /// input that has not been consumed yet
    i: S,
    /// errors recorded so far, in order of detection
    e: Vec<Error<S>>,
}
98
99impl<'a> Lexer<&'a str> {
100    /// Initialise a new lexer for the given input.
101    #[must_use]
102    pub fn new(i: &'a str) -> Self {
103        let e = Vec::new();
104        Self { i, e }
105    }
106
107    /// Lex, returning the resulting tokens and errors.
108    pub fn lex(mut self) -> Result<Vec<Token<&'a str>>, Vec<Error<&'a str>>> {
109        let tokens = self.tokens();
110        self.space();
111        if !self.i.is_empty() {
112            self.e.push((Expect::Token, self.i));
113        }
114
115        if self.e.is_empty() {
116            Ok(tokens)
117        } else {
118            Err(self.e)
119        }
120    }
121
122    fn next(&mut self) -> Option<char> {
123        let mut chars = self.i.chars();
124        let c = chars.next()?;
125        self.i = chars.as_str();
126        Some(c)
127    }
128
129    fn take(&mut self, len: usize) -> &'a str {
130        let (head, tail) = self.i.split_at(len);
131        self.i = tail;
132        head
133    }
134
135    fn trim(&mut self, f: impl FnMut(char) -> bool) {
136        self.i = self.i.trim_start_matches(f);
137    }
138
139    fn consumed(&mut self, skip: usize, f: impl FnOnce(&mut Self)) -> &'a str {
140        self.with_consumed(|l| {
141            l.i = &l.i[skip..];
142            f(l);
143        })
144        .0
145    }
146
147    fn with_consumed<T>(&mut self, f: impl FnOnce(&mut Self) -> T) -> (&'a str, T) {
148        let start = self.i;
149        let y = f(self);
150        (&start[..start.len() - self.i.len()], y)
151    }
152
153    /// Whitespace and comments.
154    fn space(&mut self) {
155        loop {
156            self.i = self.i.trim_start();
157            match self.i.strip_prefix('#') {
158                Some(comment) => self.i = comment,
159                None => break,
160            }
161            // ignore all lines that end with an odd number of backslashes
162            loop {
163                let (before, after) = self.i.split_once('\n').unwrap_or((self.i, ""));
164                let before = before.strip_suffix('\r').unwrap_or(before);
165                self.i = after;
166                // does the line end with an even number of backslashes?
167                if before.chars().rev().take_while(|c| *c == '\\').count() % 2 == 0 {
168                    break;
169                }
170            }
171        }
172    }
173
174    fn mod_then_ident(&mut self) {
175        self.ident0();
176        if let Some(rest) = self.i.strip_prefix("::") {
177            self.i = rest.strip_prefix(['@', '$']).unwrap_or(rest);
178            self.ident1();
179        }
180    }
181
182    /// Lex a sequence matching `[a-zA-Z0-9_]*`.
183    fn ident0(&mut self) {
184        self.trim(|c: char| c.is_ascii_alphanumeric() || c == '_');
185    }
186
187    /// Lex a sequence matching `[a-zA-Z_][a-zA-Z0-9_]*`.
188    fn ident1(&mut self) {
189        let first = |c: char| c.is_ascii_alphabetic() || c == '_';
190        if let Some(rest) = self.i.strip_prefix(first) {
191            self.i = rest;
192            self.ident0();
193        } else {
194            self.e.push((Expect::Ident, self.i));
195        }
196    }
197
198    /// Lex a non-empty digit sequence.
199    fn digits1(&mut self) {
200        if let Some(rest) = self.i.strip_prefix(|c: char| c.is_ascii_digit()) {
201            self.i = rest.trim_start_matches(|c: char| c.is_ascii_digit());
202        } else {
203            self.e.push((Expect::Digit, self.i));
204        }
205    }
206
207    /// Decimal with optional exponent.
208    fn num(&mut self) {
209        self.trim(|c| c.is_ascii_digit());
210        if let Some(i) = self.i.strip_prefix('.') {
211            self.i = i;
212            self.digits1();
213        }
214        if let Some(i) = self.i.strip_prefix(['e', 'E']) {
215            self.i = i.strip_prefix(['+', '-']).unwrap_or(i);
216            self.digits1();
217        }
218    }
219
220    fn escape(&mut self) -> Option<StrPart<&'a str, Token<&'a str>>> {
221        let mut chars = self.i.chars();
222        let part = match chars.next() {
223            Some(c @ ('\\' | '/' | '"')) => StrPart::Char(c),
224            Some('b') => StrPart::Char('\x08'),
225            Some('f') => StrPart::Char('\x0C'),
226            Some('n') => StrPart::Char('\n'),
227            Some('r') => StrPart::Char('\r'),
228            Some('t') => StrPart::Char('\t'),
229            Some('u') => {
230                let err_at = |lex: &mut Self, pos| {
231                    lex.i = pos;
232                    lex.e.push((Expect::Unicode, lex.i));
233                    None
234                };
235                let mut hex = 0;
236                let start_i = chars.as_str();
237                for _ in 0..4 {
238                    let cur_i = chars.as_str();
239                    if let Some(digit) = chars.next().and_then(|c| c.to_digit(16)) {
240                        hex = (hex << 4) + digit;
241                    } else {
242                        return err_at(self, cur_i);
243                    }
244                }
245                match char::from_u32(hex) {
246                    None => return err_at(self, start_i),
247                    Some(c) => StrPart::Char(c),
248                }
249            }
250            Some('(') => {
251                let (full, block) = self.with_consumed(Self::block);
252                return Some(StrPart::Term(Token(full, block)));
253            }
254            Some(_) | None => {
255                self.e.push((Expect::Escape, self.i));
256                return None;
257            }
258        };
259
260        self.i = chars.as_str();
261        Some(part)
262    }
263
264    /// Lex a (possibly interpolated) string.
265    ///
266    /// The input string has to start with '"'.
267    fn str(&mut self) -> Tok<&'a str> {
268        let start = self.take(1);
269        assert_eq!(start, "\"");
270        let mut parts = Vec::new();
271
272        loop {
273            let s = self.consumed(0, |lex| lex.trim(|c| c != '\\' && c != '"'));
274            if !s.is_empty() {
275                parts.push(StrPart::Str(s));
276            }
277            match self.next() {
278                Some('"') => break,
279                Some('\\') => self.escape().map(|part| parts.push(part)),
280                // SAFETY: due to `lex.trim()`
281                Some(_) => unreachable!(),
282                None => {
283                    self.e.push((Expect::Delim(start), self.i));
284                    break;
285                }
286            };
287        }
288        Tok::Str(parts)
289    }
290
291    fn token(&mut self) -> Option<Token<&'a str>> {
292        self.space();
293
294        let hd_op = |c| "|=!<>+-*/%".contains(c);
295        let tl_op = |c| hd_op(c) && c != '-';
296
297        let mut chars = self.i.chars();
298        let (s, tok) = match chars.next()? {
299            'a'..='z' | 'A'..='Z' | '_' => (self.consumed(1, Self::mod_then_ident), Tok::Word),
300            '$' => (self.consumed(1, Self::ident1), Tok::Var),
301            '@' => (self.consumed(1, Self::ident1), Tok::Fmt),
302            '0'..='9' => (self.consumed(1, Self::num), Tok::Num),
303            c if hd_op(c) => (self.consumed(1, |lex| lex.trim(tl_op)), Tok::Sym),
304            '.' => match chars.next() {
305                Some('.') => (self.take(2), Tok::Sym),
306                Some('a'..='z' | 'A'..='Z' | '_') => (self.consumed(2, Self::ident0), Tok::Sym),
307                _ => (self.take(1), Tok::Sym),
308            },
309            ':' | ';' | ',' | '?' => (self.take(1), Tok::Sym),
310            '"' => self.with_consumed(Self::str),
311            '(' | '[' | '{' => self.with_consumed(Self::block),
312            _ => return None,
313        };
314        Some(Token(s, tok))
315    }
316
317    fn tokens(&mut self) -> Vec<Token<&'a str>> {
318        core::iter::from_fn(|| self.token()).collect()
319    }
320
321    /// Lex a sequence of tokens that is surrounded by parentheses, curly braces, or brackets.
322    ///
323    /// The input string has to start with either '(', '[', or '{'.
324    fn block(&mut self) -> Tok<&'a str> {
325        let open = self.take(1);
326        let close = match open {
327            "(" => ')',
328            "[" => ']',
329            "{" => '}',
330            _ => panic!(),
331        };
332        let mut tokens = self.tokens();
333
334        self.space();
335        if let Some(rest) = self.i.strip_prefix(close) {
336            tokens.push(Token(&self.i[..1], Tok::Sym));
337            self.i = rest;
338        } else {
339            self.e.push((Expect::Delim(open), self.i));
340        }
341        Tok::Block(tokens)
342    }
343}
344
345impl<'a> Token<&'a str> {
346    /// Return the string slice corresponding to an optional token.
347    ///
348    /// If the token is not present, return an empty string slice starting at the end of `code`.
349    pub fn opt_as_str(found: Option<&Self>, code: &'a str) -> &'a str {
350        found.map_or(&code[code.len()..], |found| found.as_str())
351    }
352
353    /// Return the string slice corresponding to the token.
354    pub fn as_str(&self) -> &'a str {
355        self.0
356    }
357}