jaq_core/load/
lex.rs

1//! Lexing.
2
3use alloc::vec::Vec;
4
/// Component of a string potentially containing escape sequences.
///
/// `S` is a type of strings (without escape sequences), and
/// `T` is a type of interpolated terms.
#[derive(Debug)]
pub enum StrPart<S, T> {
    /// string without escape sequences
    Str(S),
    /// interpolated term (`\(...)`)
    Term(T),
    /// escaped character (e.g. `\n`, `\t`, `\u0041`)
    Char(char),
}
18
19/// Token (tree) generic over string type `S`.
20#[derive(Debug)]
21pub struct Token<S>(pub(crate) S, pub(crate) Tok<S>);
22
23#[derive(Debug)]
24pub(crate) enum Tok<S> {
25    /// keywords such as `def`, but also identifiers such as `map`, `f::g`
26    Word,
27    /// variables such as `$x`
28    Var,
29    /// formatters such as `@csv`
30    Fmt,
31    /// number
32    Num,
33    /// (interpolated) string, surrounded by opening and closing '"'
34    Str(Vec<StrPart<S, Token<S>>>),
35    /// symbol such as `.`, `;`, `-`, `|`, or `+=`
36    Sym,
37    /// delimited tokens, e.g. `(...)` or `[...]`
38    Block(Vec<Token<S>>),
39}
40
/// Kind of character that the lexer expected to find.
///
/// Every variant is annotated with jq programs that trigger it.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub enum Expect<S> {
    /// `0e`, `0.`
    Digit,
    /// `$`, `@`
    Ident,
    /// `(`, `[`, `{`
    Delim(S),
    /// `"\a"`
    Escape,
    /// `"\ux"`
    Unicode,
    /// `&`, `§`, `💣`
    Token,
}
60
61impl Expect<&str> {
62    /// Return human-readable description of what we expected.
63    pub fn as_str(&self) -> &'static str {
64        match self {
65            Self::Digit => "digit",
66            Self::Ident => "identifier",
67            Self::Delim("(") => "closing parenthesis",
68            Self::Delim("[") => "closing bracket",
69            Self::Delim("{") => "closing brace",
70            Self::Delim("\"") => "closing quote",
71            Self::Delim(_) => panic!(),
72            Self::Escape => "string escape sequence",
73            Self::Unicode => "4-digit hexadecimal UTF-8 code point",
74            Self::Token => "token",
75        }
76    }
77}
78
79/// Lexer error, storing what we expected and what we got instead.
80pub type Error<S> = (Expect<S>, S);
81
82/// Lexer for jq files.
83pub struct Lexer<S> {
84    i: S,
85    e: Vec<Error<S>>,
86}
87
88impl<'a> Lexer<&'a str> {
89    /// Initialise a new lexer for the given input.
90    #[must_use]
91    pub fn new(i: &'a str) -> Self {
92        let e = Vec::new();
93        Self { i, e }
94    }
95
96    /// Lex, returning the resulting tokens and errors.
97    pub fn lex(mut self) -> Result<Vec<Token<&'a str>>, Vec<Error<&'a str>>> {
98        let tokens = self.tokens();
99        self.space();
100        if !self.i.is_empty() {
101            self.e.push((Expect::Token, self.i));
102        }
103
104        if self.e.is_empty() {
105            Ok(tokens)
106        } else {
107            Err(self.e)
108        }
109    }
110
111    fn next(&mut self) -> Option<char> {
112        let mut chars = self.i.chars();
113        let c = chars.next()?;
114        self.i = chars.as_str();
115        Some(c)
116    }
117
118    fn take(&mut self, len: usize) -> &'a str {
119        let (head, tail) = self.i.split_at(len);
120        self.i = tail;
121        head
122    }
123
124    fn trim(&mut self, f: impl FnMut(char) -> bool) {
125        self.i = self.i.trim_start_matches(f);
126    }
127
128    fn consumed(&mut self, skip: usize, f: impl FnOnce(&mut Self)) -> &'a str {
129        self.with_consumed(|l| {
130            l.i = &l.i[skip..];
131            f(l);
132        })
133        .0
134    }
135
136    fn with_consumed<T>(&mut self, f: impl FnOnce(&mut Self) -> T) -> (&'a str, T) {
137        let start = self.i;
138        let y = f(self);
139        (&start[..start.len() - self.i.len()], y)
140    }
141
142    /// Whitespace and comments.
143    fn space(&mut self) {
144        loop {
145            self.i = self.i.trim_start();
146            match self.i.strip_prefix('#') {
147                Some(comment) => self.i = comment,
148                None => break,
149            }
150            // ignore all lines that end with an odd number of backslashes
151            loop {
152                let (before, after) = self.i.split_once('\n').unwrap_or((self.i, ""));
153                let before = before.strip_suffix('\r').unwrap_or(before);
154                self.i = after;
155                // does the line end with an even number of backslashes?
156                if before.chars().rev().take_while(|c| *c == '\\').count() % 2 == 0 {
157                    break;
158                }
159            }
160        }
161    }
162
163    fn mod_then_ident(&mut self) {
164        self.ident0();
165        if let Some(rest) = self.i.strip_prefix("::") {
166            self.i = rest.strip_prefix(['@', '$']).unwrap_or(rest);
167            self.ident1();
168        }
169    }
170
171    /// Lex a sequence matching `[a-zA-Z0-9_]*`.
172    fn ident0(&mut self) {
173        self.trim(|c: char| c.is_ascii_alphanumeric() || c == '_');
174    }
175
176    /// Lex a sequence matching `[a-zA-Z_][a-zA-Z0-9_]*`.
177    fn ident1(&mut self) {
178        let first = |c: char| c.is_ascii_alphabetic() || c == '_';
179        if let Some(rest) = self.i.strip_prefix(first) {
180            self.i = rest;
181            self.ident0();
182        } else {
183            self.e.push((Expect::Ident, self.i));
184        }
185    }
186
187    /// Lex a non-empty digit sequence.
188    fn digits1(&mut self) {
189        if let Some(rest) = self.i.strip_prefix(|c: char| c.is_ascii_digit()) {
190            self.i = rest.trim_start_matches(|c: char| c.is_ascii_digit());
191        } else {
192            self.e.push((Expect::Digit, self.i));
193        }
194    }
195
196    /// Decimal with optional exponent.
197    fn num(&mut self) {
198        self.trim(|c| c.is_ascii_digit());
199        if let Some(i) = self.i.strip_prefix('.') {
200            self.i = i;
201            self.digits1();
202        }
203        if let Some(i) = self.i.strip_prefix(['e', 'E']) {
204            self.i = i.strip_prefix(['+', '-']).unwrap_or(i);
205            self.digits1();
206        }
207    }
208
209    fn escape(&mut self) -> Option<StrPart<&'a str, Token<&'a str>>> {
210        let mut chars = self.i.chars();
211        let part = match chars.next() {
212            Some(c @ ('\\' | '/' | '"')) => StrPart::Char(c),
213            Some('b') => StrPart::Char('\x08'),
214            Some('f') => StrPart::Char('\x0C'),
215            Some('n') => StrPart::Char('\n'),
216            Some('r') => StrPart::Char('\r'),
217            Some('t') => StrPart::Char('\t'),
218            Some('u') => {
219                let mut hex = 0;
220                for _ in 0..4 {
221                    let i = chars.as_str();
222                    if let Some(digit) = chars.next().and_then(|c| c.to_digit(16)) {
223                        hex = (hex << 4) + digit;
224                    } else {
225                        self.i = i;
226                        self.e.push((Expect::Unicode, self.i));
227                        return None;
228                    }
229                }
230                StrPart::Char(char::from_u32(hex).unwrap())
231            }
232            Some('(') => {
233                let (full, block) = self.with_consumed(Self::block);
234                return Some(StrPart::Term(Token(full, block)));
235            }
236            Some(_) | None => {
237                self.e.push((Expect::Escape, self.i));
238                return None;
239            }
240        };
241
242        self.i = chars.as_str();
243        Some(part)
244    }
245
246    /// Lex a (possibly interpolated) string.
247    ///
248    /// The input string has to start with '"'.
249    fn str(&mut self) -> Tok<&'a str> {
250        let start = self.take(1);
251        assert_eq!(start, "\"");
252        let mut parts = Vec::new();
253
254        loop {
255            let s = self.consumed(0, |lex| lex.trim(|c| c != '\\' && c != '"'));
256            if !s.is_empty() {
257                parts.push(StrPart::Str(s));
258            }
259            match self.next() {
260                Some('"') => break,
261                Some('\\') => self.escape().map(|part| parts.push(part)),
262                // SAFETY: due to `lex.trim()`
263                Some(_) => unreachable!(),
264                None => {
265                    self.e.push((Expect::Delim(start), self.i));
266                    break;
267                }
268            };
269        }
270        Tok::Str(parts)
271    }
272
273    fn token(&mut self) -> Option<Token<&'a str>> {
274        self.space();
275
276        let is_op = |c| "|=!<>+-*/%".contains(c);
277
278        let mut chars = self.i.chars();
279        let (s, tok) = match chars.next()? {
280            'a'..='z' | 'A'..='Z' | '_' => (self.consumed(1, Self::mod_then_ident), Tok::Word),
281            '$' => (self.consumed(1, Self::ident1), Tok::Var),
282            '@' => (self.consumed(1, Self::ident1), Tok::Fmt),
283            '0'..='9' => (self.consumed(1, Self::num), Tok::Num),
284            c if is_op(c) => (self.consumed(1, |lex| lex.trim(is_op)), Tok::Sym),
285            '.' => match chars.next() {
286                Some('.') => (self.take(2), Tok::Sym),
287                Some('a'..='z' | 'A'..='Z' | '_') => (self.consumed(2, Self::ident0), Tok::Sym),
288                _ => (self.take(1), Tok::Sym),
289            },
290            ':' | ';' | ',' | '?' => (self.take(1), Tok::Sym),
291            '"' => self.with_consumed(Self::str),
292            '(' | '[' | '{' => self.with_consumed(Self::block),
293            _ => return None,
294        };
295        Some(Token(s, tok))
296    }
297
298    fn tokens(&mut self) -> Vec<Token<&'a str>> {
299        core::iter::from_fn(|| self.token()).collect()
300    }
301
302    /// Lex a sequence of tokens that is surrounded by parentheses, curly braces, or brackets.
303    ///
304    /// The input string has to start with either '(', '[', or '{'.
305    fn block(&mut self) -> Tok<&'a str> {
306        let open = self.take(1);
307        let close = match open {
308            "(" => ')',
309            "[" => ']',
310            "{" => '}',
311            _ => panic!(),
312        };
313        let mut tokens = self.tokens();
314
315        self.space();
316        if let Some(rest) = self.i.strip_prefix(close) {
317            tokens.push(Token(&self.i[..1], Tok::Sym));
318            self.i = rest;
319        } else {
320            self.e.push((Expect::Delim(open), self.i));
321        }
322        Tok::Block(tokens)
323    }
324}
325
326impl<'a> Token<&'a str> {
327    /// Return the string slice corresponding to an optional token.
328    ///
329    /// If the token is not present, return an empty string slice starting at the end of `code`.
330    pub fn opt_as_str(found: Option<&Self>, code: &'a str) -> &'a str {
331        found.map_or(&code[code.len()..], |found| found.as_str())
332    }
333
334    /// Return the string slice corresponding to the token.
335    pub fn as_str(&self) -> &'a str {
336        self.0
337    }
338}