jaq_syn/lex.rs

//! Lexing.

use alloc::vec::Vec;

/// Component of a string potentially containing escape sequences.
///
/// `S` is a type of strings (without escape sequences), and
/// `F` is a type of interpolated filters.
#[derive(Debug)]
pub enum StrPart<S, F> {
    /// string without escape sequences
    Str(S),
    /// interpolated filter (`\(...)`)
    Filter(F),
    /// escaped character (e.g. `\n`, `\t`, `\u0041`)
    Char(char),
}
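
// For example, `Lexer::str` decomposes the body of the jq string literal
// `"Hi \(.name)!\n"` into parts along the lines of
// `[Str("Hi "), Filter(Block("(.name)", ...)), Str("!"), Char('\n')]`
// (an illustrative sketch, not the verbatim `Debug` output).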

/// Token (tree) generic over string type `S`.
#[derive(Debug)]
pub enum Token<S> {
    /// keywords such as `def`, but also identifiers such as `map`, `$x`, or `@csv`
    Word(S),
    /// number
    Num(S),
    /// (interpolated) string, surrounded by opening and closing '"'
    Str(S, Vec<StrPart<S, Self>>),
    /// binary operator, such as `|` or `+=`
    ///
    /// Note that this also includes `-` when it is used as a unary (negation) operator.
    Op(S),
    /// punctuation, such as `.` or `;`
    Char(S),
    /// delimited tokens, e.g. `(...)` or `[...]`
    Block(S, Vec<Self>),
}
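
// For example, `Lexer::lex` turns the program `.a | map(.x)` into a token tree
// along the lines of
// `[Char(".a"), Op("|"), Word("map"), Block("(.x)", [Char(".x"), Char(")")])]`
// (an illustrative sketch; note that the closing delimiter is kept as a `Char`
// token inside the `Block`).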

/// Type of character that we expected.
///
/// Each variant is annotated with jq programs that trigger it.
#[derive(Clone, Debug)]
pub enum Expect<S> {
    /// `0e`, `0.`
    Digit,
    /// `$`, `@`
    Ident,
    /// `(`, `[`, `{`
    Delim(S),
    /// `"\a"`
    Escape,
    /// `"\ux"`
    Unicode,
    /// `&`, `§`, `💣`
    Token,
}

impl<'a> Expect<&'a str> {
    /// Return a human-readable description of what we expected.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Digit => "digit",
            Self::Ident => "identifier",
            Self::Delim("(") => "closing parenthesis",
            Self::Delim("[") => "closing bracket",
            Self::Delim("{") => "closing brace",
            Self::Delim("\"") => "closing quote",
            Self::Delim(_) => panic!(),
            Self::Escape => "string escape sequence",
            Self::Unicode => "4-digit hexadecimal Unicode code point",
            Self::Token => "token",
        }
    }
}

/// Lexer error, storing what we expected and what we got instead.
pub type Error<S> = (Expect<S>, S);

/// Lexer for jq files.
pub struct Lexer<S> {
    i: S,
    e: Vec<Error<S>>,
}

impl<'a> Lexer<&'a str> {
    /// Initialise a new lexer for the given input.
    #[must_use]
    pub fn new(i: &'a str) -> Self {
        let e = Vec::new();
        Self { i, e }
    }

    /// Lex, returning the resulting tokens and errors.
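    ///
    /// For example, lexing `".a | length"` is expected to yield the three tokens
    /// `.a`, `|`, and `length`; see the test module at the end of this file for a
    /// runnable sketch of such usage.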
    pub fn lex(mut self) -> Result<Vec<Token<&'a str>>, Vec<Error<&'a str>>> {
        let tokens = self.tokens();
        self.space();
        if !self.i.is_empty() {
            self.e.push((Expect::Token, self.i));
        }

        if self.e.is_empty() {
            Ok(tokens)
        } else {
            Err(self.e)
        }
    }

    /// Consume and return the next character, if any.
    fn next(&mut self) -> Option<char> {
        let mut chars = self.i.chars();
        let c = chars.next()?;
        self.i = chars.as_str();
        Some(c)
    }

    /// Split off and return the first `len` bytes of the input.
    fn take(&mut self, len: usize) -> &'a str {
        let (head, tail) = self.i.split_at(len);
        self.i = tail;
        head
    }

    /// Remove the longest input prefix whose characters satisfy `f`.
    fn trim(&mut self, f: impl FnMut(char) -> bool) {
        self.i = self.i.trim_start_matches(f);
    }

    /// Skip `skip` bytes, run `f`, and return all input consumed in the process.
    fn consumed(&mut self, skip: usize, f: impl FnOnce(&mut Self)) -> &'a str {
        self.with_consumed(|l| {
            l.i = &l.i[skip..];
            f(l)
        })
        .0
    }

    /// Run `f` and return the input that it consumed, together with its output.
    fn with_consumed<T>(&mut self, f: impl FnOnce(&mut Self) -> T) -> (&'a str, T) {
        let start = self.i;
        let y = f(self);
        (&start[..start.len() - self.i.len()], y)
    }

    /// Whitespace and comments.
    fn space(&mut self) {
        self.i = self.i.trim_start();
        while let Some(comment) = self.i.strip_prefix('#') {
            self.i = comment.trim_start_matches(|c| c != '\n').trim_start();
        }
    }

    /// Lex the rest of an identifier, optionally followed by `::` and another
    /// identifier (which may be prefixed by `@` or `$`).
    fn mod_then_ident(&mut self) {
        self.ident0();
        if let Some(rest) = self.i.strip_prefix("::") {
            self.i = rest.strip_prefix(['@', '$']).unwrap_or(rest);
            self.ident1();
        }
    }

    /// Lex a sequence matching `[a-zA-Z0-9_]*`.
    fn ident0(&mut self) {
        self.trim(|c: char| c.is_ascii_alphanumeric() || c == '_');
    }

    /// Lex a sequence matching `[a-zA-Z_][a-zA-Z0-9_]*`.
    fn ident1(&mut self) {
        let first = |c: char| c.is_ascii_alphabetic() || c == '_';
        if let Some(rest) = self.i.strip_prefix(first) {
            self.i = rest;
            self.ident0();
        } else {
            self.e.push((Expect::Ident, self.i));
        }
    }

    /// Lex a non-empty digit sequence.
    fn digits1(&mut self) {
        if let Some(rest) = self.i.strip_prefix(|c: char| c.is_ascii_digit()) {
            self.i = rest.trim_start_matches(|c: char| c.is_ascii_digit());
        } else {
            self.e.push((Expect::Digit, self.i));
        }
    }

    /// Decimal with optional exponent.
    fn num(&mut self) {
        self.trim(|c| c.is_ascii_digit());
        if let Some(i) = self.i.strip_prefix('.') {
            self.i = i;
            self.digits1();
        }
        if let Some(i) = self.i.strip_prefix(['e', 'E']) {
            self.i = i.strip_prefix(['+', '-']).unwrap_or(i);
            self.digits1();
        }
    }

    /// Lex the remainder of a string escape sequence (after the backslash).
    fn escape(&mut self) -> Option<StrPart<&'a str, Token<&'a str>>> {
        let mut chars = self.i.chars();
        let part = match chars.next() {
            Some(c @ ('\\' | '/' | '"')) => StrPart::Char(c),
            Some('b') => StrPart::Char('\x08'),
            Some('f') => StrPart::Char('\x0C'),
            Some('n') => StrPart::Char('\n'),
            Some('r') => StrPart::Char('\r'),
            Some('t') => StrPart::Char('\t'),
            Some('u') => {
                let mut hex = 0;
                for _ in 0..4 {
                    let i = chars.as_str();
                    match chars.next().and_then(|c| c.to_digit(16)) {
                        Some(digit) => hex = (hex << 4) + digit,
                        None => {
                            self.i = i;
                            self.e.push((Expect::Unicode, self.i));
                            return None;
                        }
                    }
                }
                StrPart::Char(char::from_u32(hex).unwrap())
            }
            Some('(') => {
                let (full, tokens) = self.with_consumed(Self::delim);
                return Some(StrPart::Filter(Token::Block(full, tokens)));
            }
            Some(_) | None => {
                self.e.push((Expect::Escape, self.i));
                return None;
            }
        };

        self.i = chars.as_str();
        Some(part)
    }

    /// Lex a (possibly interpolated) string.
    ///
    /// The input string has to start with '"'.
    fn str(&mut self) -> Vec<StrPart<&'a str, Token<&'a str>>> {
        let start = self.take(1);
        assert_eq!(start, "\"");
        let mut parts = Vec::new();

        loop {
            let s = self.consumed(0, |lex| lex.trim(|c| c != '\\' && c != '"'));
            if !s.is_empty() {
                parts.push(StrPart::Str(s));
            }
            match self.next() {
                Some('"') => return parts,
                Some('\\') => self.escape().map(|part| parts.push(part)),
                // SAFETY: due to `lex.trim()`
                Some(_) => unreachable!(),
                None => {
                    self.e.push((Expect::Delim(start), self.i));
                    return parts;
                }
            };
        }
    }

    /// Lex a single token, if the input starts with one.
    fn token(&mut self) -> Option<Token<&'a str>> {
        self.space();

        let is_op = |c| "|=!<>+-*/%".contains(c);

        let mut chars = self.i.chars();
        Some(match chars.next()? {
            'a'..='z' | 'A'..='Z' | '_' => Token::Word(self.consumed(1, Self::mod_then_ident)),
            '$' | '@' => Token::Word(self.consumed(1, Self::ident1)),
            '0'..='9' => Token::Num(self.consumed(1, Self::num)),
            c if is_op(c) => Token::Op(self.consumed(1, |lex| lex.trim(is_op))),
            '.' => match chars.next() {
                Some('.') => Token::Char(self.take(2)),
                Some('a'..='z' | 'A'..='Z' | '_') => Token::Char(self.consumed(2, Self::ident0)),
                _ => Token::Char(self.take(1)),
            },
            ':' | ';' | ',' | '?' => Token::Char(self.take(1)),
            '"' => {
                let (full, parts) = self.with_consumed(Self::str);
                Token::Str(full, parts)
            }
            '(' | '[' | '{' => {
                let (full, tokens) = self.with_consumed(Self::delim);
                Token::Block(full, tokens)
            }
            _ => return None,
        })
    }

    /// Lex tokens until the first character that does not start a token.
    fn tokens(&mut self) -> Vec<Token<&'a str>> {
        core::iter::from_fn(|| self.token()).collect()
    }

    /// Lex a sequence of tokens that is surrounded by parentheses, curly braces, or brackets.
    ///
    /// The input string has to start with either '(', '[', or '{'.
    fn delim(&mut self) -> Vec<Token<&'a str>> {
        let open = self.take(1);
        let close = match open {
            "(" => ')',
            "[" => ']',
            "{" => '}',
            _ => panic!(),
        };
        let mut tokens = self.tokens();

        self.space();
        if let Some(rest) = self.i.strip_prefix(close) {
            tokens.push(Token::Char(&self.i[..1]));
            self.i = rest;
        } else {
            self.e.push((Expect::Delim(open), self.i));
        }
        tokens
    }
}

impl<'a> Token<&'a str> {
    /// Return the string slice corresponding to an optional token.
    ///
    /// If the token is not present, return an empty string slice starting at the end of `code`.
    pub fn opt_as_str(found: Option<&Self>, code: &'a str) -> &'a str {
        found.map_or(&code[code.len()..], |found| found.as_str())
    }

    /// Return the string slice corresponding to the token.
    pub fn as_str(&self) -> &'a str {
        match self {
            Self::Word(s) | Self::Char(s) | Self::Op(s) | Self::Num(s) => s,
            Self::Str(s, _) | Self::Block(s, _) => s,
        }
    }

    /// Return the span of a token that was lexed from some given input.
    pub fn span(&self, code: &str) -> crate::Span {
        span(code, self.as_str())
    }
}

/// Return the span of a string slice `part` relative to a string slice `whole`.
///
/// The caller must ensure that `part` is fully contained inside `whole`.
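///
/// For example, if `whole` is `"abcd"` and `part` is `&whole[1..3]`, the result is `1..3`.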
pub fn span(whole: &str, part: &str) -> crate::Span {
    let start = part.as_ptr() as usize - whole.as_ptr() as usize;
    start..start + part.len()
}
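
// Illustrative sketch added for exposition: a few tests that exercise the lexer
// defined above. The module and test names are our own, the inputs are made-up
// examples, and the asserted token shapes are derived from reading this file
// (not from an external specification); they assume the crate is built with the
// standard test harness.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lex_simple_filter() {
        let code = ".a | map(.x)";
        let tokens = Lexer::new(code).lex().expect("lexing should succeed");
        // Expected shape: `.a`, `|`, `map`, and one `(...)` block.
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].as_str(), ".a");
        assert_eq!(tokens[1].as_str(), "|");
        assert_eq!(tokens[2].as_str(), "map");
        assert_eq!(tokens[3].as_str(), "(.x)");
        // Token spans are byte ranges into the input.
        assert_eq!(tokens[2].span(code), 5..8);
    }

    #[test]
    fn lex_unterminated_string() {
        // A missing closing quote is reported as a `Delim` error.
        let errs = Lexer::new("\"abc").lex().unwrap_err();
        assert_eq!(errs.len(), 1);
        assert!(matches!(errs[0], (Expect::Delim("\""), _)));
    }
}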