prql_parser/
lexer.rs

1use chumsky::{
2    error::Cheap,
3    prelude::*,
4    text::{newline, Character},
5};
6
7use prql_ast::expr::*;
8
/// A lexical token produced by `lexer`.
#[derive(Clone, PartialEq, Debug)]
pub enum Token {
    /// A line break.
    NewLine,

    /// An identifier, as parsed by `ident_part`.
    Ident(String),
    /// A reserved word recognised by the lexer (e.g. `let`, `func`).
    Keyword(String),
    /// A literal value (number, string, boolean, null, date/time, ...).
    Literal(Literal),
    /// A `$`-prefixed parameter; holds the text that follows the `$`.
    Param(String),

    /// A `..` range operator. Each flag is `true` when there was no
    /// whitespace on that side of the `..`.
    Range {
        bind_left: bool,
        bind_right: bool,
    },
    /// A string prefixed by `s` or `f`: the prefix character and the string
    /// contents.
    Interpolation(char, String),

    /// single-char control tokens
    Control(char),

    ArrowThin,   // ->
    ArrowFat,    // =>
    Eq,          // ==
    Ne,          // !=
    Gte,         // >=
    Lte,         // <=
    RegexSearch, // ~=
    And,         // &&
    Or,          // ||
    Coalesce,    // ??
    DivInt,      // //
    Annotate,    // @
}
40
/// Builds the lexer: a parser from a stream of `char`s to a list of
/// `(Token, span)` pairs, where the span is the character range the token
/// was read from.
///
/// Comments, inline whitespace and backslash line continuations are skipped
/// and produce no tokens. The whole input must be consumed (`end()`).
pub fn lexer() -> impl Parser<char, Vec<(Token, std::ops::Range<usize>)>, Error = Cheap<char>> {
    // One or more inline (non-newline) whitespace characters.
    let whitespace = filter(|x: &char| x.is_inline_whitespace())
        .repeated()
        .at_least(1)
        .ignored();

    // Multi-character operators. These are tried before single-character
    // `control` tokens in `token` below, so `==` is not lexed as two `=`.
    let control_multi = choice((
        just("->").to(Token::ArrowThin),
        just("=>").to(Token::ArrowFat),
        just("==").to(Token::Eq),
        just("!=").to(Token::Ne),
        just(">=").to(Token::Gte),
        just("<=").to(Token::Lte),
        just("~=").to(Token::RegexSearch),
        // `&&` and `||` only count as operators when followed by an
        // expression terminator (see `end_expr`).
        just("&&").then_ignore(end_expr()).to(Token::And),
        just("||").then_ignore(end_expr()).to(Token::Or),
        just("??").to(Token::Coalesce),
        just("//").to(Token::DivInt),
        // `@` is an annotation marker only when the next character is not a
        // digit; the look-ahead is rewound so nothing after `@` is consumed.
        just("@").then(digits(1).not().rewind()).to(Token::Annotate),
    ));

    let control = one_of("></%=+-*[]().,:|!{}").map(Token::Control);

    let ident = ident_part().map(Token::Ident);

    // Reserved words; each must be followed by an expression terminator so
    // that a longer identifier starting with a keyword is not split.
    let keyword = choice((
        just("let"),
        just("into"),
        just("case"),
        just("prql"),
        just("type"),
        just("module"),
        just("internal"),
        just("func"),
    ))
    .then_ignore(end_expr())
    .map(|x| x.to_string())
    .map(Token::Keyword);

    let literal = literal().map(Token::Literal);

    // `$` followed by zero or more alphanumerics, `_` or `.`; the `$` itself
    // is dropped from the stored text.
    let param = just('$')
        .ignore_then(filter(|c: &char| c.is_alphanumeric() || *c == '_' || *c == '.').repeated())
        .collect::<String>()
        .map(Token::Param);

    // `s"..."` / `f"..."`: a one-character prefix directly followed by a
    // quoted string (with escapes enabled).
    let interpolation = one_of("sf")
        .then(quoted_string(true))
        .map(|(c, s)| Token::Interpolation(c, s));

    // I think declaring this and then cloning will be more performant than
    // calling the function on each invocation.
    // https://github.com/zesterer/chumsky/issues/501 would allow us to avoid
    // this, and let us split up this giant function without sacrificing
    // performance.
    let newline = newline();

    // A single token. `choice` takes the first alternative that matches, so
    // the order here is significant (e.g. `literal` and `keyword` are tried
    // before `ident`).
    let token = choice((
        newline.to(Token::NewLine),
        control_multi,
        interpolation,
        param,
        control,
        literal,
        keyword,
        ident,
    ))
    // Error recovery strategy from chumsky.
    // NOTE(review): presumably skips the offending input and retries so that
    // lexing can continue after a bad character — confirm against the
    // chumsky `skip_then_retry_until` docs.
    .recover_with(skip_then_retry_until([]).skip_start());

    // `#` up to (not including) the end of the line. Consecutive comment
    // lines, separated by a newline and optional leading whitespace, are
    // consumed as a single comment.
    let comment = just('#')
        .then(newline.not().repeated())
        .separated_by(newline.then(whitespace.or_not()))
        .at_least(1)
        .ignored();

    // `..` with optional whitespace on either side. The presence or absence
    // of that whitespace is recorded in `bind_left` / `bind_right`, and the
    // span covers the whitespace as well as the dots.
    let range = (whitespace.or_not())
        .then_ignore(just(".."))
        .then(whitespace.or_not())
        .map(|(left, right)| Token::Range {
            bind_left: left.is_none(),
            bind_right: right.is_none(),
        })
        .map_with_span(|tok, span| (tok, span));

    // A line continuation: a newline, any number of blank or comment-only
    // lines, then optional whitespace and a backslash. All of it is
    // discarded, so no `NewLine` token is emitted.
    let line_wrap = newline
        .then(
            // We can optionally have an empty line, or a line with a comment,
            // between the initial line and the continued line
            whitespace
                .or_not()
                .then(comment.or_not())
                .then(newline)
                .repeated(),
        )
        .then(whitespace.repeated())
        .then(just('\\'))
        .ignored();

    // Everything that may appear between tokens without producing one.
    let ignored = choice((comment, whitespace, line_wrap)).repeated();

    // `range` is tried first so it can observe the whitespace around `..`
    // before `ignored` would otherwise consume it.
    choice((
        range,
        ignored.ignore_then(token.map_with_span(|tok, span| (tok, span))),
    ))
    .repeated()
    .then_ignore(ignored)
    .then_ignore(end())
}
149
150pub fn ident_part() -> impl Parser<char, String, Error = Cheap<char>> + Clone {
151    let plain = filter(|c: &char| c.is_alphabetic() || *c == '_')
152        .chain(filter(|c: &char| c.is_alphanumeric() || *c == '_').repeated());
153
154    let backticks = none_of('`').repeated().delimited_by(just('`'), just('`'));
155
156    plain.or(backticks).collect()
157}
158
159fn literal() -> impl Parser<char, Literal, Error = Cheap<char>> {
160    let exp = one_of("eE").chain(one_of("+-").or_not().chain::<char, _, _>(text::digits(10)));
161
162    let integer = filter(|c: &char| c.is_ascii_digit() && *c != '0')
163        .chain::<_, Vec<char>, _>(filter(|c: &char| c.is_ascii_digit() || *c == '_').repeated())
164        .or(just('0').map(|c| vec![c]));
165
166    let frac = just('.')
167        .chain::<char, _, _>(filter(|c: &char| c.is_ascii_digit()))
168        .chain::<char, _, _>(filter(|c: &char| c.is_ascii_digit() || *c == '_').repeated());
169
170    let number = one_of("+-")
171        .or_not()
172        .chain::<char, _, _>(integer)
173        .chain::<char, _, _>(frac.or_not().flatten())
174        .chain::<char, _, _>(exp.or_not().flatten())
175        .try_map(|chars, span| {
176            let str = chars.into_iter().filter(|c| *c != '_').collect::<String>();
177
178            if let Ok(i) = str.parse::<i64>() {
179                Ok(Literal::Integer(i))
180            } else if let Ok(f) = str.parse::<f64>() {
181                Ok(Literal::Float(f))
182            } else {
183                Err(Cheap::expected_input_found(span, None, None))
184            }
185        })
186        .labelled("number");
187
188    let string = quoted_string(true).map(Literal::String);
189
190    let raw_string = just("r")
191        .ignore_then(quoted_string(false))
192        .map(Literal::String);
193
194    let bool = (just("true").to(true))
195        .or(just("false").to(false))
196        .then_ignore(end_expr())
197        .map(Literal::Boolean);
198
199    let null = just("null").to(Literal::Null).then_ignore(end_expr());
200
201    let value_and_unit = integer
202        .then(choice((
203            just("microseconds"),
204            just("milliseconds"),
205            just("seconds"),
206            just("minutes"),
207            just("hours"),
208            just("days"),
209            just("weeks"),
210            just("months"),
211            just("years"),
212        )))
213        .then_ignore(end_expr())
214        .try_map(|(number, unit), span| {
215            let str = number.into_iter().filter(|c| *c != '_').collect::<String>();
216            if let Ok(n) = str.parse::<i64>() {
217                let unit = unit.to_string();
218                Ok(ValueAndUnit { n, unit })
219            } else {
220                Err(Cheap::expected_input_found(span, None, None))
221            }
222        })
223        .map(Literal::ValueAndUnit);
224
225    let date_inner = digits(4)
226        .chain(just('-'))
227        .chain::<char, _, _>(digits(2))
228        .chain::<char, _, _>(just('-'))
229        .chain::<char, _, _>(digits(2))
230        .boxed();
231
232    let time_inner = digits(2)
233        // minutes
234        .chain::<char, _, _>(just(':').chain(digits(2)).or_not().flatten())
235        // seconds
236        .chain::<char, _, _>(just(':').chain(digits(2)).or_not().flatten())
237        // milliseconds
238        .chain::<char, _, _>(
239            just('.')
240                .chain(
241                    filter(|c: &char| c.is_ascii_digit())
242                        .repeated()
243                        .at_least(1)
244                        .at_most(6),
245                )
246                .or_not()
247                .flatten(),
248        )
249        // timezone offset
250        .chain::<char, _, _>(
251            one_of("-+")
252                .chain(
253                    (digits(2).then_ignore(just(':').or_not()).chain(digits(2)))
254                        .or(just('Z').map(|x| vec![x])),
255                )
256                .or_not()
257                .flatten(),
258        )
259        .boxed();
260
261    // Not an annotation
262    let dt_prefix = just('@').then(just('{').not().rewind());
263
264    let date = dt_prefix
265        .ignore_then(date_inner.clone())
266        .then_ignore(end_expr())
267        .collect::<String>()
268        .map(Literal::Date);
269
270    let time = dt_prefix
271        .ignore_then(time_inner.clone())
272        .then_ignore(end_expr())
273        .collect::<String>()
274        .map(Literal::Time);
275
276    let datetime = dt_prefix
277        .ignore_then(date_inner)
278        .chain(just('T'))
279        .chain::<char, _, _>(time_inner)
280        .then_ignore(end_expr())
281        .collect::<String>()
282        .map(Literal::Timestamp);
283
284    choice((
285        string,
286        raw_string,
287        value_and_unit,
288        number,
289        bool,
290        null,
291        datetime,
292        date,
293        time,
294    ))
295}
296
297fn quoted_string(escaped: bool) -> impl Parser<char, String, Error = Cheap<char>> {
298    choice((
299        quoted_string_of_quote(&'"', escaped),
300        quoted_string_of_quote(&'\'', escaped),
301    ))
302    .collect::<String>()
303    .labelled("string")
304}
305
306fn quoted_string_of_quote(
307    quote: &char,
308    escaping: bool,
309) -> impl Parser<char, Vec<char>, Error = Cheap<char>> + '_ {
310    let opening = just(*quote).repeated().at_least(1);
311
312    opening.then_with(move |opening| {
313        if opening.len() % 2 == 0 {
314            // If we have an even number of quotes, it's an empty string.
315            return (just(vec![])).boxed();
316        }
317        let delimiter = just(*quote).repeated().exactly(opening.len());
318
319        let inner = if escaping {
320            choice((
321                // If we're escaping, don't allow consuming a backslash
322                // We need the `vec` to satisfy the type checker
323                (delimiter.or(just(vec!['\\']))).not(),
324                escaped_character(),
325                // Or escape the quote char of the current string
326                just('\\').ignore_then(just(*quote)),
327            ))
328            .boxed()
329        } else {
330            delimiter.not().boxed()
331        };
332
333        inner.repeated().then_ignore(delimiter).boxed()
334    })
335}
336
337fn escaped_character() -> impl Parser<char, char, Error = Cheap<char>> {
338    just('\\').ignore_then(choice((
339        just('\\'),
340        just('/'),
341        just('b').to('\x08'),
342        just('f').to('\x0C'),
343        just('n').to('\n'),
344        just('r').to('\r'),
345        just('t').to('\t'),
346        (just('u').ignore_then(
347            filter(|c: &char| c.is_ascii_hexdigit())
348                .repeated()
349                .exactly(4)
350                .collect::<String>()
351                .validate(|digits, span, emit| {
352                    char::from_u32(u32::from_str_radix(&digits, 16).unwrap()).unwrap_or_else(|| {
353                        emit(Cheap::expected_input_found(span, None, None));
354                        '\u{FFFD}' // unicode replacement character
355                    })
356                }),
357        )),
358    )))
359}
360
361fn digits(count: usize) -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
362    filter(|c: &char| c.is_ascii_digit())
363        .repeated()
364        .exactly(count)
365}
366
367fn end_expr() -> impl Parser<char, (), Error = Cheap<char>> {
368    choice((
369        end(),
370        one_of(",)]}\t >").ignored(),
371        newline(),
372        just("..").ignored(),
373    ))
374    .rewind()
375}
376
377impl Token {
378    pub fn range(bind_left: bool, bind_right: bool) -> Self {
379        Token::Range {
380            bind_left,
381            bind_right,
382        }
383    }
384}
385
386// This is here because Literal::Float(f64) does not implement Hash, so we cannot simply derive it.
387// There are reasons for that, but chumsky::Error needs Hash for the Token, so it can deduplicate
388// tokens in error.
389// So this hack could lead to duplicated tokens in error messages. Oh no.
390#[allow(clippy::derived_hash_with_manual_eq)]
391impl std::hash::Hash for Token {
392    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
393        core::mem::discriminant(self).hash(state);
394    }
395}
396
// `PartialEq` is derived on `Token`; this marker impl additionally declares
// that equality to be a full equivalence relation (`Eq`).
// NOTE(review): `Token::Literal` can carry a float, so equality may not be
// reflexive for NaN values — confirm this is acceptable.
impl std::cmp::Eq for Token {}
398
// Checks that a line beginning with `\` continues the previous line: the
// newline and the backslash produce no tokens, and the spans of the following
// tokens still refer to positions in the original input.
#[test]
fn test_line_wrap() {
    use insta::assert_debug_snapshot;

    // A plain continuation line.
    // (TODO: is there a terser way of writing our lexer output?)
    assert_debug_snapshot!(lexer().parse(r"5 +
    \ 3 "
        ).unwrap(), @r###"
    [
        (
            Literal(
                Integer(
                    5,
                ),
            ),
            0..1,
        ),
        (
            Control(
                '+',
            ),
            2..3,
        ),
        (
            Literal(
                Integer(
                    3,
                ),
            ),
            10..11,
        ),
    ]
    "###);

    // Comments get skipped over
    // (including comment lines that sit between the first line and the
    // continuation line, with or without leading whitespace).
    assert_debug_snapshot!(lexer().parse(r"5 +
# comment
   # comment with whitespace
  \ 3 "
        ).unwrap(), @r###"
    [
        (
            Literal(
                Integer(
                    5,
                ),
            ),
            0..1,
        ),
        (
            Control(
                '+',
            ),
            2..3,
        ),
        (
            Literal(
                Integer(
                    3,
                ),
            ),
            47..48,
        ),
    ]
    "###);
}
465
// Exercises `quoted_string` with different delimiter lengths, with and
// without escape processing. None of these parsers require end-of-input, so
// trailing characters after the closing delimiter are simply left unparsed.
#[test]
fn quotes() {
    use insta::assert_snapshot;

    // All these are valid & equal.
    // (Any odd number of quotes may be used as the delimiter.)
    assert_snapshot!(quoted_string(false).parse(r#"'aoeu'"#).unwrap(), @"aoeu");
    assert_snapshot!(quoted_string(false).parse(r#"'''aoeu'''"#).unwrap(), @"aoeu");
    assert_snapshot!(quoted_string(false).parse(r#"'''''aoeu'''''"#).unwrap(), @"aoeu");
    assert_snapshot!(quoted_string(false).parse(r#"'''''''aoeu'''''''"#).unwrap(), @"aoeu");

    // An even number is interpreted as a closed string (and the remainder is unparsed)
    assert_snapshot!(quoted_string(false).parse(r#"''aoeu''"#).unwrap(), @"");

    // When not escaping, we take the inner string between the three quotes
    // (backslashes are kept verbatim).
    assert_snapshot!(quoted_string(false).parse(r#""""\"hello\""""#).unwrap(), @r###"\"hello\"###);

    // When escaping, `\"` inside the triple-quoted string becomes `"`.
    assert_snapshot!(quoted_string(true).parse(r#""""\"hello\"""""#).unwrap(), @r###""hello""###);

    // Escape each inner quote depending on the outer quote
    assert_snapshot!(quoted_string(true).parse(r#""\"hello\"""#).unwrap(), @r###""hello""###);
    assert_snapshot!(quoted_string(true).parse(r"'\'hello\''").unwrap(), @"'hello'");

    // Two quotes with nothing between them: the empty string.
    assert_snapshot!(quoted_string(true).parse(r#"''"#).unwrap(), @"");

    // An empty input should fail
    quoted_string(false).parse(r#""#).unwrap_err();

    // An even number of quotes is an empty string
    assert_snapshot!(quoted_string(true).parse(r#"''''''"#).unwrap(), @"");
}