prqlc_parser/lexer/
mod.rs

1use chumsky::error::Cheap;
2use chumsky::prelude::*;
3use chumsky::text::{newline, Character};
4
5use self::lr::{Literal, Token, TokenKind, ValueAndUnit};
6use crate::error::{Error, ErrorSource, Reason, WithErrorInfo};
7use crate::span::Span;
8
9pub mod lr;
10#[cfg(test)]
11mod test;
12
13/// Lex PRQL into LR, returning both the LR and any errors encountered
14pub fn lex_source_recovery(source: &str, source_id: u16) -> (Option<Vec<Token>>, Vec<Error>) {
15    let (tokens, lex_errors) = lexer().parse_recovery(source);
16
17    let tokens = tokens.map(insert_start);
18
19    let errors = lex_errors
20        .into_iter()
21        .map(|e| convert_lexer_error(source, e, source_id))
22        .collect();
23
24    log::debug!("lex errors: {:?}", errors);
25    (tokens, errors)
26}
27
28/// Lex PRQL into LR, returning either the LR or the errors encountered
29pub fn lex_source(source: &str) -> Result<lr::Tokens, Vec<Error>> {
30    lexer()
31        .parse(source)
32        .map(insert_start)
33        .map(lr::Tokens)
34        .map_err(|e| {
35            e.into_iter()
36                .map(|x| convert_lexer_error(source, x, 0))
37                .collect()
38        })
39}
40
41/// Insert a start token so later stages can treat the start of a file like a newline
42fn insert_start(tokens: Vec<Token>) -> Vec<Token> {
43    std::iter::once(Token {
44        kind: TokenKind::Start,
45        span: 0..0,
46    })
47    .chain(tokens)
48    .collect()
49}
50
51fn convert_lexer_error(source: &str, e: chumsky::error::Cheap<char>, source_id: u16) -> Error {
52    // We want to slice based on the chars, not the bytes, so can't just index
53    // into the str.
54    let found = source
55        .chars()
56        .skip(e.span().start)
57        .take(e.span().end() - e.span().start)
58        .collect();
59    let span = Some(Span {
60        start: e.span().start,
61        end: e.span().end,
62        source_id,
63    });
64
65    Error::new(Reason::Unexpected { found })
66        .with_span(span)
67        .with_source(ErrorSource::Lexer(e))
68}
69
70/// Lex chars to tokens until the end of the input
71pub(crate) fn lexer() -> impl Parser<char, Vec<Token>, Error = Cheap<char>> {
72    lex_token()
73        .repeated()
74        .then_ignore(ignored())
75        .then_ignore(end())
76}
77
78/// Lex chars to a single token
79fn lex_token() -> impl Parser<char, Token, Error = Cheap<char>> {
80    let control_multi = choice((
81        just("->").to(TokenKind::ArrowThin),
82        just("=>").to(TokenKind::ArrowFat),
83        just("==").to(TokenKind::Eq),
84        just("!=").to(TokenKind::Ne),
85        just(">=").to(TokenKind::Gte),
86        just("<=").to(TokenKind::Lte),
87        just("~=").to(TokenKind::RegexSearch),
88        just("&&").then_ignore(end_expr()).to(TokenKind::And),
89        just("||").then_ignore(end_expr()).to(TokenKind::Or),
90        just("??").to(TokenKind::Coalesce),
91        just("//").to(TokenKind::DivInt),
92        just("**").to(TokenKind::Pow),
93        just("@")
94            .then(digits(1).not().rewind())
95            .to(TokenKind::Annotate),
96    ));
97
98    let control = one_of("></%=+-*[]().,:|!{}").map(TokenKind::Control);
99
100    let ident = ident_part().map(TokenKind::Ident);
101
102    let keyword = choice((
103        just("let"),
104        just("into"),
105        just("case"),
106        just("prql"),
107        just("type"),
108        just("module"),
109        just("internal"),
110        just("func"),
111        just("import"),
112        just("enum"),
113    ))
114    .then_ignore(end_expr())
115    .map(|x| x.to_string())
116    .map(TokenKind::Keyword);
117
118    let literal = literal().map(TokenKind::Literal);
119
120    let param = just('$')
121        .ignore_then(filter(|c: &char| c.is_alphanumeric() || *c == '_' || *c == '.').repeated())
122        .collect::<String>()
123        .map(TokenKind::Param);
124
125    let interpolation = one_of("sf")
126        .then(quoted_string(true))
127        .map(|(c, s)| TokenKind::Interpolation(c, s));
128
129    let token = choice((
130        line_wrap(),
131        newline().to(TokenKind::NewLine),
132        control_multi,
133        interpolation,
134        param,
135        control,
136        literal,
137        keyword,
138        ident,
139        comment(),
140    ))
141    .recover_with(skip_then_retry_until([]).skip_start());
142
143    let range = (whitespace().or_not())
144        .then_ignore(just(".."))
145        .then(whitespace().or_not())
146        .map(|(left, right)| TokenKind::Range {
147            // If there was no whitespace before (after), then we mark the range
148            // as bound on the left (right).
149            bind_left: left.is_none(),
150            bind_right: right.is_none(),
151        })
152        .map_with_span(|kind, span| Token { kind, span });
153
154    choice((
155        range,
156        ignored().ignore_then(token.map_with_span(|kind, span| Token { kind, span })),
157    ))
158}
159
160fn ignored() -> impl Parser<char, (), Error = Cheap<char>> {
161    whitespace().repeated().ignored()
162}
163
164fn whitespace() -> impl Parser<char, (), Error = Cheap<char>> {
165    filter(|x: &char| x.is_inline_whitespace())
166        .repeated()
167        .at_least(1)
168        .ignored()
169}
170
171fn line_wrap() -> impl Parser<char, TokenKind, Error = Cheap<char>> {
172    newline()
173        .ignore_then(
174            whitespace()
175                .repeated()
176                .ignore_then(comment())
177                .then_ignore(newline())
178                .repeated(),
179        )
180        .then_ignore(whitespace().repeated())
181        .then_ignore(just('\\'))
182        .map(TokenKind::LineWrap)
183}
184
185fn comment() -> impl Parser<char, TokenKind, Error = Cheap<char>> {
186    just('#').ignore_then(choice((
187        // One option would be to check that doc comments have new lines in the
188        // lexer (we currently do in the parser); which would give better error
189        // messages?
190        just('!').ignore_then(
191            newline()
192                .not()
193                .repeated()
194                .collect::<String>()
195                .map(TokenKind::DocComment),
196        ),
197        newline()
198            .not()
199            .repeated()
200            .collect::<String>()
201            .map(TokenKind::Comment),
202    )))
203}
204
205pub(crate) fn ident_part() -> impl Parser<char, String, Error = Cheap<char>> + Clone {
206    let plain = filter(|c: &char| c.is_alphabetic() || *c == '_')
207        .chain(filter(|c: &char| c.is_alphanumeric() || *c == '_').repeated());
208
209    let backticks = none_of('`').repeated().delimited_by(just('`'), just('`'));
210
211    plain.or(backticks).collect()
212}
213
214fn literal() -> impl Parser<char, Literal, Error = Cheap<char>> {
215    let binary_notation = just("0b")
216        .then_ignore(just("_").or_not())
217        .ignore_then(
218            filter(|c: &char| *c == '0' || *c == '1')
219                .repeated()
220                .at_least(1)
221                .at_most(32)
222                .collect::<String>()
223                .try_map(|digits, _| {
224                    Ok(Literal::Integer(i64::from_str_radix(&digits, 2).unwrap()))
225                }),
226        )
227        .labelled("number");
228
229    let hexadecimal_notation = just("0x")
230        .then_ignore(just("_").or_not())
231        .ignore_then(
232            filter(|c: &char| c.is_ascii_hexdigit())
233                .repeated()
234                .at_least(1)
235                .at_most(12)
236                .collect::<String>()
237                .try_map(|digits, _| {
238                    Ok(Literal::Integer(i64::from_str_radix(&digits, 16).unwrap()))
239                }),
240        )
241        .labelled("number");
242
243    let octal_notation = just("0o")
244        .then_ignore(just("_").or_not())
245        .ignore_then(
246            filter(|&c| ('0'..='7').contains(&c))
247                .repeated()
248                .at_least(1)
249                .at_most(12)
250                .collect::<String>()
251                .try_map(|digits, _| {
252                    Ok(Literal::Integer(i64::from_str_radix(&digits, 8).unwrap()))
253                }),
254        )
255        .labelled("number");
256
257    let exp = one_of("eE").chain(one_of("+-").or_not().chain::<char, _, _>(text::digits(10)));
258
259    let integer = filter(|c: &char| c.is_ascii_digit() && *c != '0')
260        .chain::<_, Vec<char>, _>(filter(|c: &char| c.is_ascii_digit() || *c == '_').repeated())
261        .or(just('0').map(|c| vec![c]));
262
263    let frac = just('.')
264        .chain::<char, _, _>(filter(|c: &char| c.is_ascii_digit()))
265        .chain::<char, _, _>(filter(|c: &char| c.is_ascii_digit() || *c == '_').repeated());
266
267    let number = integer
268        .chain::<char, _, _>(frac.or_not().flatten())
269        .chain::<char, _, _>(exp.or_not().flatten())
270        .try_map(|chars, span| {
271            let str = chars.into_iter().filter(|c| *c != '_').collect::<String>();
272
273            if let Ok(i) = str.parse::<i64>() {
274                Ok(Literal::Integer(i))
275            } else if let Ok(f) = str.parse::<f64>() {
276                Ok(Literal::Float(f))
277            } else {
278                Err(Cheap::expected_input_found(span, None, None))
279            }
280        })
281        .labelled("number");
282
283    let string = quoted_string(true).map(Literal::String);
284
285    let raw_string = just("r")
286        .ignore_then(quoted_string(false))
287        .map(Literal::RawString);
288
289    let bool = (just("true").to(true))
290        .or(just("false").to(false))
291        .then_ignore(end_expr())
292        .map(Literal::Boolean);
293
294    let null = just("null").to(Literal::Null).then_ignore(end_expr());
295
296    let value_and_unit = integer
297        .then(choice((
298            just("microseconds"),
299            just("milliseconds"),
300            just("seconds"),
301            just("minutes"),
302            just("hours"),
303            just("days"),
304            just("weeks"),
305            just("months"),
306            just("years"),
307        )))
308        .then_ignore(end_expr())
309        .try_map(|(number, unit), span| {
310            let str = number.into_iter().filter(|c| *c != '_').collect::<String>();
311            if let Ok(n) = str.parse::<i64>() {
312                let unit = unit.to_string();
313                Ok(ValueAndUnit { n, unit })
314            } else {
315                Err(Cheap::expected_input_found(span, None, None))
316            }
317        })
318        .map(Literal::ValueAndUnit);
319
320    let date_inner = digits(4)
321        .chain(just('-'))
322        .chain::<char, _, _>(digits(2))
323        .chain::<char, _, _>(just('-'))
324        .chain::<char, _, _>(digits(2))
325        .boxed();
326
327    let time_inner = digits(2)
328        // minutes
329        .chain::<char, _, _>(just(':').chain(digits(2)).or_not().flatten())
330        // seconds
331        .chain::<char, _, _>(just(':').chain(digits(2)).or_not().flatten())
332        // milliseconds
333        .chain::<char, _, _>(
334            just('.')
335                .chain(
336                    filter(|c: &char| c.is_ascii_digit())
337                        .repeated()
338                        .at_least(1)
339                        .at_most(6),
340                )
341                .or_not()
342                .flatten(),
343        )
344        // timezone offset
345        .chain::<char, _, _>(
346            choice((
347                // Either just `Z`
348                just('Z').map(|x| vec![x]),
349                // Or an offset, such as `-05:00` or `-0500`
350                one_of("-+").chain(
351                    digits(2)
352                        .then_ignore(just(':').or_not())
353                        .chain::<char, _, _>(digits(2)),
354                ),
355            ))
356            .or_not(),
357        )
358        .boxed();
359
360    // Not an annotation
361    let dt_prefix = just('@').then(just('{').not().rewind());
362
363    let date = dt_prefix
364        .ignore_then(date_inner.clone())
365        .then_ignore(end_expr())
366        .collect::<String>()
367        .map(Literal::Date);
368
369    let time = dt_prefix
370        .ignore_then(time_inner.clone())
371        .then_ignore(end_expr())
372        .collect::<String>()
373        .map(Literal::Time);
374
375    let datetime = dt_prefix
376        .ignore_then(date_inner)
377        .chain(just('T'))
378        .chain::<char, _, _>(time_inner)
379        .then_ignore(end_expr())
380        .collect::<String>()
381        .map(Literal::Timestamp);
382
383    choice((
384        binary_notation,
385        hexadecimal_notation,
386        octal_notation,
387        string,
388        raw_string,
389        value_and_unit,
390        number,
391        bool,
392        null,
393        datetime,
394        date,
395        time,
396    ))
397}
398
399fn quoted_string(escaped: bool) -> impl Parser<char, String, Error = Cheap<char>> {
400    choice((
401        quoted_string_of_quote(&'"', escaped),
402        quoted_string_of_quote(&'\'', escaped),
403    ))
404    .collect::<String>()
405    .labelled("string")
406}
407
408fn quoted_string_of_quote(
409    quote: &char,
410    escaping: bool,
411) -> impl Parser<char, Vec<char>, Error = Cheap<char>> + '_ {
412    let opening = just(*quote).repeated().at_least(1);
413
414    opening.then_with(move |opening| {
415        if opening.len() % 2 == 0 {
416            // If we have an even number of quotes, it's an empty string.
417            return (just(vec![])).boxed();
418        }
419        let delimiter = just(*quote).repeated().exactly(opening.len());
420
421        let inner = if escaping {
422            choice((
423                // If we're escaping, don't allow consuming a backslash
424                // We need the `vec` to satisfy the type checker
425                (delimiter.or(just(vec!['\\']))).not(),
426                escaped_character(),
427                // Or escape the quote char of the current string
428                just('\\').ignore_then(just(*quote)),
429            ))
430            .boxed()
431        } else {
432            delimiter.not().boxed()
433        };
434
435        inner.repeated().then_ignore(delimiter).boxed()
436    })
437}
438
439fn escaped_character() -> impl Parser<char, char, Error = Cheap<char>> {
440    just('\\').ignore_then(choice((
441        just('\\'),
442        just('/'),
443        just('b').to('\x08'),
444        just('f').to('\x0C'),
445        just('n').to('\n'),
446        just('r').to('\r'),
447        just('t').to('\t'),
448        (just("u{").ignore_then(
449            filter(|c: &char| c.is_ascii_hexdigit())
450                .repeated()
451                .at_least(1)
452                .at_most(6)
453                .collect::<String>()
454                .validate(|digits, span, emit| {
455                    char::from_u32(u32::from_str_radix(&digits, 16).unwrap()).unwrap_or_else(|| {
456                        emit(Cheap::expected_input_found(span, None, None));
457                        '\u{FFFD}' // Unicode replacement character
458                    })
459                })
460                .then_ignore(just('}')),
461        )),
462        (just('x').ignore_then(
463            filter(|c: &char| c.is_ascii_hexdigit())
464                .repeated()
465                .exactly(2)
466                .collect::<String>()
467                .validate(|digits, span, emit| {
468                    char::from_u32(u32::from_str_radix(&digits, 16).unwrap()).unwrap_or_else(|| {
469                        emit(Cheap::expected_input_found(span, None, None));
470                        '\u{FFFD}'
471                    })
472                }),
473        )),
474    )))
475}
476
477fn digits(count: usize) -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
478    filter(|c: &char| c.is_ascii_digit())
479        .repeated()
480        .exactly(count)
481}
482
483fn end_expr() -> impl Parser<char, (), Error = Cheap<char>> {
484    choice((
485        end(),
486        one_of(",)]}\t >").ignored(),
487        newline(),
488        just("..").ignored(),
489    ))
490    .rewind()
491}