1use chumsky::error::Cheap;
2use chumsky::prelude::*;
3use chumsky::text::{newline, Character};
4
5use self::lr::{Literal, Token, TokenKind, ValueAndUnit};
6use crate::error::{Error, ErrorSource, Reason, WithErrorInfo};
7use crate::span::Span;
8
9pub mod lr;
10#[cfg(test)]
11mod test;
12
13pub fn lex_source_recovery(source: &str, source_id: u16) -> (Option<Vec<Token>>, Vec<Error>) {
15 let (tokens, lex_errors) = lexer().parse_recovery(source);
16
17 let tokens = tokens.map(insert_start);
18
19 let errors = lex_errors
20 .into_iter()
21 .map(|e| convert_lexer_error(source, e, source_id))
22 .collect();
23
24 log::debug!("lex errors: {:?}", errors);
25 (tokens, errors)
26}
27
28pub fn lex_source(source: &str) -> Result<lr::Tokens, Vec<Error>> {
30 lexer()
31 .parse(source)
32 .map(insert_start)
33 .map(lr::Tokens)
34 .map_err(|e| {
35 e.into_iter()
36 .map(|x| convert_lexer_error(source, x, 0))
37 .collect()
38 })
39}
40
41fn insert_start(tokens: Vec<Token>) -> Vec<Token> {
43 std::iter::once(Token {
44 kind: TokenKind::Start,
45 span: 0..0,
46 })
47 .chain(tokens)
48 .collect()
49}
50
51fn convert_lexer_error(source: &str, e: chumsky::error::Cheap<char>, source_id: u16) -> Error {
52 let found = source
55 .chars()
56 .skip(e.span().start)
57 .take(e.span().end() - e.span().start)
58 .collect();
59 let span = Some(Span {
60 start: e.span().start,
61 end: e.span().end,
62 source_id,
63 });
64
65 Error::new(Reason::Unexpected { found })
66 .with_span(span)
67 .with_source(ErrorSource::Lexer(e))
68}
69
70pub(crate) fn lexer() -> impl Parser<char, Vec<Token>, Error = Cheap<char>> {
72 lex_token()
73 .repeated()
74 .then_ignore(ignored())
75 .then_ignore(end())
76}
77
78fn lex_token() -> impl Parser<char, Token, Error = Cheap<char>> {
80 let control_multi = choice((
81 just("->").to(TokenKind::ArrowThin),
82 just("=>").to(TokenKind::ArrowFat),
83 just("==").to(TokenKind::Eq),
84 just("!=").to(TokenKind::Ne),
85 just(">=").to(TokenKind::Gte),
86 just("<=").to(TokenKind::Lte),
87 just("~=").to(TokenKind::RegexSearch),
88 just("&&").then_ignore(end_expr()).to(TokenKind::And),
89 just("||").then_ignore(end_expr()).to(TokenKind::Or),
90 just("??").to(TokenKind::Coalesce),
91 just("//").to(TokenKind::DivInt),
92 just("**").to(TokenKind::Pow),
93 just("@")
94 .then(digits(1).not().rewind())
95 .to(TokenKind::Annotate),
96 ));
97
98 let control = one_of("></%=+-*[]().,:|!{}").map(TokenKind::Control);
99
100 let ident = ident_part().map(TokenKind::Ident);
101
102 let keyword = choice((
103 just("let"),
104 just("into"),
105 just("case"),
106 just("prql"),
107 just("type"),
108 just("module"),
109 just("internal"),
110 just("func"),
111 just("import"),
112 just("enum"),
113 ))
114 .then_ignore(end_expr())
115 .map(|x| x.to_string())
116 .map(TokenKind::Keyword);
117
118 let literal = literal().map(TokenKind::Literal);
119
120 let param = just('$')
121 .ignore_then(filter(|c: &char| c.is_alphanumeric() || *c == '_' || *c == '.').repeated())
122 .collect::<String>()
123 .map(TokenKind::Param);
124
125 let interpolation = one_of("sf")
126 .then(quoted_string(true))
127 .map(|(c, s)| TokenKind::Interpolation(c, s));
128
129 let token = choice((
130 line_wrap(),
131 newline().to(TokenKind::NewLine),
132 control_multi,
133 interpolation,
134 param,
135 control,
136 literal,
137 keyword,
138 ident,
139 comment(),
140 ))
141 .recover_with(skip_then_retry_until([]).skip_start());
142
143 let range = (whitespace().or_not())
144 .then_ignore(just(".."))
145 .then(whitespace().or_not())
146 .map(|(left, right)| TokenKind::Range {
147 bind_left: left.is_none(),
150 bind_right: right.is_none(),
151 })
152 .map_with_span(|kind, span| Token { kind, span });
153
154 choice((
155 range,
156 ignored().ignore_then(token.map_with_span(|kind, span| Token { kind, span })),
157 ))
158}
159
160fn ignored() -> impl Parser<char, (), Error = Cheap<char>> {
161 whitespace().repeated().ignored()
162}
163
164fn whitespace() -> impl Parser<char, (), Error = Cheap<char>> {
165 filter(|x: &char| x.is_inline_whitespace())
166 .repeated()
167 .at_least(1)
168 .ignored()
169}
170
171fn line_wrap() -> impl Parser<char, TokenKind, Error = Cheap<char>> {
172 newline()
173 .ignore_then(
174 whitespace()
175 .repeated()
176 .ignore_then(comment())
177 .then_ignore(newline())
178 .repeated(),
179 )
180 .then_ignore(whitespace().repeated())
181 .then_ignore(just('\\'))
182 .map(TokenKind::LineWrap)
183}
184
185fn comment() -> impl Parser<char, TokenKind, Error = Cheap<char>> {
186 just('#').ignore_then(choice((
187 just('!').ignore_then(
191 newline()
192 .not()
193 .repeated()
194 .collect::<String>()
195 .map(TokenKind::DocComment),
196 ),
197 newline()
198 .not()
199 .repeated()
200 .collect::<String>()
201 .map(TokenKind::Comment),
202 )))
203}
204
205pub(crate) fn ident_part() -> impl Parser<char, String, Error = Cheap<char>> + Clone {
206 let plain = filter(|c: &char| c.is_alphabetic() || *c == '_')
207 .chain(filter(|c: &char| c.is_alphanumeric() || *c == '_').repeated());
208
209 let backticks = none_of('`').repeated().delimited_by(just('`'), just('`'));
210
211 plain.or(backticks).collect()
212}
213
214fn literal() -> impl Parser<char, Literal, Error = Cheap<char>> {
215 let binary_notation = just("0b")
216 .then_ignore(just("_").or_not())
217 .ignore_then(
218 filter(|c: &char| *c == '0' || *c == '1')
219 .repeated()
220 .at_least(1)
221 .at_most(32)
222 .collect::<String>()
223 .try_map(|digits, _| {
224 Ok(Literal::Integer(i64::from_str_radix(&digits, 2).unwrap()))
225 }),
226 )
227 .labelled("number");
228
229 let hexadecimal_notation = just("0x")
230 .then_ignore(just("_").or_not())
231 .ignore_then(
232 filter(|c: &char| c.is_ascii_hexdigit())
233 .repeated()
234 .at_least(1)
235 .at_most(12)
236 .collect::<String>()
237 .try_map(|digits, _| {
238 Ok(Literal::Integer(i64::from_str_radix(&digits, 16).unwrap()))
239 }),
240 )
241 .labelled("number");
242
243 let octal_notation = just("0o")
244 .then_ignore(just("_").or_not())
245 .ignore_then(
246 filter(|&c| ('0'..='7').contains(&c))
247 .repeated()
248 .at_least(1)
249 .at_most(12)
250 .collect::<String>()
251 .try_map(|digits, _| {
252 Ok(Literal::Integer(i64::from_str_radix(&digits, 8).unwrap()))
253 }),
254 )
255 .labelled("number");
256
257 let exp = one_of("eE").chain(one_of("+-").or_not().chain::<char, _, _>(text::digits(10)));
258
259 let integer = filter(|c: &char| c.is_ascii_digit() && *c != '0')
260 .chain::<_, Vec<char>, _>(filter(|c: &char| c.is_ascii_digit() || *c == '_').repeated())
261 .or(just('0').map(|c| vec![c]));
262
263 let frac = just('.')
264 .chain::<char, _, _>(filter(|c: &char| c.is_ascii_digit()))
265 .chain::<char, _, _>(filter(|c: &char| c.is_ascii_digit() || *c == '_').repeated());
266
267 let number = integer
268 .chain::<char, _, _>(frac.or_not().flatten())
269 .chain::<char, _, _>(exp.or_not().flatten())
270 .try_map(|chars, span| {
271 let str = chars.into_iter().filter(|c| *c != '_').collect::<String>();
272
273 if let Ok(i) = str.parse::<i64>() {
274 Ok(Literal::Integer(i))
275 } else if let Ok(f) = str.parse::<f64>() {
276 Ok(Literal::Float(f))
277 } else {
278 Err(Cheap::expected_input_found(span, None, None))
279 }
280 })
281 .labelled("number");
282
283 let string = quoted_string(true).map(Literal::String);
284
285 let raw_string = just("r")
286 .ignore_then(quoted_string(false))
287 .map(Literal::RawString);
288
289 let bool = (just("true").to(true))
290 .or(just("false").to(false))
291 .then_ignore(end_expr())
292 .map(Literal::Boolean);
293
294 let null = just("null").to(Literal::Null).then_ignore(end_expr());
295
296 let value_and_unit = integer
297 .then(choice((
298 just("microseconds"),
299 just("milliseconds"),
300 just("seconds"),
301 just("minutes"),
302 just("hours"),
303 just("days"),
304 just("weeks"),
305 just("months"),
306 just("years"),
307 )))
308 .then_ignore(end_expr())
309 .try_map(|(number, unit), span| {
310 let str = number.into_iter().filter(|c| *c != '_').collect::<String>();
311 if let Ok(n) = str.parse::<i64>() {
312 let unit = unit.to_string();
313 Ok(ValueAndUnit { n, unit })
314 } else {
315 Err(Cheap::expected_input_found(span, None, None))
316 }
317 })
318 .map(Literal::ValueAndUnit);
319
320 let date_inner = digits(4)
321 .chain(just('-'))
322 .chain::<char, _, _>(digits(2))
323 .chain::<char, _, _>(just('-'))
324 .chain::<char, _, _>(digits(2))
325 .boxed();
326
327 let time_inner = digits(2)
328 .chain::<char, _, _>(just(':').chain(digits(2)).or_not().flatten())
330 .chain::<char, _, _>(just(':').chain(digits(2)).or_not().flatten())
332 .chain::<char, _, _>(
334 just('.')
335 .chain(
336 filter(|c: &char| c.is_ascii_digit())
337 .repeated()
338 .at_least(1)
339 .at_most(6),
340 )
341 .or_not()
342 .flatten(),
343 )
344 .chain::<char, _, _>(
346 choice((
347 just('Z').map(|x| vec![x]),
349 one_of("-+").chain(
351 digits(2)
352 .then_ignore(just(':').or_not())
353 .chain::<char, _, _>(digits(2)),
354 ),
355 ))
356 .or_not(),
357 )
358 .boxed();
359
360 let dt_prefix = just('@').then(just('{').not().rewind());
362
363 let date = dt_prefix
364 .ignore_then(date_inner.clone())
365 .then_ignore(end_expr())
366 .collect::<String>()
367 .map(Literal::Date);
368
369 let time = dt_prefix
370 .ignore_then(time_inner.clone())
371 .then_ignore(end_expr())
372 .collect::<String>()
373 .map(Literal::Time);
374
375 let datetime = dt_prefix
376 .ignore_then(date_inner)
377 .chain(just('T'))
378 .chain::<char, _, _>(time_inner)
379 .then_ignore(end_expr())
380 .collect::<String>()
381 .map(Literal::Timestamp);
382
383 choice((
384 binary_notation,
385 hexadecimal_notation,
386 octal_notation,
387 string,
388 raw_string,
389 value_and_unit,
390 number,
391 bool,
392 null,
393 datetime,
394 date,
395 time,
396 ))
397}
398
399fn quoted_string(escaped: bool) -> impl Parser<char, String, Error = Cheap<char>> {
400 choice((
401 quoted_string_of_quote(&'"', escaped),
402 quoted_string_of_quote(&'\'', escaped),
403 ))
404 .collect::<String>()
405 .labelled("string")
406}
407
408fn quoted_string_of_quote(
409 quote: &char,
410 escaping: bool,
411) -> impl Parser<char, Vec<char>, Error = Cheap<char>> + '_ {
412 let opening = just(*quote).repeated().at_least(1);
413
414 opening.then_with(move |opening| {
415 if opening.len() % 2 == 0 {
416 return (just(vec![])).boxed();
418 }
419 let delimiter = just(*quote).repeated().exactly(opening.len());
420
421 let inner = if escaping {
422 choice((
423 (delimiter.or(just(vec!['\\']))).not(),
426 escaped_character(),
427 just('\\').ignore_then(just(*quote)),
429 ))
430 .boxed()
431 } else {
432 delimiter.not().boxed()
433 };
434
435 inner.repeated().then_ignore(delimiter).boxed()
436 })
437}
438
439fn escaped_character() -> impl Parser<char, char, Error = Cheap<char>> {
440 just('\\').ignore_then(choice((
441 just('\\'),
442 just('/'),
443 just('b').to('\x08'),
444 just('f').to('\x0C'),
445 just('n').to('\n'),
446 just('r').to('\r'),
447 just('t').to('\t'),
448 (just("u{").ignore_then(
449 filter(|c: &char| c.is_ascii_hexdigit())
450 .repeated()
451 .at_least(1)
452 .at_most(6)
453 .collect::<String>()
454 .validate(|digits, span, emit| {
455 char::from_u32(u32::from_str_radix(&digits, 16).unwrap()).unwrap_or_else(|| {
456 emit(Cheap::expected_input_found(span, None, None));
457 '\u{FFFD}' })
459 })
460 .then_ignore(just('}')),
461 )),
462 (just('x').ignore_then(
463 filter(|c: &char| c.is_ascii_hexdigit())
464 .repeated()
465 .exactly(2)
466 .collect::<String>()
467 .validate(|digits, span, emit| {
468 char::from_u32(u32::from_str_radix(&digits, 16).unwrap()).unwrap_or_else(|| {
469 emit(Cheap::expected_input_found(span, None, None));
470 '\u{FFFD}'
471 })
472 }),
473 )),
474 )))
475}
476
477fn digits(count: usize) -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
478 filter(|c: &char| c.is_ascii_digit())
479 .repeated()
480 .exactly(count)
481}
482
483fn end_expr() -> impl Parser<char, (), Error = Cheap<char>> {
484 choice((
485 end(),
486 one_of(",)]}\t >").ignored(),
487 newline(),
488 just("..").ignored(),
489 ))
490 .rewind()
491}