1use chumsky;
4
5use chumsky::extra;
6use chumsky::prelude::*;
7use chumsky::Parser;
8
9use self::lr::{Literal, Token, TokenKind, Tokens, ValueAndUnit};
10use crate::error::{Error, ErrorSource, Reason, WithErrorInfo};
11
12pub mod lr;
13#[cfg(test)]
14mod test;
15
16type E = Error;
17type ParserInput<'a> = &'a str;
18type ParserError<'a> = extra::Err<Simple<'a, char>>;
19
20fn convert_lexer_error(source: &str, error: &Simple<'_, char>, source_id: u16) -> E {
22 let byte_span = error.span();
26 let byte_start = byte_span.start();
27 let byte_end = byte_span.end();
28
29 let char_start = source[..byte_start].chars().count();
31 let char_end = source[..byte_end].chars().count();
32
33 let found: String = source
35 .chars()
36 .skip(char_start)
37 .take(char_end - char_start)
38 .collect();
39
40 let found_display = if found.is_empty() {
42 "end of input".to_string()
43 } else {
44 format!("'{}'", found)
45 };
46
47 let error_source = format!(
49 "Unexpected {} at position {}..{}",
50 found_display, char_start, char_end
51 );
52
53 Error::new(Reason::Unexpected {
54 found: found_display,
55 })
56 .with_span(Some(crate::span::Span {
57 start: char_start,
58 end: char_end,
59 source_id,
60 }))
61 .with_source(ErrorSource::Lexer(error_source))
62}
63
64pub fn lex_source_recovery(source: &str, source_id: u16) -> (Option<Vec<Token>>, Vec<E>) {
66 let result = lexer().parse(source).into_result();
67
68 match result {
69 Ok(tokens) => (Some(insert_start(tokens.to_vec())), vec![]),
70 Err(errors) => {
71 let errors = errors
73 .into_iter()
74 .map(|error| convert_lexer_error(source, &error, source_id))
75 .collect();
76
77 (None, errors)
78 }
79 }
80}
81
82pub fn lex_source(source: &str) -> Result<Tokens, Vec<E>> {
84 let result = lexer().parse(source).into_result();
85
86 match result {
87 Ok(tokens) => Ok(Tokens(insert_start(tokens.to_vec()))),
88 Err(errors) => {
89 let errors = errors
91 .into_iter()
92 .map(|error| convert_lexer_error(source, &error, 0))
93 .collect();
94
95 Err(errors)
96 }
97 }
98}
99
100fn insert_start(tokens: Vec<Token>) -> Vec<Token> {
102 std::iter::once(Token {
103 kind: TokenKind::Start,
104 span: 0..0,
105 })
106 .chain(tokens)
107 .collect()
108}
109
110pub fn lexer<'a>() -> impl Parser<'a, ParserInput<'a>, Vec<Token>, ParserError<'a>> {
112 lex_token()
113 .repeated()
114 .collect()
115 .then_ignore(whitespace().or_not())
116}
117
118fn lex_token<'a>() -> impl Parser<'a, ParserInput<'a>, Token, ParserError<'a>> {
120 let range = whitespace()
124 .or_not()
125 .then(just(".."))
126 .then(whitespace().or_not())
127 .map_with(|((left, _), right), extra| {
128 let span: chumsky::span::SimpleSpan = extra.span();
129 Token {
130 kind: TokenKind::Range {
131 bind_left: left.is_none(),
133 bind_right: right.is_none(),
134 },
135 span: span.start()..span.end(),
136 }
137 });
138
139 let other_tokens = whitespace()
141 .or_not()
142 .ignore_then(token().map_with(|kind, extra| {
143 let span: chumsky::span::SimpleSpan = extra.span();
144 Token {
145 kind,
146 span: span.start()..span.end(),
147 }
148 }));
149
150 choice((range, other_tokens))
152}
153
154fn token<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
156 choice((
159 line_wrap().boxed(), newline().to(TokenKind::NewLine), multi_char_operators(), interpolation().boxed(), param(), date_token().boxed(), just('@').to(TokenKind::Annotate), one_of("></%=+-*[]().,:|!{}").map(TokenKind::Control), literal().map(TokenKind::Literal).boxed(), keyword(), ident_part().map(TokenKind::Ident), comment(), ))
174}
175
176fn multi_char_operators<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
177 choice((
178 just("->").to(TokenKind::ArrowThin),
179 just("=>").to(TokenKind::ArrowFat),
180 just("==").to(TokenKind::Eq),
181 just("!=").to(TokenKind::Ne),
182 just(">=").to(TokenKind::Gte),
183 just("<=").to(TokenKind::Lte),
184 just("~=").to(TokenKind::RegexSearch),
185 just("&&").then_ignore(end_expr()).to(TokenKind::And),
186 just("||").then_ignore(end_expr()).to(TokenKind::Or),
187 just("??").to(TokenKind::Coalesce),
188 just("//").to(TokenKind::DivInt),
189 just("**").to(TokenKind::Pow),
190 ))
191}
192
193fn keyword<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
194 choice((
195 just("let"),
196 just("into"),
197 just("case"),
198 just("prql"),
199 just("type"),
200 just("module"),
201 just("internal"),
202 just("func"),
203 just("import"),
204 just("enum"),
205 ))
206 .to_slice()
207 .then_ignore(end_expr())
208 .map(|s: &str| TokenKind::Keyword(s.to_string()))
209}
210
211fn param<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
212 just('$')
213 .ignore_then(
214 any()
215 .filter(|c: &char| c.is_alphanumeric() || *c == '_' || *c == '.')
216 .repeated()
217 .to_slice()
218 .map(|s: &str| s.to_string()),
219 )
220 .map(TokenKind::Param)
221}
222
223fn interpolation<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
224 one_of("sf")
234 .then(quoted_string(true))
235 .map(|(c, s)| TokenKind::Interpolation(c, s))
236}
237
238fn whitespace<'a>() -> impl Parser<'a, ParserInput<'a>, (), ParserError<'a>> {
239 text::inline_whitespace().at_least(1)
240}
241
242fn newline<'a>() -> impl Parser<'a, ParserInput<'a>, (), ParserError<'a>> {
244 just('\n')
245 .or(just('\r').then_ignore(just('\n').or_not()))
246 .ignored()
247}
248
249fn line_wrap<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
250 newline()
251 .ignore_then(
252 whitespace()
253 .repeated()
254 .ignore_then(comment())
255 .then_ignore(newline())
256 .repeated()
257 .collect(),
258 )
259 .then_ignore(whitespace().repeated())
260 .then_ignore(just('\\'))
261 .map(TokenKind::LineWrap)
262}
263
264fn comment<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
265 let comment_text = none_of("\n\r").repeated().collect::<String>();
267
268 just('#').ignore_then(
269 just('!')
273 .ignore_then(comment_text.map(TokenKind::DocComment))
274 .or(comment_text.map(TokenKind::Comment)),
275 )
276}
277
278pub fn ident_part<'a>() -> impl Parser<'a, ParserInput<'a>, String, ParserError<'a>> {
279 let plain = any()
280 .filter(|c: &char| c.is_alphabetic() || *c == '_')
281 .then(
282 any()
287 .filter(|c: &char| c.is_alphanumeric() || *c == '_')
288 .repeated(),
289 )
290 .to_slice()
291 .map(|s: &str| s.to_string());
292
293 let backtick = none_of('`')
294 .repeated()
295 .collect::<String>()
296 .delimited_by(just('`'), just('`'));
297
298 choice((plain, backtick))
299}
300
301fn digits<'a>(count: usize) -> impl Parser<'a, ParserInput<'a>, &'a str, ParserError<'a>> {
303 chumsky::text::digits(10).exactly(count).to_slice()
304}
305
306fn date_inner<'a>() -> impl Parser<'a, ParserInput<'a>, String, ParserError<'a>> {
307 text::digits(10)
309 .exactly(4)
310 .then(just('-'))
311 .then(text::digits(10).exactly(2))
312 .then(just('-'))
313 .then(text::digits(10).exactly(2))
314 .to_slice()
315 .map(|s: &str| s.to_owned())
318}
319
320fn time_inner<'a>() -> impl Parser<'a, ParserInput<'a>, String, ParserError<'a>> {
321 fn time_component<'p>(
323 separator: char,
324 component_parser: impl Parser<'p, ParserInput<'p>, &'p str, ParserError<'p>>,
325 ) -> impl Parser<'p, ParserInput<'p>, String, ParserError<'p>> {
326 just(separator)
327 .then(component_parser)
328 .map(move |(sep, comp): (char, &str)| format!("{}{}", sep, comp))
329 .or_not()
330 .map(|opt| opt.unwrap_or_default())
331 }
332
333 let hours = digits(2).map(|s: &str| s.to_string());
335
336 let minutes = time_component(':', digits(2));
338 let seconds = time_component(':', digits(2));
339
340 let milliseconds = time_component(
342 '.',
343 any()
344 .filter(|c: &char| c.is_ascii_digit())
345 .repeated()
346 .at_least(1)
347 .at_most(6)
348 .to_slice(),
349 );
350
351 let timezone = choice((
353 just('Z').map(|c| c.to_string()),
354 one_of("-+")
355 .then(digits(2).then(just(':').or_not().then(digits(2))).map(
356 |(hrs, (_opt_colon, mins)): (&str, (Option<char>, &str))| {
357 format!("{}{}", hrs, mins)
360 },
361 ))
362 .map(|(sign, offset)| format!("{}{}", sign, offset)),
363 ))
364 .or_not()
365 .map(|opt| opt.unwrap_or_default());
366
367 hours
369 .then(minutes)
370 .then(seconds)
371 .then(milliseconds)
372 .then(timezone)
373 .map(|((((hours, mins), secs), ms), tz)| format!("{}{}{}{}{}", hours, mins, secs, ms, tz))
374}
375
376fn date_token<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
377 just('@')
379 .then(any().filter(|c: &char| c.is_ascii_digit()).rewind())
381 .ignore_then(
382 choice((
385 date_inner()
387 .then(just('T'))
388 .then(time_inner())
389 .then_ignore(end_expr())
390 .map(|((date, t), time)| Literal::Timestamp(format!("{}{}{}", date, t, time))),
391 date_inner().then_ignore(end_expr()).map(Literal::Date),
393 time_inner().then_ignore(end_expr()).map(Literal::Time),
395 )),
396 )
397 .map(TokenKind::Literal)
398}
399
400pub fn literal<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
401 choice((
402 binary_number(),
403 hexadecimal_number(),
404 octal_number(),
405 string(),
406 raw_string(),
407 value_and_unit(),
408 number(),
409 boolean(),
410 null(),
411 ))
412}
413
414fn parse_number_with_base<'a>(
416 prefix: &'static str,
417 base: u32,
418 max_digits: usize,
419 valid_digit: impl Fn(&char) -> bool + 'a,
420) -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
421 just(prefix)
422 .then_ignore(just("_").or_not()) .ignore_then(
424 any()
425 .filter(valid_digit)
426 .repeated()
427 .at_least(1)
428 .at_most(max_digits)
429 .to_slice()
430 .map(move |digits: &str| {
431 i64::from_str_radix(digits, base)
432 .map(Literal::Integer)
433 .unwrap_or(Literal::Integer(0))
434 }),
435 )
436}
437
438fn binary_number<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
439 parse_number_with_base("0b", 2, 32, |c| *c == '0' || *c == '1')
440}
441
442fn hexadecimal_number<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
443 parse_number_with_base("0x", 16, 12, |c| c.is_ascii_hexdigit())
444}
445
446fn octal_number<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
447 parse_number_with_base("0o", 8, 12, |c| ('0'..='7').contains(c))
448}
449
450fn number<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
451 fn optional_component<'p, T>(
453 parser: impl Parser<'p, ParserInput<'p>, T, ParserError<'p>>,
454 to_string: impl Fn(T) -> String + 'p,
455 ) -> impl Parser<'p, ParserInput<'p>, String, ParserError<'p>> {
456 parser
457 .map(to_string)
458 .or_not()
459 .map(|opt| opt.unwrap_or_default())
460 }
461
462 let integer = parse_integer();
464
465 let fraction_digits = any()
467 .filter(|c: &char| c.is_ascii_digit())
468 .then(
469 any()
470 .filter(|c: &char| c.is_ascii_digit() || *c == '_')
471 .repeated(),
472 )
473 .to_slice();
474
475 let frac = just('.')
476 .then(fraction_digits)
477 .map(|(dot, digits): (char, &str)| format!("{}{}", dot, digits));
478
479 let exp_digits = one_of("+-")
481 .or_not()
482 .then(
483 any()
484 .filter(|c: &char| c.is_ascii_digit())
485 .repeated()
486 .at_least(1),
487 )
488 .to_slice();
489
490 let exp = one_of("eE")
491 .then(exp_digits)
492 .map(|(e, digits): (char, &str)| format!("{}{}", e, digits));
493
494 integer
496 .then(optional_component(frac, |f| f))
497 .then(optional_component(exp, |e| e))
498 .map(|((int_part, frac_part), exp_part)| {
499 let num_str = format!("{}{}{}", int_part, frac_part, exp_part)
501 .chars()
502 .filter(|&c| c != '_')
503 .collect::<String>();
504
505 if let Ok(i) = num_str.parse::<i64>() {
507 Literal::Integer(i)
508 } else if let Ok(f) = num_str.parse::<f64>() {
509 Literal::Float(f)
510 } else {
511 Literal::Integer(0) }
513 })
514}
515
516fn parse_integer<'a>() -> impl Parser<'a, ParserInput<'a>, &'a str, ParserError<'a>> {
517 choice((
519 any()
520 .filter(|c: &char| c.is_ascii_digit() && *c != '0')
521 .then(
522 any()
523 .filter(|c: &char| c.is_ascii_digit() || *c == '_')
524 .repeated(),
525 )
526 .to_slice(),
527 just('0').to_slice(),
528 ))
529}
530
531fn string<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
532 quoted_string(true).map(Literal::String)
533}
534
535fn raw_string<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
536 just("r")
537 .then(choice((just('\''), just('"'))))
538 .then(
539 any()
540 .filter(move |c: &char| *c != '\'' && *c != '"' && *c != '\n' && *c != '\r')
541 .repeated()
542 .to_slice(),
543 )
544 .then(choice((just('\''), just('"'))))
545 .map(
546 |(((_, _open_quote), s), _close_quote): (((&str, char), &str), char)| {
547 Literal::RawString(s.to_string())
548 },
549 )
550}
551
552fn boolean<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
553 choice((just("true").to(true), just("false").to(false)))
554 .then_ignore(end_expr())
555 .map(Literal::Boolean)
556}
557
558fn null<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
559 just("null").to(Literal::Null).then_ignore(end_expr())
560}
561
562fn value_and_unit<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
563 let unit = choice((
565 just("microseconds"),
566 just("milliseconds"),
567 just("seconds"),
568 just("minutes"),
569 just("hours"),
570 just("days"),
571 just("weeks"),
572 just("months"),
573 just("years"),
574 ));
575
576 parse_integer().then(unit).then_ignore(end_expr()).map(
578 |(number_str, unit_str): (&str, &str)| {
579 let n = number_str.replace('_', "").parse::<i64>().unwrap_or(1);
581 Literal::ValueAndUnit(ValueAndUnit {
582 n,
583 unit: unit_str.to_string(),
584 })
585 },
586 )
587}
588
589pub fn quoted_string<'a>(
590 escaped: bool,
591) -> impl Parser<'a, ParserInput<'a>, String, ParserError<'a>> {
592 choice((
593 multi_quoted_string(&'"', escaped),
594 multi_quoted_string(&'\'', escaped),
595 ))
596 .map(|chars| chars.into_iter().collect())
597}
598
599fn parse_escape_sequence<'a>(
602 input: &mut chumsky::input::InputRef<'a, '_, ParserInput<'a>, ParserError<'a>>,
603 quote_char: char,
604) -> char {
605 match input.peek() {
606 Some(next_ch) => {
607 input.next();
608 match next_ch {
609 '\\' => '\\',
610 '/' => '/',
611 'b' => '\x08',
612 'f' => '\x0C',
613 'n' => '\n',
614 'r' => '\r',
615 't' => '\t',
616 'u' if input.peek() == Some('{') => {
617 input.next(); let mut hex = String::new();
619 while let Some(ch) = input.peek() {
620 if ch == '}' {
621 input.next();
622 break;
623 }
624 if ch.is_ascii_hexdigit() && hex.len() < 6 {
625 hex.push(ch);
626 input.next();
627 } else {
628 break;
629 }
630 }
631 char::from_u32(u32::from_str_radix(&hex, 16).unwrap_or(0)).unwrap_or('\u{FFFD}')
632 }
633 'x' => {
634 let mut hex = String::new();
635 for _ in 0..2 {
636 if let Some(ch) = input.peek() {
637 if ch.is_ascii_hexdigit() {
638 hex.push(ch);
639 input.next();
640 }
641 }
642 }
643 if hex.len() == 2 {
644 char::from_u32(u32::from_str_radix(&hex, 16).unwrap_or(0))
645 .unwrap_or('\u{FFFD}')
646 } else {
647 next_ch }
649 }
650 c if c == quote_char => quote_char, other => other, }
653 }
654 None => {
655 '\\'
657 }
658 }
659}
660
661fn multi_quoted_string<'a>(
668 quote: &char,
669 escaping: bool,
670) -> impl Parser<'a, ParserInput<'a>, Vec<char>, ParserError<'a>> {
671 let quote_char = *quote;
672
673 custom(move |input| {
674 let start_cursor = input.save();
675
676 let mut open_count = 0;
678 while let Some(ch) = input.peek() {
679 if ch == quote_char {
680 input.next();
681 open_count += 1;
682 } else {
683 break;
684 }
685 }
686
687 if open_count == 0 {
688 let span = input.span_since(start_cursor.cursor());
689 return Err(Simple::new(input.peek_maybe(), span));
690 }
691
692 if open_count % 2 == 0 {
694 return Ok(vec![]);
695 }
696
697 let mut result = Vec::new();
699
700 loop {
701 let checkpoint = input.save();
703
704 let mut close_count = 0;
706 while close_count < open_count {
707 match input.peek() {
708 Some(ch) if ch == quote_char => {
709 input.next();
710 close_count += 1;
711 }
712 _ => break,
713 }
714 }
715
716 if close_count == open_count {
718 return Ok(result);
719 }
720
721 input.rewind(checkpoint);
723
724 match input.next() {
725 Some(ch) => {
726 if escaping && ch == '\\' {
728 let escaped = parse_escape_sequence(input, quote_char);
729 result.push(escaped);
730 } else {
731 result.push(ch);
732 }
733 }
734 None => {
735 let current_cursor = input.save();
738 let span = input.span_since(current_cursor.cursor());
739 return Err(Simple::new(None, span));
740 }
741 }
742 }
743 })
744}
745
746fn end_expr<'a>() -> impl Parser<'a, ParserInput<'a>, (), ParserError<'a>> {
747 choice((
748 end(),
749 one_of(",)]}\t >").to(()),
750 newline(),
751 just("..").to(()),
752 ))
753 .rewind()
754}