1use chumsky;
4
5use chumsky::extra;
6use chumsky::prelude::*;
7use chumsky::Parser;
8
9use self::lr::{Literal, Token, TokenKind, Tokens, ValueAndUnit};
10use crate::error::{Error, ErrorSource, Reason, WithErrorInfo};
11
12pub mod lr;
13#[cfg(test)]
14mod test;
15
/// Error type returned by the lexer's public entry points.
type E = Error;
/// Input consumed by the lexer parsers: a borrowed source string.
type ParserInput<'a> = &'a str;
/// Chumsky `extra` configuration shared by all parsers in this module:
/// failures are reported as `Simple` errors over `char` tokens.
type ParserError<'a> = extra::Err<Simple<'a, char>>;
19
20fn convert_lexer_error(source: &str, error: &Simple<'_, char>, source_id: u16) -> E {
22 let byte_span = error.span();
26 let byte_start = byte_span.start();
27 let byte_end = byte_span.end();
28
29 let char_start = source[..byte_start].chars().count();
31 let char_end = source[..byte_end].chars().count();
32
33 let found: String = source
35 .chars()
36 .skip(char_start)
37 .take(char_end - char_start)
38 .collect();
39
40 let found_display = if found.is_empty() {
42 "end of input".to_string()
43 } else {
44 format!("'{}'", found)
45 };
46
47 let error_source = format!(
49 "Unexpected {} at position {}..{}",
50 found_display, char_start, char_end
51 );
52
53 WithErrorInfo::with_span(
54 Error::new(Reason::Unexpected {
55 found: found_display,
56 }),
57 Some(crate::span::Span {
58 start: char_start,
59 end: char_end,
60 source_id,
61 }),
62 )
63 .with_source(ErrorSource::Lexer(error_source))
64}
65
66pub fn lex_source_recovery(source: &str, source_id: u16) -> (Option<Vec<Token>>, Vec<E>) {
68 let result = lexer().parse(source).into_result();
69
70 match result {
71 Ok(tokens) => (Some(insert_start(tokens.to_vec())), vec![]),
72 Err(errors) => {
73 let errors = errors
75 .into_iter()
76 .map(|error| convert_lexer_error(source, &error, source_id))
77 .collect();
78
79 (None, errors)
80 }
81 }
82}
83
84pub fn lex_source(source: &str) -> Result<Tokens, Vec<E>> {
86 let result = lexer().parse(source).into_result();
87
88 match result {
89 Ok(tokens) => Ok(Tokens(insert_start(tokens.to_vec()))),
90 Err(errors) => {
91 let errors = errors
93 .into_iter()
94 .map(|error| convert_lexer_error(source, &error, 0))
95 .collect();
96
97 Err(errors)
98 }
99 }
100}
101
102fn insert_start(tokens: Vec<Token>) -> Vec<Token> {
104 std::iter::once(Token {
105 kind: TokenKind::Start,
106 span: 0..0,
107 })
108 .chain(tokens)
109 .collect()
110}
111
112pub fn lexer<'a>() -> impl Parser<'a, ParserInput<'a>, Vec<Token>, ParserError<'a>> {
114 lex_token()
115 .repeated()
116 .collect()
117 .then_ignore(whitespace().or_not())
118}
119
120fn lex_token<'a>() -> impl Parser<'a, ParserInput<'a>, Token, ParserError<'a>> {
122 let range = whitespace()
126 .or_not()
127 .then(just(".."))
128 .then(whitespace().or_not())
129 .map_with(|((left, _), right), extra| {
130 let span: chumsky::span::SimpleSpan = extra.span();
131 Token {
132 kind: TokenKind::Range {
133 bind_left: left.is_none(),
135 bind_right: right.is_none(),
136 },
137 span: span.start()..span.end(),
138 }
139 });
140
141 let other_tokens = whitespace()
143 .or_not()
144 .ignore_then(token().map_with(|kind, extra| {
145 let span: chumsky::span::SimpleSpan = extra.span();
146 Token {
147 kind,
148 span: span.start()..span.end(),
149 }
150 }));
151
152 choice((range, other_tokens))
154}
155
156fn token<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
158 choice((
161 line_wrap().boxed(), newline().to(TokenKind::NewLine), multi_char_operators(), interpolation().boxed(), param(), date_token().boxed(), just('@').to(TokenKind::Annotate), one_of("></%=+-*[]().,:|!{}").map(TokenKind::Control), literal().map(TokenKind::Literal).boxed(), keyword(), ident_part().map(TokenKind::Ident), comment(), ))
176}
177
178fn multi_char_operators<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
179 choice((
180 just("->").to(TokenKind::ArrowThin),
181 just("=>").to(TokenKind::ArrowFat),
182 just("==").to(TokenKind::Eq),
183 just("!=").to(TokenKind::Ne),
184 just(">=").to(TokenKind::Gte),
185 just("<=").to(TokenKind::Lte),
186 just("~=").to(TokenKind::RegexSearch),
187 just("&&").then_ignore(end_expr()).to(TokenKind::And),
188 just("||").then_ignore(end_expr()).to(TokenKind::Or),
189 just("??").to(TokenKind::Coalesce),
190 just("//").to(TokenKind::DivInt),
191 just("**").to(TokenKind::Pow),
192 ))
193}
194
195fn keyword<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
196 choice((
197 just("let"),
198 just("into"),
199 just("case"),
200 just("prql"),
201 just("type"),
202 just("module"),
203 just("internal"),
204 just("func"),
205 just("import"),
206 just("enum"),
207 ))
208 .to_slice()
209 .then_ignore(end_expr())
210 .map(|s: &str| TokenKind::Keyword(s.to_string()))
211}
212
213fn param<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
214 just('$')
215 .ignore_then(
216 any()
217 .filter(|c: &char| c.is_alphanumeric() || *c == '_' || *c == '.')
218 .repeated()
219 .to_slice()
220 .map(|s: &str| s.to_string()),
221 )
222 .map(TokenKind::Param)
223}
224
225fn interpolation<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
226 one_of("sf")
236 .then(quoted_string(true))
237 .map(|(c, s)| TokenKind::Interpolation(c, s))
238}
239
240fn whitespace<'a>() -> impl Parser<'a, ParserInput<'a>, (), ParserError<'a>> {
241 text::inline_whitespace().at_least(1)
242}
243
244fn newline<'a>() -> impl Parser<'a, ParserInput<'a>, (), ParserError<'a>> {
246 just('\n')
247 .or(just('\r').then_ignore(just('\n').or_not()))
248 .ignored()
249}
250
251fn line_wrap<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
252 newline()
253 .ignore_then(
254 whitespace()
255 .repeated()
256 .ignore_then(comment())
257 .then_ignore(newline())
258 .repeated()
259 .collect(),
260 )
261 .then_ignore(whitespace().repeated())
262 .then_ignore(just('\\'))
263 .map(TokenKind::LineWrap)
264}
265
266fn comment<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
267 let comment_text = none_of("\n\r").repeated().collect::<String>();
269
270 just('#').ignore_then(
271 just('!')
275 .ignore_then(comment_text.map(TokenKind::DocComment))
276 .or(comment_text.map(TokenKind::Comment)),
277 )
278}
279
280pub fn ident_part<'a>() -> impl Parser<'a, ParserInput<'a>, String, ParserError<'a>> {
281 let plain = any()
282 .filter(|c: &char| c.is_alphabetic() || *c == '_')
283 .then(
284 any()
289 .filter(|c: &char| c.is_alphanumeric() || *c == '_')
290 .repeated(),
291 )
292 .to_slice()
293 .map(|s: &str| s.to_string());
294
295 let backtick = none_of('`')
296 .repeated()
297 .collect::<String>()
298 .delimited_by(just('`'), just('`'));
299
300 choice((plain, backtick))
301}
302
303fn digits<'a>(count: usize) -> impl Parser<'a, ParserInput<'a>, &'a str, ParserError<'a>> {
305 chumsky::text::digits(10).exactly(count).to_slice()
306}
307
308fn date_inner<'a>() -> impl Parser<'a, ParserInput<'a>, String, ParserError<'a>> {
309 text::digits(10)
311 .exactly(4)
312 .then(just('-'))
313 .then(text::digits(10).exactly(2))
314 .then(just('-'))
315 .then(text::digits(10).exactly(2))
316 .to_slice()
317 .map(|s: &str| s.to_owned())
320}
321
322fn time_inner<'a>() -> impl Parser<'a, ParserInput<'a>, String, ParserError<'a>> {
323 fn time_component<'p>(
325 separator: char,
326 component_parser: impl Parser<'p, ParserInput<'p>, &'p str, ParserError<'p>>,
327 ) -> impl Parser<'p, ParserInput<'p>, String, ParserError<'p>> {
328 just(separator)
329 .then(component_parser)
330 .map(move |(sep, comp): (char, &str)| format!("{}{}", sep, comp))
331 .or_not()
332 .map(|opt| opt.unwrap_or_default())
333 }
334
335 let hours = digits(2).map(|s: &str| s.to_string());
337
338 let minutes = time_component(':', digits(2));
340 let seconds = time_component(':', digits(2));
341
342 let milliseconds = time_component(
344 '.',
345 any()
346 .filter(|c: &char| c.is_ascii_digit())
347 .repeated()
348 .at_least(1)
349 .at_most(6)
350 .to_slice(),
351 );
352
353 let timezone = choice((
355 just('Z').map(|c| c.to_string()),
356 one_of("-+")
357 .then(digits(2).then(just(':').or_not().then(digits(2))).map(
358 |(hrs, (_opt_colon, mins)): (&str, (Option<char>, &str))| {
359 format!("{}{}", hrs, mins)
362 },
363 ))
364 .map(|(sign, offset)| format!("{}{}", sign, offset)),
365 ))
366 .or_not()
367 .map(|opt| opt.unwrap_or_default());
368
369 hours
371 .then(minutes)
372 .then(seconds)
373 .then(milliseconds)
374 .then(timezone)
375 .map(|((((hours, mins), secs), ms), tz)| format!("{}{}{}{}{}", hours, mins, secs, ms, tz))
376}
377
378fn date_token<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
379 just('@')
381 .then(any().filter(|c: &char| c.is_ascii_digit()).rewind())
383 .ignore_then(
384 choice((
387 date_inner()
389 .then(just('T'))
390 .then(time_inner())
391 .then_ignore(end_expr())
392 .map(|((date, t), time)| Literal::Timestamp(format!("{}{}{}", date, t, time))),
393 date_inner().then_ignore(end_expr()).map(Literal::Date),
395 time_inner().then_ignore(end_expr()).map(Literal::Time),
397 )),
398 )
399 .map(TokenKind::Literal)
400}
401
402pub fn literal<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
403 choice((
404 binary_number(),
405 hexadecimal_number(),
406 octal_number(),
407 string(),
408 raw_string(),
409 value_and_unit(),
410 number(),
411 boolean(),
412 null(),
413 ))
414}
415
416fn parse_number_with_base<'a>(
418 prefix: &'static str,
419 base: u32,
420 max_digits: usize,
421 valid_digit: impl Fn(&char) -> bool + 'a,
422) -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
423 just(prefix)
424 .then_ignore(just("_").or_not()) .ignore_then(
426 any()
427 .filter(valid_digit)
428 .repeated()
429 .at_least(1)
430 .at_most(max_digits)
431 .to_slice()
432 .map(move |digits: &str| {
433 i64::from_str_radix(digits, base)
434 .map(Literal::Integer)
435 .unwrap_or(Literal::Integer(0))
436 }),
437 )
438}
439
440fn binary_number<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
441 parse_number_with_base("0b", 2, 32, |c| *c == '0' || *c == '1')
442}
443
444fn hexadecimal_number<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
445 parse_number_with_base("0x", 16, 12, |c| c.is_ascii_hexdigit())
446}
447
448fn octal_number<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
449 parse_number_with_base("0o", 8, 12, |c| ('0'..='7').contains(c))
450}
451
452fn number<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
453 fn optional_component<'p, T>(
455 parser: impl Parser<'p, ParserInput<'p>, T, ParserError<'p>>,
456 to_string: impl Fn(T) -> String + 'p,
457 ) -> impl Parser<'p, ParserInput<'p>, String, ParserError<'p>> {
458 parser
459 .map(to_string)
460 .or_not()
461 .map(|opt| opt.unwrap_or_default())
462 }
463
464 let integer = parse_integer();
466
467 let fraction_digits = any()
469 .filter(|c: &char| c.is_ascii_digit())
470 .then(
471 any()
472 .filter(|c: &char| c.is_ascii_digit() || *c == '_')
473 .repeated(),
474 )
475 .to_slice();
476
477 let frac = just('.')
478 .then(fraction_digits)
479 .map(|(dot, digits): (char, &str)| format!("{}{}", dot, digits));
480
481 let exp_digits = one_of("+-")
483 .or_not()
484 .then(
485 any()
486 .filter(|c: &char| c.is_ascii_digit())
487 .repeated()
488 .at_least(1),
489 )
490 .to_slice();
491
492 let exp = one_of("eE")
493 .then(exp_digits)
494 .map(|(e, digits): (char, &str)| format!("{}{}", e, digits));
495
496 integer
498 .then(optional_component(frac, |f| f))
499 .then(optional_component(exp, |e| e))
500 .map(|((int_part, frac_part), exp_part)| {
501 let num_str = format!("{}{}{}", int_part, frac_part, exp_part)
503 .chars()
504 .filter(|&c| c != '_')
505 .collect::<String>();
506
507 if let Ok(i) = num_str.parse::<i64>() {
509 Literal::Integer(i)
510 } else if let Ok(f) = num_str.parse::<f64>() {
511 Literal::Float(f)
512 } else {
513 Literal::Integer(0) }
515 })
516}
517
518fn parse_integer<'a>() -> impl Parser<'a, ParserInput<'a>, &'a str, ParserError<'a>> {
519 choice((
521 any()
522 .filter(|c: &char| c.is_ascii_digit() && *c != '0')
523 .then(
524 any()
525 .filter(|c: &char| c.is_ascii_digit() || *c == '_')
526 .repeated(),
527 )
528 .to_slice(),
529 just('0').to_slice(),
530 ))
531}
532
533fn string<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
534 quoted_string(true).map(Literal::String)
535}
536
537fn raw_string<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
538 just("r")
539 .then(choice((just('\''), just('"'))))
540 .then(
541 any()
542 .filter(move |c: &char| *c != '\'' && *c != '"' && *c != '\n' && *c != '\r')
543 .repeated()
544 .to_slice(),
545 )
546 .then(choice((just('\''), just('"'))))
547 .map(
548 |(((_, _open_quote), s), _close_quote): (((&str, char), &str), char)| {
549 Literal::RawString(s.to_string())
550 },
551 )
552}
553
554fn boolean<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
555 choice((just("true").to(true), just("false").to(false)))
556 .then_ignore(end_expr())
557 .map(Literal::Boolean)
558}
559
560fn null<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
561 just("null").to(Literal::Null).then_ignore(end_expr())
562}
563
564fn value_and_unit<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
565 let unit = choice((
567 just("microseconds"),
568 just("milliseconds"),
569 just("seconds"),
570 just("minutes"),
571 just("hours"),
572 just("days"),
573 just("weeks"),
574 just("months"),
575 just("years"),
576 ));
577
578 parse_integer().then(unit).then_ignore(end_expr()).map(
580 |(number_str, unit_str): (&str, &str)| {
581 let n = number_str.replace('_', "").parse::<i64>().unwrap_or(1);
583 Literal::ValueAndUnit(ValueAndUnit {
584 n,
585 unit: unit_str.to_string(),
586 })
587 },
588 )
589}
590
591pub fn quoted_string<'a>(
592 escaped: bool,
593) -> impl Parser<'a, ParserInput<'a>, String, ParserError<'a>> {
594 choice((
595 multi_quoted_string(&'"', escaped),
596 multi_quoted_string(&'\'', escaped),
597 ))
598 .map(|chars| chars.into_iter().collect())
599}
600
601fn parse_escape_sequence<'a>(
604 input: &mut chumsky::input::InputRef<'a, '_, ParserInput<'a>, ParserError<'a>>,
605 quote_char: char,
606) -> char {
607 match input.peek() {
608 Some(next_ch) => {
609 input.next();
610 match next_ch {
611 '\\' => '\\',
612 '/' => '/',
613 'b' => '\x08',
614 'f' => '\x0C',
615 'n' => '\n',
616 'r' => '\r',
617 't' => '\t',
618 'u' if input.peek() == Some('{') => {
619 input.next(); let mut hex = String::new();
621 while let Some(ch) = input.peek() {
622 if ch == '}' {
623 input.next();
624 break;
625 }
626 if ch.is_ascii_hexdigit() && hex.len() < 6 {
627 hex.push(ch);
628 input.next();
629 } else {
630 break;
631 }
632 }
633 char::from_u32(u32::from_str_radix(&hex, 16).unwrap_or(0)).unwrap_or('\u{FFFD}')
634 }
635 'x' => {
636 let mut hex = String::new();
637 for _ in 0..2 {
638 if let Some(ch) = input.peek() {
639 if ch.is_ascii_hexdigit() {
640 hex.push(ch);
641 input.next();
642 }
643 }
644 }
645 if hex.len() == 2 {
646 char::from_u32(u32::from_str_radix(&hex, 16).unwrap_or(0))
647 .unwrap_or('\u{FFFD}')
648 } else {
649 next_ch }
651 }
652 c if c == quote_char => quote_char, other => other, }
655 }
656 None => {
657 '\\'
659 }
660 }
661}
662
663fn multi_quoted_string<'a>(
670 quote: &char,
671 escaping: bool,
672) -> impl Parser<'a, ParserInput<'a>, Vec<char>, ParserError<'a>> {
673 let quote_char = *quote;
674
675 custom(move |input| {
676 let start_cursor = input.save();
677
678 let mut open_count = 0;
680 while let Some(ch) = input.peek() {
681 if ch == quote_char {
682 input.next();
683 open_count += 1;
684 } else {
685 break;
686 }
687 }
688
689 if open_count == 0 {
690 let span = input.span_since(start_cursor.cursor());
691 return Err(Simple::new(input.peek_maybe(), span));
692 }
693
694 if open_count % 2 == 0 {
696 return Ok(vec![]);
697 }
698
699 let mut result = Vec::new();
701
702 loop {
703 let checkpoint = input.save();
705
706 let mut close_count = 0;
708 while close_count < open_count {
709 match input.peek() {
710 Some(ch) if ch == quote_char => {
711 input.next();
712 close_count += 1;
713 }
714 _ => break,
715 }
716 }
717
718 if close_count == open_count {
720 return Ok(result);
721 }
722
723 input.rewind(checkpoint);
725
726 match input.next() {
727 Some(ch) => {
728 if escaping && ch == '\\' {
730 let escaped = parse_escape_sequence(input, quote_char);
731 result.push(escaped);
732 } else {
733 result.push(ch);
734 }
735 }
736 None => {
737 let current_cursor = input.save();
740 let span = input.span_since(current_cursor.cursor());
741 return Err(Simple::new(None, span));
742 }
743 }
744 }
745 })
746}
747
748fn end_expr<'a>() -> impl Parser<'a, ParserInput<'a>, (), ParserError<'a>> {
749 choice((
750 end(),
751 one_of(",)]}\t >").to(()),
752 newline(),
753 just("..").to(()),
754 ))
755 .rewind()
756}