use chumsky;
use chumsky::extra;
use chumsky::prelude::*;
use chumsky::Parser;
use self::lr::{Literal, Token, TokenKind, Tokens, ValueAndUnit};
use crate::error::{Error, ErrorSource, Reason, WithErrorInfo};
pub mod lr;
#[cfg(test)]
mod test;
/// Error type returned by this module's public lexing entry points.
type E = Error;
/// Input type shared by every parser in this module: a borrowed source string.
type ParserInput<'a> = &'a str;
/// Chumsky `extra` configuration shared by every parser in this module:
/// failures are reported as `Simple` errors over `char` tokens.
type ParserError<'a> = extra::Err<Simple<'a, char>>;
fn convert_lexer_error(source: &str, error: &Simple<'_, char>, source_id: u16) -> E {
let byte_span = error.span();
let byte_start = byte_span.start();
let byte_end = byte_span.end();
let char_start = source[..byte_start].chars().count();
let char_end = source[..byte_end].chars().count();
let found: String = source
.chars()
.skip(char_start)
.take(char_end - char_start)
.collect();
let found_display = if found.is_empty() {
"end of input".to_string()
} else {
format!("'{}'", found)
};
let error_source = format!(
"Unexpected {} at position {}..{}",
found_display, char_start, char_end
);
WithErrorInfo::with_span(
Error::new(Reason::Unexpected {
found: found_display,
}),
Some(crate::span::Span {
start: char_start,
end: char_end,
source_id,
}),
)
.with_source(ErrorSource::Lexer(error_source))
}
/// Lexes `source` and returns the tokens together with any lexer errors.
///
/// On success the token list is prefixed with a `TokenKind::Start` token and the error
/// list is empty. On failure the token list is `None` and every lexer error is converted
/// with `convert_lexer_error`, tagged with `source_id`.
///
/// Note that, despite the name, no partial token list is produced when lexing fails.
pub fn lex_source_recovery(source: &str, source_id: u16) -> (Option<Vec<Token>>, Vec<E>) {
    match lexer().parse(source).into_result() {
        // The parse result already owns its tokens, so they are moved, not cloned.
        Ok(tokens) => (Some(insert_start(tokens)), Vec::new()),
        Err(errors) => {
            let errors: Vec<E> = errors
                .into_iter()
                .map(|error| convert_lexer_error(source, &error, source_id))
                .collect();
            (None, errors)
        }
    }
}
/// Lexes `source` into a [`Tokens`] list prefixed with a `TokenKind::Start` token.
///
/// # Errors
///
/// Returns every lexer error, converted with `convert_lexer_error` and tagged with
/// source id `0`.
pub fn lex_source(source: &str) -> Result<Tokens, Vec<E>> {
    lexer()
        .parse(source)
        .into_result()
        // The parse result already owns its tokens, so they are moved, not cloned.
        .map(|tokens| Tokens(insert_start(tokens)))
        .map_err(|errors| {
            errors
                .into_iter()
                .map(|error| convert_lexer_error(source, &error, 0))
                .collect()
        })
}
/// Returns `tokens` with a zero-width `TokenKind::Start` token placed in front.
fn insert_start(tokens: Vec<Token>) -> Vec<Token> {
    let mut with_start = Vec::with_capacity(tokens.len() + 1);
    with_start.push(Token {
        kind: TokenKind::Start,
        span: 0..0,
    });
    with_start.extend(tokens);
    with_start
}
/// Builds the top-level lexer: any number of tokens followed by optional trailing
/// inline whitespace.
pub fn lexer<'a>() -> impl Parser<'a, ParserInput<'a>, Vec<Token>, ParserError<'a>> {
    let tokens = lex_token().repeated().collect::<Vec<Token>>();
    let trailing_whitespace = whitespace().or_not();
    tokens.then_ignore(trailing_whitespace)
}
/// Lexes a single token together with its span.
///
/// A range operator (`..`) is handled first because its meaning depends on the
/// whitespace around it: `bind_left` / `bind_right` are set when there is no whitespace
/// on the respective side, and the range token's span covers that whitespace. Every
/// other token skips leading whitespace, and its span covers the token only.
fn lex_token<'a>() -> impl Parser<'a, ParserInput<'a>, Token, ParserError<'a>> {
    let range = whitespace()
        .or_not()
        .then_ignore(just(".."))
        .then(whitespace().or_not())
        .map_with(|(space_before, space_after), extra| {
            let span: chumsky::span::SimpleSpan = extra.span();
            let kind = TokenKind::Range {
                bind_left: space_before.is_none(),
                bind_right: space_after.is_none(),
            };
            Token {
                kind,
                span: span.start()..span.end(),
            }
        });

    // The span is captured inside the whitespace skip so it excludes the whitespace.
    let spanned_token = token().map_with(|kind, extra| {
        let span: chumsky::span::SimpleSpan = extra.span();
        Token {
            kind,
            span: span.start()..span.end(),
        }
    });
    let other_tokens = whitespace().or_not().ignore_then(spanned_token);

    range.or(other_tokens)
}
/// Lexes the kind of a single token (without span or leading whitespace).
///
/// The alternatives are tried in order, so more specific forms come first: a line wrap
/// before a plain newline, multi-character operators before single-character controls,
/// interpolated strings, dates, literals and keywords before plain identifiers.
fn token<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
    let annotate = just('@').to(TokenKind::Annotate);
    let control = one_of("></%=+-*[]().,:|!{}").map(TokenKind::Control);
    let literal_token = literal().map(TokenKind::Literal).boxed();
    let ident = ident_part().map(TokenKind::Ident);

    choice((
        line_wrap().boxed(),
        newline().to(TokenKind::NewLine),
        multi_char_operators(),
        interpolation().boxed(),
        param(),
        date_token().boxed(),
        annotate,
        control,
        literal_token,
        keyword(),
        ident,
        comment(),
    ))
}
/// Lexes the two-character operators.
///
/// `&&` and `||` are only accepted when followed by an expression terminator
/// (see `end_expr`); the other operators have no such requirement.
fn multi_char_operators<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
    let logical_and = just("&&").to(TokenKind::And).then_ignore(end_expr());
    let logical_or = just("||").to(TokenKind::Or).then_ignore(end_expr());

    choice((
        just("->").to(TokenKind::ArrowThin),
        just("=>").to(TokenKind::ArrowFat),
        just("==").to(TokenKind::Eq),
        just("!=").to(TokenKind::Ne),
        just(">=").to(TokenKind::Gte),
        just("<=").to(TokenKind::Lte),
        just("~=").to(TokenKind::RegexSearch),
        logical_and,
        logical_or,
        just("??").to(TokenKind::Coalesce),
        just("//").to(TokenKind::DivInt),
        just("**").to(TokenKind::Pow),
    ))
}
/// Lexes a reserved word, which must be followed by an expression terminator
/// (see `end_expr`) so that longer identifiers starting with a keyword are not split.
fn keyword<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
    let reserved_word = choice((
        just("let"),
        just("into"),
        just("case"),
        just("prql"),
        just("type"),
        just("module"),
        just("internal"),
        just("func"),
        just("import"),
        just("enum"),
    ));

    reserved_word
        .to_slice()
        .then_ignore(end_expr())
        .map(|word: &str| TokenKind::Keyword(word.to_owned()))
}
/// Lexes a parameter: `$` followed by any number (possibly zero) of alphanumeric
/// characters, underscores or dots. The `$` is not part of the resulting name.
fn param<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
    let name = any()
        .filter(|c: &char| c.is_alphanumeric() || matches!(*c, '_' | '.'))
        .repeated()
        .to_slice()
        .map(|name: &str| name.to_owned());

    just('$').ignore_then(name).map(TokenKind::Param)
}
/// Lexes an interpolated string: an `s` or `f` marker immediately followed by a quoted
/// string (with escapes enabled).
fn interpolation<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
    let marker = one_of("sf");
    let body = quoted_string(true);
    marker
        .then(body)
        .map(|(marker, text)| TokenKind::Interpolation(marker, text))
}
/// Matches a run of at least one inline whitespace character, as defined by chumsky's
/// `text::inline_whitespace`.
fn whitespace<'a>() -> impl Parser<'a, ParserInput<'a>, (), ParserError<'a>> {
    let inline_run = text::inline_whitespace();
    inline_run.at_least(1)
}
fn newline<'a>() -> impl Parser<'a, ParserInput<'a>, (), ParserError<'a>> {
just('\n')
.or(just('\r').then_ignore(just('\n').or_not()))
.ignored()
}
/// Lexes a line continuation: a newline, any number of comment-only lines, then optional
/// whitespace and a backslash. The comments found in between are kept in the token.
fn line_wrap<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
    // A line holding nothing but (optionally indented) comment text.
    let comment_line = whitespace()
        .repeated()
        .ignore_then(comment())
        .then_ignore(newline());
    let comments = comment_line.repeated().collect();

    // Optional indentation followed by the continuation marker itself.
    let continuation = whitespace().repeated().then(just('\\'));

    newline()
        .ignore_then(comments)
        .then_ignore(continuation)
        .map(TokenKind::LineWrap)
}
fn comment<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
let comment_text = none_of("\n\r").repeated().collect::<String>();
just('#').ignore_then(
just('!')
.ignore_then(comment_text.map(TokenKind::DocComment))
.or(comment_text.map(TokenKind::Comment)),
)
}
/// Lexes one identifier segment.
///
/// Either a plain name (an alphabetic character or `_`, followed by alphanumerics or
/// `_`), or arbitrary text enclosed in backticks, returned without the backticks.
pub fn ident_part<'a>() -> impl Parser<'a, ParserInput<'a>, String, ParserError<'a>> {
    let first_char = any().filter(|c: &char| c.is_alphabetic() || *c == '_');
    let later_chars = any()
        .filter(|c: &char| c.is_alphanumeric() || *c == '_')
        .repeated();
    let plain = first_char
        .then(later_chars)
        .to_slice()
        .map(|name: &str| name.to_owned());

    let backtick = just('`')
        .ignore_then(none_of('`').repeated().collect::<String>())
        .then_ignore(just('`'));

    plain.or(backtick)
}
/// Matches exactly `count` decimal digits and yields them as a slice of the input.
fn digits<'a>(count: usize) -> impl Parser<'a, ParserInput<'a>, &'a str, ParserError<'a>> {
    let decimal_digits = chumsky::text::digits(10);
    decimal_digits.exactly(count).to_slice()
}
fn date_inner<'a>() -> impl Parser<'a, ParserInput<'a>, String, ParserError<'a>> {
text::digits(10)
.exactly(4)
.then(just('-'))
.then(text::digits(10).exactly(2))
.then(just('-'))
.then(text::digits(10).exactly(2))
.to_slice()
.map(|s: &str| s.to_owned())
}
/// Lexes a time of the form `HH[:MM[:SS[.ffffff]]][Z|±HH[:]MM]` and returns it as text.
///
/// Every component after the hours is optional. The fractional part takes one to six
/// digits. A colon inside a numeric timezone offset is accepted but dropped from the
/// returned text (`+05:30` becomes `+0530`).
fn time_inner<'a>() -> impl Parser<'a, ParserInput<'a>, String, ParserError<'a>> {
    /// Lexes an optional `separator`-prefixed component, yielding the separator plus the
    /// component text, or an empty string when the component is absent.
    fn time_component<'p>(
        separator: char,
        component_parser: impl Parser<'p, ParserInput<'p>, &'p str, ParserError<'p>>,
    ) -> impl Parser<'p, ParserInput<'p>, String, ParserError<'p>> {
        just(separator)
            .then(component_parser)
            .map(|(sep, comp): (char, &str)| {
                let mut text = String::with_capacity(sep.len_utf8() + comp.len());
                text.push(sep);
                text.push_str(comp);
                text
            })
            .or_not()
            .map(Option::unwrap_or_default)
    }

    let hours = digits(2).map(|h: &str| h.to_owned());
    let minutes = time_component(':', digits(2));
    let seconds = time_component(':', digits(2));

    let fraction_digits = any()
        .filter(|c: &char| c.is_ascii_digit())
        .repeated()
        .at_least(1)
        .at_most(6)
        .to_slice();
    let milliseconds = time_component('.', fraction_digits);

    let utc = just('Z').map(|z: char| z.to_string());
    let numeric_offset = one_of("-+")
        .then(digits(2))
        .then_ignore(just(':').or_not())
        .then(digits(2))
        .map(|((sign, hrs), mins): ((char, &str), &str)| format!("{}{}{}", sign, hrs, mins));
    let timezone = choice((utc, numeric_offset))
        .or_not()
        .map(Option::unwrap_or_default);

    hours
        .then(minutes)
        .then(seconds)
        .then(milliseconds)
        .then(timezone)
        .map(|((((hours, mins), secs), ms), tz)| [hours, mins, secs, ms, tz].concat())
}
/// Lexes a date/time literal: `@` followed by a timestamp (`date` `T` `time`), a date,
/// or a time, each of which must be followed by an expression terminator.
///
/// The `@` is only consumed when the next character is an ASCII digit, so a bare `@`
/// is left for the annotation token.
fn date_token<'a>() -> impl Parser<'a, ParserInput<'a>, TokenKind, ParserError<'a>> {
    // Look ahead for a digit without consuming it.
    let at_before_digit = just('@').then(any().filter(|c: &char| c.is_ascii_digit()).rewind());

    let timestamp = date_inner()
        .then(just('T'))
        .then(time_inner())
        .then_ignore(end_expr())
        .map(|((date, t), time)| Literal::Timestamp(format!("{}{}{}", date, t, time)));
    let date = date_inner().then_ignore(end_expr()).map(Literal::Date);
    let time = time_inner().then_ignore(end_expr()).map(Literal::Time);

    at_before_digit
        .ignore_then(choice((timestamp, date, time)))
        .map(TokenKind::Literal)
}
/// Lexes any literal other than date/time literals.
///
/// Alternatives are tried in order: prefixed integers (`0b`, `0x`, `0o`) before plain
/// numbers, strings, then value-with-unit before plain numbers, then booleans and null.
pub fn literal<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
    let prefixed_integers = choice((binary_number(), hexadecimal_number(), octal_number()));
    let strings = choice((string(), raw_string()));
    let bare_values = choice((value_and_unit(), number(), boolean(), null()));

    choice((prefixed_integers, strings, bare_values))
}
/// Lexes an integer literal written as `prefix`, an optional `_`, and then between one
/// and `max_digits` characters accepted by `valid_digit`, interpreted in `base`.
///
/// If the digits cannot be converted to an `i64`, the literal silently becomes `0`.
fn parse_number_with_base<'a>(
    prefix: &'static str,
    base: u32,
    max_digits: usize,
    valid_digit: impl Fn(&char) -> bool + 'a,
) -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
    let digit_run = any()
        .filter(valid_digit)
        .repeated()
        .at_least(1)
        .at_most(max_digits)
        .to_slice();

    just(prefix)
        .then(just("_").or_not())
        .ignore_then(digit_run)
        .map(move |digits: &str| Literal::Integer(i64::from_str_radix(digits, base).unwrap_or(0)))
}
/// Lexes a binary integer literal: `0b` followed by up to 32 binary digits.
fn binary_number<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
    parse_number_with_base("0b", 2, 32, |c| matches!(*c, '0' | '1'))
}
/// Lexes a hexadecimal integer literal: `0x` followed by up to 12 hex digits.
fn hexadecimal_number<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
    let is_hex_digit = |c: &char| c.is_ascii_hexdigit();
    parse_number_with_base("0x", 16, 12, is_hex_digit)
}
/// Lexes an octal integer literal: `0o` followed by up to 12 octal digits.
fn octal_number<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
    parse_number_with_base("0o", 8, 12, |c| matches!(*c, '0'..='7'))
}
/// Lexes a decimal number: an integer part, an optional fraction (`.` followed by a
/// digit and then digits or underscores) and an optional exponent (`e`/`E`, optional
/// sign, at least one digit).
///
/// Underscores are stripped before conversion. The text becomes an `Integer` when it
/// parses as `i64`, otherwise a `Float`; if neither conversion succeeds the literal
/// silently becomes `Integer(0)`.
fn number<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
    let fraction = just('.')
        .then(any().filter(|c: &char| c.is_ascii_digit()))
        .then(
            any()
                .filter(|c: &char| c.is_ascii_digit() || *c == '_')
                .repeated(),
        );

    let exponent = one_of("eE").then(one_of("+-").or_not()).then(
        any()
            .filter(|c: &char| c.is_ascii_digit())
            .repeated()
            .at_least(1),
    );

    // The consumed slice is exactly the integer, fraction and exponent text in order.
    parse_integer()
        .then(fraction.or_not())
        .then(exponent.or_not())
        .to_slice()
        .map(|text: &str| {
            let num_str = text.replace('_', "");
            num_str
                .parse::<i64>()
                .map(Literal::Integer)
                .or_else(|_| num_str.parse::<f64>().map(Literal::Float))
                .unwrap_or(Literal::Integer(0))
        })
}
/// Lexes the integer part of a number and yields it as a slice of the input.
///
/// Either a non-zero digit followed by digits or underscores, or a single `0`
/// (so leading zeros are not consumed as part of one integer).
fn parse_integer<'a>() -> impl Parser<'a, ParserInput<'a>, &'a str, ParserError<'a>> {
    let leading_nonzero = any().filter(|c: &char| matches!(*c, '1'..='9'));
    let trailing = any()
        .filter(|c: &char| c.is_ascii_digit() || *c == '_')
        .repeated();

    let nonzero_integer = leading_nonzero.then(trailing).to_slice();
    let lone_zero = just('0').to_slice();

    nonzero_integer.or(lone_zero)
}
/// Lexes a quoted string literal with escape sequences enabled.
fn string<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
    let with_escapes = true;
    quoted_string(with_escapes).map(Literal::String)
}
/// Lexes a raw string literal: `r` followed by text enclosed in a pair of matching
/// single or double quotes. No escape sequences are processed.
///
/// The body may not contain the enclosing quote character or a line break. The other
/// quote character is allowed, so `r"it's"` is a valid raw string. The closing quote must
/// be the same character as the opening one; `r'abc"` is not a raw string.
fn raw_string<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
    /// Lexes text delimited on both sides by `quote`, yielding the text between them.
    fn delimited_raw<'p>(
        quote: char,
    ) -> impl Parser<'p, ParserInput<'p>, &'p str, ParserError<'p>> {
        any()
            .filter(move |c: &char| *c != quote && *c != '\n' && *c != '\r')
            .repeated()
            .to_slice()
            .delimited_by(just(quote), just(quote))
    }

    just("r")
        .ignore_then(choice((delimited_raw('\''), delimited_raw('"'))))
        .map(|s: &str| Literal::RawString(s.to_string()))
}
/// Lexes `true` or `false`, which must be followed by an expression terminator.
fn boolean<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
    let truthy = just("true").to(true);
    let falsy = just("false").to(false);

    truthy
        .or(falsy)
        .then_ignore(end_expr())
        .map(Literal::Boolean)
}
/// Lexes `null`, which must be followed by an expression terminator.
fn null<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
    let keyword = just("null");
    keyword.then_ignore(end_expr()).to(Literal::Null)
}
/// Lexes an integer immediately followed by a time unit (e.g. `5days`), which must be
/// followed by an expression terminator.
///
/// Underscores in the integer are ignored. NOTE: if the integer does not fit in an
/// `i64`, the value silently becomes `1`.
fn value_and_unit<'a>() -> impl Parser<'a, ParserInput<'a>, Literal, ParserError<'a>> {
    let unit = choice((
        just("microseconds"),
        just("milliseconds"),
        just("seconds"),
        just("minutes"),
        just("hours"),
        just("days"),
        just("weeks"),
        just("months"),
        just("years"),
    ));

    parse_integer()
        .then(unit)
        .then_ignore(end_expr())
        .map(|(number_str, unit_str): (&str, &str)| {
            let digits: String = number_str.chars().filter(|&c| c != '_').collect();
            let n = digits.parse::<i64>().unwrap_or(1);
            let unit = unit_str.to_owned();
            Literal::ValueAndUnit(ValueAndUnit { n, unit })
        })
}
/// Lexes a string delimited by double or single quotes and returns its contents.
///
/// When `escaped` is true, backslash escape sequences inside the string are decoded.
pub fn quoted_string<'a>(
    escaped: bool,
) -> impl Parser<'a, ParserInput<'a>, String, ParserError<'a>> {
    let double_quoted = multi_quoted_string(&'"', escaped);
    let single_quoted = multi_quoted_string(&'\'', escaped);

    double_quoted
        .or(single_quoted)
        .map(|chars: Vec<char>| chars.into_iter().collect::<String>())
}
/// Decodes one escape sequence. The leading backslash has already been consumed by the
/// caller; this consumes the rest of the sequence from `input` and returns the character
/// it stands for.
///
/// Supported forms:
/// - `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t` and the escaped `quote_char`;
/// - `\u{H…}` with up to six hex digits; a value that is not a valid `char` yields
///   U+FFFD, and reading stops early at the first character that is neither a hex digit
///   nor `}`;
/// - `\xHH` with exactly two hex digits.
///
/// Any other character after the backslash is returned as-is. A malformed `\x` escape
/// yields `x` and consumes nothing after it, so the following characters stay part of
/// the string. A backslash at the end of input yields a literal backslash.
fn parse_escape_sequence<'a>(
    input: &mut chumsky::input::InputRef<'a, '_, ParserInput<'a>, ParserError<'a>>,
    quote_char: char,
) -> char {
    match input.peek() {
        Some(next_ch) => {
            input.next();
            match next_ch {
                '\\' => '\\',
                '/' => '/',
                'b' => '\x08',
                'f' => '\x0C',
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                'u' if input.peek() == Some('{') => {
                    // Consume the opening brace.
                    input.next();
                    let mut hex = String::new();
                    while let Some(ch) = input.peek() {
                        if ch == '}' {
                            input.next();
                            break;
                        }
                        if ch.is_ascii_hexdigit() && hex.len() < 6 {
                            hex.push(ch);
                            input.next();
                        } else {
                            break;
                        }
                    }
                    char::from_u32(u32::from_str_radix(&hex, 16).unwrap_or(0))
                        .unwrap_or('\u{FFFD}')
                }
                'x' => {
                    // Remember where the digits start so that a malformed escape does
                    // not swallow a lone hex digit.
                    let digits_start = input.save();
                    let mut hex = String::with_capacity(2);
                    while hex.len() < 2 {
                        match input.peek() {
                            Some(ch) if ch.is_ascii_hexdigit() => {
                                hex.push(ch);
                                input.next();
                            }
                            _ => break,
                        }
                    }
                    if hex.len() == 2 {
                        char::from_u32(u32::from_str_radix(&hex, 16).unwrap_or(0))
                            .unwrap_or('\u{FFFD}')
                    } else {
                        // Not a full `\xHH`: give back anything read and keep the `x`.
                        input.rewind(digits_start);
                        next_ch
                    }
                }
                // The escaped quote and any unrecognised escape both map to the
                // character itself.
                c if c == quote_char => quote_char,
                other => other,
            }
        }
        // Backslash at end of input: keep it literally.
        None => '\\',
    }
}
/// Lexes a string delimited by a run of `quote` characters and returns its contents.
///
/// The number of consecutive opening quotes decides the delimiter: an odd count `n`
/// means the string is closed by the next run of `n` quotes, while an even count is
/// treated as an empty string (the quotes are consumed and nothing else).
/// When `escaping` is true, a backslash starts an escape sequence that is decoded by
/// `parse_escape_sequence`.
///
/// Fails if the input does not start with `quote`, or if the input ends before the
/// closing delimiter is found.
fn multi_quoted_string<'a>(
quote: &char,
escaping: bool,
) -> impl Parser<'a, ParserInput<'a>, Vec<char>, ParserError<'a>> {
let quote_char = *quote;
custom(move |input| {
let start_cursor = input.save();
// Count (and consume) the run of opening quotes.
let mut open_count = 0;
while let Some(ch) = input.peek() {
if ch == quote_char {
input.next();
open_count += 1;
} else {
break;
}
}
// No opening quote at all: this is not a string.
if open_count == 0 {
let span = input.span_since(start_cursor.cursor());
return Err(Simple::new(input.peek_maybe(), span));
}
// An even number of quotes is read as an empty string.
if open_count % 2 == 0 {
return Ok(vec![]);
}
let mut result = Vec::new();
loop {
// Try to read the closing delimiter: `open_count` quotes in a row.
let checkpoint = input.save();
let mut close_count = 0;
while close_count < open_count {
match input.peek() {
Some(ch) if ch == quote_char => {
input.next();
close_count += 1;
}
_ => break,
}
}
if close_count == open_count {
return Ok(result);
}
// Not a full closing delimiter: undo the lookahead and take one content character.
input.rewind(checkpoint);
match input.next() {
Some(ch) => {
if escaping && ch == '\\' {
let escaped = parse_escape_sequence(input, quote_char);
result.push(escaped);
} else {
result.push(ch);
}
}
// Input ended before the closing delimiter: report an error at the current position.
None => {
let current_cursor = input.save();
let span = input.span_since(current_cursor.cursor());
return Err(Simple::new(None, span));
}
}
}
})
}
/// Checks, without consuming anything, that the current position ends an expression:
/// end of input, one of `,)]}`, tab, space or `>`, a line break, or a `..` range.
fn end_expr<'a>() -> impl Parser<'a, ParserInput<'a>, (), ParserError<'a>> {
    let terminator = choice((
        end(),
        one_of(",)]}\t >").ignored(),
        newline(),
        just("..").ignored(),
    ));

    // Only a lookahead: the terminator itself is left for the next token.
    terminator.rewind()
}