use super::token::{Token, TokenKind};
use chumsky::input::StrInput;
use chumsky::prelude::*;
/// Error type shared by every lexer combinator: rich errors over `char` input.
type LexerError<'src> = chumsky::extra::Err<Rich<'src, char, SimpleSpan>>;
/// Lexes a run of horizontal whitespace into a single [`TokenKind::Whitespace`].
///
/// Newlines are deliberately excluded — they are lexed by `linebreak_parser`
/// so the token stream preserves line structure.
fn whitespace_parser<'src, I>() -> impl Parser<'src, I, TokenKind, LexerError<'src>> + Clone
where
    I: StrInput<'src, Token = char, Span = SimpleSpan, Slice = &'src str>,
{
    one_of([' ', '\t', '\r'])
        .repeated()
        .at_least(1)
        .to(TokenKind::Whitespace)
}
/// Lexes one or more consecutive newline sequences into a single
/// [`TokenKind::LineBreak`] token.
fn linebreak_parser<'src, I>() -> impl Parser<'src, I, TokenKind, LexerError<'src>> + Clone
where
    I: StrInput<'src, Token = char, Span = SimpleSpan, Slice = &'src str>,
{
    // One mandatory newline followed by any number of further newlines —
    // equivalent to `repeated().at_least(1)`.
    text::newline()
        .then(text::newline().repeated())
        .to(TokenKind::LineBreak)
}
fn comment_parser<'src, I>() -> impl Parser<'src, I, TokenKind, LexerError<'src>> + Clone
where
I: StrInput<'src, Token = char, Span = SimpleSpan, Slice = &'src str>,
{
let endline = text::newline().or(end());
let single_line = just("//")
.ignore_then(any().and_is(endline.not()).repeated())
.then_ignore(endline.rewind())
.to(TokenKind::SingleLineComment);
let multi_line = just("/*")
.ignore_then(any().and_is(just("*/").not()).repeated())
.then_ignore(just("*/"))
.to(TokenKind::MultiLineComment);
single_line.or(multi_line)
}
/// Lexes a double-quoted string literal into [`TokenKind::Str`].
///
/// NOTE(review): there is no escape-sequence handling — a `\"` inside the
/// literal terminates it, and unescaped newlines are accepted inside the
/// quotes. Confirm this matches the language specification.
fn string_parser<'src, I>() -> impl Parser<'src, I, TokenKind, LexerError<'src>> + Clone
where
    I: StrInput<'src, Token = char, Span = SimpleSpan, Slice = &'src str>,
{
    just('"')
        .ignore_then(none_of('"').repeated())
        .then_ignore(just('"'))
        .to(TokenKind::Str)
}
/// Lexes a numeric literal: a float (`digits '.' digits`) or a plain integer.
fn number_parser<'src, I>() -> impl Parser<'src, I, TokenKind, LexerError<'src>> + Clone
where
    I: StrInput<'src, Token = char, Span = SimpleSpan, Slice = &'src str>,
{
    // A float only matches when it is NOT immediately followed by another
    // `.`. This makes `1.0.0` fail the float parse at `1`, so it lexes as
    // Int("1") Dot Float("0.0") instead of Float("1.0") `.` `0`; the trailing
    // Dot+Float pair is later split into Int Dot Int by
    // `split_projection_float_tokens`, yielding the projection chain
    // Int Dot Int Dot Int. The `rewind()` makes the lookahead consume nothing.
    let float = text::int::<I, _>(10)
        .then_ignore(just('.'))
        .then(text::digits::<I, _>(10))
        .then_ignore(just('.').not().ignored().or(end()).rewind())
        .to(TokenKind::Float);
    let int = text::int::<I, LexerError<'src>>(10).to(TokenKind::Int);
    // Try the longer float form first, then fall back to a plain integer.
    float.or(int)
}
/// Lexes operator tokens.
///
/// Two-character operators are tried before any one-character operator so
/// that e.g. `==` is never lexed as `=` `=` — the overall try order is
/// identical to listing them all in a single `choice`.
fn operator_parser<'src, I>() -> impl Parser<'src, I, TokenKind, LexerError<'src>> + Clone
where
    I: StrInput<'src, Token = char, Span = SimpleSpan, Slice = &'src str>,
{
    let two_char = choice((
        just("->").to(TokenKind::Arrow),
        just("<-").to(TokenKind::LeftArrow),
        just("=>").to(TokenKind::FatArrow),
        just("==").to(TokenKind::OpEqual),
        just("!=").to(TokenKind::OpNotEqual),
        just("<=").to(TokenKind::OpLessEqual),
        just(">=").to(TokenKind::OpGreaterEqual),
        just("&&").to(TokenKind::OpAnd),
        just("||").to(TokenKind::OpOr),
        just("|>").to(TokenKind::OpPipe),
    ));
    let one_char = choice((
        just("+").to(TokenKind::OpSum),
        just("-").to(TokenKind::OpMinus),
        just("*").to(TokenKind::OpProduct),
        just("/").to(TokenKind::OpDivide),
        just("%").to(TokenKind::OpModulo),
        just("^").to(TokenKind::OpExponent),
        just("@").to(TokenKind::OpAt),
        just("<").to(TokenKind::OpLessThan),
        just(">").to(TokenKind::OpGreaterThan),
        just("=").to(TokenKind::Assign),
        just("!").to(TokenKind::MacroExpand),
    ));
    two_char.or(one_char)
}
/// Lexes punctuation and bracket tokens.
///
/// The two-character forms (`::`, `..`) are tried before any single
/// character so `::` is never lexed as two `:` tokens.
fn punctuation_parser<'src, I>() -> impl Parser<'src, I, TokenKind, LexerError<'src>> + Clone
where
    I: StrInput<'src, Token = char, Span = SimpleSpan, Slice = &'src str>,
{
    let two_char = choice((
        just("::").to(TokenKind::DoubleColon),
        just("..").to(TokenKind::DoubleDot),
    ));
    let one_char = choice((
        just(".").to(TokenKind::Dot),
        just(",").to(TokenKind::Comma),
        just(":").to(TokenKind::Colon),
        // `;` is lexed as the same token as a newline: a statement separator.
        just(";").to(TokenKind::LineBreak),
        just("(").to(TokenKind::ParenBegin),
        just(")").to(TokenKind::ParenEnd),
        just("[").to(TokenKind::ArrayBegin),
        just("]").to(TokenKind::ArrayEnd),
        just("{").to(TokenKind::BlockBegin),
        just("}").to(TokenKind::BlockEnd),
        just("`").to(TokenKind::BackQuote),
        just("$").to(TokenKind::Dollar),
        just("#").to(TokenKind::Sharp),
        just("|").to(TokenKind::LambdaArgBeginEnd),
    ));
    two_char.or(one_char)
}
/// Lexes an identifier and classifies reserved words into keyword tokens.
///
/// Any word that is not a reserved keyword becomes [`TokenKind::Ident`].
fn identifier_parser<'src, I>() -> impl Parser<'src, I, TokenKind, LexerError<'src>> + Clone
where
    I: StrInput<'src, Token = char, Span = SimpleSpan, Slice = &'src str>,
{
    text::ident()
        .to_slice()
        .map(|word: &'src str| match word {
            // Declarations
            "fn" => TokenKind::Function,
            "macro" => TokenKind::Macro,
            "let" => TokenKind::Let,
            "letrec" => TokenKind::LetRec,
            "type" => TokenKind::Type,
            "alias" => TokenKind::Alias,
            "rec" => TokenKind::Rec,
            // Control flow
            "if" => TokenKind::If,
            "else" => TokenKind::Else,
            "match" => TokenKind::Match,
            // Built-in values
            "self" => TokenKind::SelfLit,
            "now" => TokenKind::Now,
            "samplerate" => TokenKind::SampleRate,
            // Type names
            "float" => TokenKind::FloatType,
            "int" => TokenKind::IntegerType,
            "string" => TokenKind::StringType,
            "struct" => TokenKind::StructType,
            // Modules and misc
            "include" => TokenKind::Include,
            "stage" => TokenKind::StageKwd,
            "main" => TokenKind::Main,
            "mod" => TokenKind::Mod,
            "use" => TokenKind::Use,
            "pub" => TokenKind::Pub,
            "_" => TokenKind::PlaceHolder,
            // Everything else is a plain identifier.
            _ => TokenKind::Ident,
        })
}
/// The combined lexer for a single token of any kind.
///
/// The alternatives are tried in order, so the sequence matters: comments
/// before operators (`//` vs `/`), numbers before punctuation (`1.5` vs
/// `.`), and operators before punctuation (`|>` vs `|`).
fn token_parser<'src, I>() -> impl Parser<'src, I, TokenKind, LexerError<'src>> + Clone
where
    I: StrInput<'src, Token = char, Span = SimpleSpan, Slice = &'src str>,
{
    comment_parser()
        .or(linebreak_parser())
        .or(whitespace_parser())
        .or(string_parser())
        .or(number_parser())
        .or(identifier_parser())
        .or(operator_parser())
        .or(punctuation_parser())
}
/// Post-processing pass: a Float token that directly follows an adjacent
/// `.` is a tuple-projection chain (e.g. the `0.0` in `c.0.0`), not a float
/// literal, so it is split into Int `.` Int.
fn split_projection_float_tokens(tokens: Vec<Token>, source: &str) -> Vec<Token> {
    // True when `s` is non-empty and consists solely of ASCII digits.
    fn all_digits(s: &str) -> bool {
        !s.is_empty() && s.bytes().all(|b| b.is_ascii_digit())
    }

    let mut out: Vec<Token> = Vec::with_capacity(tokens.len());
    for token in tokens {
        // The previous emitted token must be a Dot touching this token
        // (no whitespace between them) for the split to apply.
        let follows_dot = out
            .last()
            .map_or(false, |prev| prev.kind == TokenKind::Dot && prev.end() == token.start);
        let split = if follows_dot && token.kind == TokenKind::Float {
            token
                .text(source)
                .split_once('.')
                .filter(|(head, tail)| all_digits(head) && all_digits(tail))
                .map(|(head, tail)| (head.len(), tail.len()))
        } else {
            None
        };
        match split {
            Some((head_len, tail_len)) => {
                // `head_len` / `tail_len` are byte lengths; safe as token
                // lengths because both halves are ASCII digits.
                out.push(Token::new(TokenKind::Int, token.start, head_len));
                out.push(Token::new(TokenKind::Dot, token.start + head_len, 1));
                out.push(Token::new(TokenKind::Int, token.start + head_len + 1, tail_len));
            }
            None => out.push(token),
        }
    }
    out
}
/// Tokenizes `source` into a flat token stream, always terminated by an
/// EOF token.
///
/// Lexing recovers from invalid characters by emitting a one-character
/// `Error` token per offending character; recovered errors are reported on
/// stderr but do not abort tokenization.
pub fn tokenize(source: &str) -> Vec<Token> {
    // Fallback alternative: swallow a single unrecognized character as an
    // Error token so lexing can continue past it.
    let error_token = any().map_with(|_, extra| {
        let span: SimpleSpan = extra.span();
        Token::new(TokenKind::Error, span.start, span.end - span.start)
    });
    let lexer = token_parser()
        .map_with(|kind, extra| {
            let span: SimpleSpan = extra.span();
            Token::new(kind, span.start, span.end - span.start)
        })
        .or(error_token)
        .repeated()
        .collect::<Vec<_>>()
        .then_ignore(end());
    let (output, errors) = lexer.parse(source).into_output_errors();
    if !errors.is_empty() {
        eprintln!("Tokenization recovered from {} errors:", errors.len());
        for error in &errors {
            eprintln!(" - {error:?}");
        }
    }
    // With no output at all, fall back to an empty stream; either way the
    // stream is terminated with a zero-length EOF token.
    let mut tokens =
        output.map_or_else(Vec::new, |tokens| split_projection_float_tokens(tokens, source));
    tokens.push(Token::new(TokenKind::Eof, source.len(), 0));
    tokens
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Kinds of every token except whitespace and the trailing EOF.
    fn significant_kinds(tokens: &[Token]) -> Vec<TokenKind> {
        tokens
            .iter()
            .filter(|t| !matches!(t.kind, TokenKind::Whitespace | TokenKind::Eof))
            .map(|t| t.kind)
            .collect()
    }

    #[test]
    fn test_tokenize_simple() {
        let source = "fn dsp() { 42 }";
        let tokens = tokenize(source);
        assert_eq!(tokens[0].kind, TokenKind::Function);
        assert_eq!(tokens[0].text(source), "fn");
        assert_eq!(tokens[1].kind, TokenKind::Whitespace);
        assert_eq!(tokens[2].kind, TokenKind::Ident);
        assert_eq!(tokens[2].text(source), "dsp");
    }

    #[test]
    fn test_tokenize_numbers() {
        let source = "42 3.14";
        let tokens = tokenize(source);
        assert_eq!(tokens[0].kind, TokenKind::Int);
        assert_eq!(tokens[0].text(source), "42");
        assert_eq!(tokens[2].kind, TokenKind::Float);
        assert_eq!(tokens[2].text(source), "3.14");
    }

    #[test]
    fn test_tokenize_projection_chain_numbers() {
        // `c.0.0` must lex as a projection chain, not as Ident Dot Float.
        let tokens = tokenize("c.0.0");
        assert_eq!(
            significant_kinds(&tokens),
            vec![
                TokenKind::Ident,
                TokenKind::Dot,
                TokenKind::Int,
                TokenKind::Dot,
                TokenKind::Int
            ]
        );
    }

    #[test]
    fn test_tokenize_string() {
        let source = r#""hello world""#;
        let tokens = tokenize(source);
        assert_eq!(tokens[0].kind, TokenKind::Str);
        // The token span includes the surrounding quotes.
        assert_eq!(tokens[0].text(source), r#""hello world""#);
    }

    #[test]
    fn test_tokenize_comments() {
        let source = "// single line\n/* multi\nline */";
        let tokens = tokenize(source);
        // The line comment does not consume its terminating newline.
        assert_eq!(tokens[0].kind, TokenKind::SingleLineComment);
        assert_eq!(tokens[0].text(source), "// single line");
        assert_eq!(tokens[1].kind, TokenKind::LineBreak);
        assert_eq!(tokens[2].kind, TokenKind::MultiLineComment);
        assert_eq!(tokens[2].text(source), "/* multi\nline */");
    }

    #[test]
    fn test_tokenize_operators() {
        let tokens = tokenize("+ - * / == != < <= > >= && || |>");
        assert_eq!(
            significant_kinds(&tokens),
            vec![
                TokenKind::OpSum,
                TokenKind::OpMinus,
                TokenKind::OpProduct,
                TokenKind::OpDivide,
                TokenKind::OpEqual,
                TokenKind::OpNotEqual,
                TokenKind::OpLessThan,
                TokenKind::OpLessEqual,
                TokenKind::OpGreaterThan,
                TokenKind::OpGreaterEqual,
                TokenKind::OpAnd,
                TokenKind::OpOr,
                TokenKind::OpPipe,
            ]
        );
    }

    #[test]
    fn test_trivia_detection() {
        let source = "fn // comment\n dsp";
        let tokens = tokenize(source);
        // fn, <ws>, comment, linebreak, <ws>, dsp
        assert!(!tokens[0].is_trivia());
        assert!(tokens[1].is_trivia());
        assert!(tokens[2].is_trivia());
        assert!(tokens[3].is_trivia());
        assert!(tokens[4].is_trivia());
        assert!(!tokens[5].is_trivia());
    }

    #[test]
    fn test_error_recovery() {
        let source = "fn dsp() { 42 § }";
        let tokens = tokenize(source);
        let token_kinds: Vec<_> = tokens
            .iter()
            .filter(|t| !t.is_trivia() && t.kind != TokenKind::Eof)
            .map(|t| t.kind)
            .collect();
        assert!(token_kinds.contains(&TokenKind::Function));
        assert!(token_kinds.contains(&TokenKind::Ident));
        assert!(token_kinds.contains(&TokenKind::Int));
        assert!(token_kinds.contains(&TokenKind::Error));
        assert!(token_kinds.contains(&TokenKind::BlockBegin));
        assert!(token_kinds.contains(&TokenKind::BlockEnd));
    }

    #[test]
    fn test_error_recovery_multiple_errors() {
        let tokens = tokenize("fn § dsp() { © }");
        // Each invalid character becomes exactly one Error token.
        let error_count = tokens.iter().filter(|t| t.is_error()).count();
        assert_eq!(error_count, 2);
        assert!(tokens.iter().any(|t| t.kind == TokenKind::Function));
        assert!(tokens.iter().any(|t| t.kind == TokenKind::Ident));
    }

    #[test]
    fn test_tokenize_module_keywords() {
        let tokens = tokenize("mod use pub");
        assert_eq!(
            significant_kinds(&tokens),
            vec![TokenKind::Mod, TokenKind::Use, TokenKind::Pub]
        );
    }

    #[test]
    fn test_tokenize_double_colon() {
        let tokens = tokenize("mod::path::name");
        assert_eq!(
            significant_kinds(&tokens),
            vec![
                TokenKind::Mod,
                TokenKind::DoubleColon,
                TokenKind::Ident,
                TokenKind::DoubleColon,
                TokenKind::Ident
            ]
        );
    }

    #[test]
    fn test_tokenize_module_declaration() {
        let tokens = tokenize("mod mymod { pub fn foo() { 42 } }");
        assert!(tokens.iter().any(|t| t.kind == TokenKind::Mod));
        assert!(tokens.iter().any(|t| t.kind == TokenKind::Pub));
        assert!(tokens.iter().any(|t| t.kind == TokenKind::Function));
        assert!(tokens.iter().any(|t| t.kind == TokenKind::Ident));
    }

    #[test]
    fn test_tokenize_use_statement() {
        let tokens = tokenize("use modA::modB::func");
        assert_eq!(
            significant_kinds(&tokens),
            vec![
                TokenKind::Use,
                TokenKind::Ident,
                TokenKind::DoubleColon,
                TokenKind::Ident,
                TokenKind::DoubleColon,
                TokenKind::Ident
            ]
        );
    }
}