use nom::{
branch::alt,
bytes::complete::{is_a, is_not, take},
bytes::complete::take_while,
character::complete::{anychar, char, digit0, digit1, not_line_ending, one_of},
combinator::{map, opt, recognize, value},
error::context,
multi::{many0, many1, many_till},
Parser,
sequence::{delimited, tuple},
};
use nom_supreme::ParserExt;
use nom_supreme::tag::complete::{tag, tag_no_case};
pub use enums::*;
use crate::{TokenizationError, TokenizationResult};
use crate::Span;
pub mod enums;
pub mod traits;
/// First-letter alphabet for value identifiers (`Identifier::Identifier`).
pub const LOWER_ALPHABET: &str = "abcdefghijklmnopqrstuvwxyz";
/// First-letter alphabet for type identifiers (`Identifier::Type`).
pub const UPPER_ALPHABET: &str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
// Expands to a single `value($token, parser)` literal parser.
// Three flavors:
// - `exact`: wraps the tag in `exact_literal_token` (an identity wrapper,
//   kept so all arms share the same call shape).
// - default: a bare `tag` — matches as a prefix (e.g. "fun" also matches
//   the start of "function").
// - `soft`: via `soft_literal_token`, the tag must form a whole identifier.
macro_rules! include_literal {
(exact $tag:literal => $token:expr) => {
value($token, exact_literal_token(tag($tag)))
};
($tag:literal => $token:expr) => {
value($token, tag($tag))
};
(soft $tag:literal => $token:expr) => {
value($token, soft_literal_token($tag))
};
}
// Expands a `[specifier] "tag" => Token,` list into a tuple of
// `include_literal!` parsers, suitable for passing directly to `alt`.
// Entries are tried in declaration order, so longer tags must come first.
macro_rules! include_literals {
{$($($specifier:ident)* $tag:literal => $token:expr,)+} => {
($(include_literal!($($specifier)* $tag => $token),)+)
};
}
/// Identity wrapper around `parser`: forwards every input unchanged.
///
/// Exists only so the `exact` arm of `include_literal!` has the same call
/// shape as `soft_literal_token` in the `soft` arm.
pub fn exact_literal_token<'t, O, P: Parser<&'t str, O, TokenizationError<'t>>>(
    mut parser: P,
) -> impl FnMut(&'t str) -> TokenizationResult<O> {
    move |rest| parser.parse(rest)
}
/// Matches `literal` only when it forms a *complete* identifier token, so
/// e.g. `struct` is recognized but `structure` is not split into
/// `struct` + `ure`.
pub fn soft_literal_token(literal: Span) -> impl Parser<Span, Span, TokenizationError> {
    // Both identifier variants carry the raw matched text.
    let as_text = |parsed| match parsed {
        Identifier::Type(text) | Identifier::Identifier(text) => text,
    };
    identifier
        .map(as_text)
        .verify(move |&word| word == literal)
}
fn ignore(input: Span) -> TokenizationResult<Ignore> {
let comment = not_line_ending
.cut()
.preceded_by(tag("//"))
.recognize()
.map(Ignore::Comment);
let multiline_comment = tag("/*")
.precedes(many_till(anychar, tag("*/")).cut())
.recognize()
.map(Ignore::MultilineComment);
context(
"ignore",
alt((
comment,
value(Ignore::Whitespace, many1(is_a(" \t"))),
value(Ignore::Newline, many1(is_a("\n\r"))),
multiline_comment,
)),
)(input)
}
/// Parses a keyword token.
///
/// Plain entries use a bare `tag` and therefore match as *prefixes*:
/// "iffy" lexes as `Keyword::If` followed by identifier "fy".
/// NOTE(review): the `soft` entries below are protected against exactly
/// that (they must form a whole identifier) — confirm the hard/soft split
/// is deliberate for `fun`, `val`, `if`, etc.
fn keyword(input: Span) -> TokenizationResult<Keyword> {
    context(
        "keyword",
        alt(include_literals! {
            // Hard keywords: matched verbatim, prefix-style.
            "fun" => Keyword::Fun,
            "val" => Keyword::Val,
            "var" => Keyword::Var,
            "if" => Keyword::If,
            "elif" => Keyword::Elif,
            "else" => Keyword::Else,
            "match" => Keyword::Match,
            "while" => Keyword::While,
            "module" => Keyword::Module,
            "extend" => Keyword::Extend,
            "\\" => Keyword::Lambda,
            // Soft keywords: only match when they are a complete identifier.
            soft "abstract" => Keyword::Abstract,
            soft "trait" => Keyword::Trait,
            soft "struct" => Keyword::Struct,
            soft "class" => Keyword::Class,
            soft "enum" => Keyword::Enum,
            soft "foreign" => Keyword::Foreign,
            soft "type" => Keyword::TypeAlias,
            soft "with" => Keyword::With,
        }),
    )(input)
}
/// Parses a punctuation symbol.
///
/// Entries are tried in order, so `::` must precede `:`.
/// NOTE(review): `_` (TypeGap) is a bare `tag`, and `symbol` is tried
/// before `identifier` in `token`, so `_foo` would lex as `TypeGap`
/// followed by identifier `foo` — confirm that is intended.
fn symbol(input: Span) -> TokenizationResult<Symbol> {
    context(
        "symbol",
        alt(include_literals! {
            "," => Symbol::Comma,
            ";" => Symbol::Semicolon,
            "{" => Symbol::LBrace,
            "}" => Symbol::RBrace,
            "[" => Symbol::LBracket,
            "]" => Symbol::RBracket,
            "(" => Symbol::LParen,
            ")" => Symbol::RParen,
            "_" => Symbol::TypeGap,
            // Longest match first: `::` before `:`.
            "::" => Symbol::DoubleColon,
            ":" => Symbol::Colon,
        }),
    )(input)
}
fn identifier(input: Span) -> TokenizationResult<Identifier> {
let identifier_parser = |alphabet| {
recognize(tuple((
tag("_").opt(),
one_of(alphabet),
take_while(|it: char| it == '_' || it.is_alphanumeric()),
)))
};
context(
"identifier",
alt((
map(identifier_parser(LOWER_ALPHABET), Identifier::Identifier),
map(identifier_parser(UPPER_ALPHABET), Identifier::Type),
)),
)(input)
}
fn literal(input: Span) -> TokenizationResult<Literal> {
fn number_parser<'a>(
prefix: &'static str,
alphabet: &'static str,
) -> impl Parser<Span<'a>, &'a str, TokenizationError<'a>> {
tag_no_case(prefix)
.precedes(
tuple((
one_of(alphabet),
many0(one_of(alphabet).or(one_of("0_"))),
one_of(alphabet).or(char('0')),
))
.recognize()
.or(one_of(alphabet).or(char('0')).recognize())
.cut(),
)
.recognize()
}
let binary = number_parser("0b", "1");
let octal = number_parser("0c", "1234567");
let hex = number_parser("0x", "123456789ABCDEF");
let floating = recognize(tuple((
opt(one_of("-+")),
alt((
tuple((digit1, opt(tuple((char('.'), digit0))))).recognize(),
tuple((char('.'), digit1)).recognize(),
)),
opt(tuple((tag_no_case("e"), opt(one_of("-+")), digit1))),
)));
let char_p = delimited(char('\''), char('\'').not().recognize(), char('\''));
let string = delimited(char('"'), is_not(r#"""#).opt(), char('"'));
context(
"literal",
alt((
map(binary, Literal::Binary),
map(octal, Literal::Octal),
map(hex, Literal::Hex),
map(floating, Literal::Floating),
map(char_p, Literal::Char),
map(string, |it| Literal::String(it.unwrap_or_default())),
)),
)(input)
}
/// Parses an operator token, grouped by category (`Math`, `Comparison`,
/// `Logic`, `Bit`).
///
/// Within each group, entries are tried in order, so longer operators are
/// listed before their prefixes (`**` before `*`, `<=>`/`<=` before `<`,
/// `==` before `=`, `||` before `|`, `&&` before `&`).
fn operator(input: Span) -> TokenizationResult<Operator> {
    context(
        "operator",
        alt((
            // Uncategorized structural operators.
            alt(include_literals! {
                "." => Operator::Dot,
                "=>" => Operator::Flow,
            }),
            map(
                alt(include_literals! {
                    "+" => MathOperator::Plus,
                    "-" => MathOperator::Sub,
                    "**" => MathOperator::Pow,
                    "*" => MathOperator::Times,
                    "/" => MathOperator::Div,
                    "%" => MathOperator::Mod,
                }),
                Operator::Math,
            ),
            map(
                alt(include_literals! {
                    "<=>" => ComparisonOperator::Spaceship,
                    "==" => ComparisonOperator::Equiv,
                    "=" => ComparisonOperator::Equals,
                    "!=" => ComparisonOperator::NotEquiv,
                    ">=" => ComparisonOperator::GreaterEquals,
                    ">" => ComparisonOperator::Greater,
                    "<=" => ComparisonOperator::LessEquals,
                    "<" => ComparisonOperator::Less,
                }),
                Operator::Comparison,
            ),
            map(
                // `||`/`&&` must be tried before the single-char bit forms,
                // which is guaranteed by this group preceding `Bit` below.
                alt(include_literals! {
                    "||" => LogicOperator::OrLogic,
                    "&&" => LogicOperator::AndLogic,
                    "!" => LogicOperator::NotLogic,
                }),
                Operator::Logic,
            ),
            map(
                alt(include_literals! {
                    "|" => BitOperator::OrBit,
                    "&" => BitOperator::AndBit,
                    "^" => BitOperator::XorBit,
                    "~" => BitOperator::NotBit,
                }),
                Operator::Bit,
            ),
        )),
    )(input)
}
/// Parses one token from the head of `input`.
///
/// Categories are tried in a fixed order — trivia first, keywords before
/// identifiers, literals before operators. Any single character that fits
/// no category is consumed as `Token::Unknown` so the lexer always makes
/// progress.
pub(crate) fn token(input: Span) -> TokenizationResult<Token> {
    let known = alt((
        map(ignore, Token::Ignore),
        map(keyword, Token::Keyword),
        map(symbol, Token::Symbol),
        map(identifier, Token::Identifier),
        map(literal, Token::Literal),
        map(operator, Token::Operator),
    ));
    let fallback = value(Token::Unknown, take(1usize));
    context("lexer", known.or(fallback))(input)
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use std::fmt::Debug;
    use nom::Finish;
    use test_case::test_case;
    #[allow(unused_imports)]
    use crate::lexer::{ignore, Ignore};
    use crate::TokenizationResult;
    // Each case evaluates a parser eagerly and hands its result to
    // `test_parser`, which checks both the produced value and the
    // unconsumed remainder of the input.
    #[test_case(ignore("// hello world!"), Ignore::Comment("// hello world!"), None; "ignore_parser_comment")]
    #[test_case(
    ignore("//hello world!\nthis is not comment"),
    Ignore::Comment("//hello world!"),
    Some("\nthis is not comment"); "ignore_parser_comment_another_line")]
    #[test_case(
    ignore("/* this is\nmultiline comment */"),
    Ignore::MultilineComment("/* this is\nmultiline comment */"),
    None; "ignore_parser_multiline_comment"
    )]
    #[test_case(
    ignore("/* this is\nmultiline comment */ this is not"),
    Ignore::MultilineComment("/* this is\nmultiline comment */"),
    Some(" this is not"); "ignore_parser_multiline_comment_with_rest"
    )]
    #[test_case(ignore("\n\n\n"), Ignore::Newline, None; "ignore_parser_newline")]
    #[test_case(ignore(" \t"), Ignore::Whitespace, None; "ignore_parser_whitespace")]
    fn test_parser<T: PartialEq + Debug>(
        result: TokenizationResult<T>,
        expected: T,
        // `None` means the parser must consume the entire input.
        expected_rest: Option<&'static str>,
    ) {
        let (rest, data) = result.finish().unwrap();
        assert_eq!(rest, expected_rest.unwrap_or(""));
        assert_eq!(data, expected);
    }
}