use crate::visitors::{Visit, VisitMut, Visitor, VisitorMut};
use full_moon_derive::{symbols, Owned};
use nom::{
branch::alt,
bytes::complete::{tag, tag_no_case, take_till, take_while, take_while1},
character::complete::{anychar, digit1, line_ending, space1},
combinator::{opt, recognize},
multi::many_till,
sequence::{delimited, pair, preceded, tuple},
IResult,
};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use std::{borrow::Cow, cmp::Ordering, fmt, str::FromStr};
// Generates the `Symbol` enum — one variant per Lua keyword and operator —
// along with (presumably) its `FromStr`/`Display` impls and the `parse_symbol`
// parser used by `advance_symbol` below; all produced by the `symbols!` proc
// macro from `full_moon_derive`.
//
// NOTE(review): the ordering looks significant — longer symbols are listed
// before their prefixes (`..=` before `..` before `.`, `>=` before `>`),
// presumably so generated alternatives try longest matches first. Confirm
// against `full_moon_derive::symbols` before reordering.
symbols!(
    And => "and",
    Break => "break",
    Do => "do",
    ElseIf => "elseif",
    Else => "else",
    End => "end",
    False => "false",
    For => "for",
    Function => "function",
    If => "if",
    In => "in",
    Local => "local",
    Nil => "nil",
    Not => "not",
    Or => "or",
    Repeat => "repeat",
    Return => "return",
    Then => "then",
    True => "true",
    Until => "until",
    While => "while",
    PlusEqual => "+=",
    MinusEqual => "-=",
    StarEqual => "*=",
    SlashEqual => "/=",
    PercentEqual => "%=",
    CaretEqual => "^=",
    TwoDotsEqual => "..=",
    Ampersand => "&",
    ThinArrow => "->",
    Caret => "^",
    Colon => ":",
    Comma => ",",
    Ellipse => "...",
    TwoDots => "..",
    Dot => ".",
    TwoEqual => "==",
    Equal => "=",
    GreaterThanEqual => ">=",
    GreaterThan => ">",
    Hash => "#",
    LeftBrace => "{",
    LeftBracket => "[",
    LeftParen => "(",
    LessThanEqual => "<=",
    LessThan => "<",
    Minus => "-",
    Percent => "%",
    Pipe => "|",
    Plus => "+",
    QuestionMark => "?",
    RightBrace => "}",
    RightBracket => "]",
    RightParen => ")",
    Semicolon => ";",
    Slash => "/",
    Star => "*",
    TildeEqual => "~=",
);
/// The kinds of errors the tokenizer can produce.
#[derive(Clone, Debug, PartialEq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub enum TokenizerErrorType {
    /// A multi-line comment (`--[[ ... ]]`) was opened but never closed.
    UnclosedComment,
    /// A string literal was opened but never closed.
    UnclosedString,
    /// A character that cannot begin any token was encountered.
    UnexpectedToken(char),
    /// Text given to `TokenReference::symbol` was not a valid symbol.
    InvalidSymbol(String),
}

/// The type of a token, along with the source text it carries.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
#[cfg_attr(feature = "serde", serde(tag = "type"))]
pub enum TokenType<'a> {
    /// End of file; an artificial token appended by `tokens` at the very end.
    Eof,
    /// An identifier (a name).
    Identifier {
        #[cfg_attr(feature = "serde", serde(borrow))]
        identifier: Cow<'a, str>,
    },
    /// A multi-line comment, `--[=*[ ... ]=*]`.
    MultiLineComment {
        /// The number of `=` signs between the brackets (`--[==[` has 2).
        blocks: usize,
        #[cfg_attr(feature = "serde", serde(borrow))]
        comment: Cow<'a, str>,
    },
    /// A numeric literal, stored exactly as written in the source.
    Number {
        #[cfg_attr(feature = "serde", serde(borrow))]
        text: Cow<'a, str>,
    },
    /// A `#!...` line; only matched at the very start of the input (see `tokens`).
    Shebang {
        #[cfg_attr(feature = "serde", serde(borrow))]
        line: Cow<'a, str>,
    },
    /// A single-line comment starting with `--`.
    SingleLineComment {
        #[cfg_attr(feature = "serde", serde(borrow))]
        comment: Cow<'a, str>,
    },
    /// A string literal; `multi_line` is `Some(blocks)` for `[=*[ ... ]=*]` strings.
    StringLiteral {
        #[cfg_attr(feature = "serde", serde(borrow))]
        literal: Cow<'a, str>,
        #[cfg_attr(feature = "serde", serde(skip_serializing_if = "Option::is_none"))]
        multi_line: Option<usize>,
        quote_type: StringLiteralQuoteType,
    },
    /// A keyword or operator (see the `symbols!` invocation above).
    Symbol {
        symbol: Symbol,
    },
    /// A run of whitespace; `parse_whitespace` emits at most one line ending per token.
    Whitespace {
        #[cfg_attr(feature = "serde", serde(borrow))]
        characters: Cow<'a, str>,
    },
}
impl<'a> TokenType<'a> {
    /// Returns whether the token can be practically ignored.
    /// Deprecated alias of [`TokenType::is_trivia`].
    #[deprecated(since = "0.5.0", note = "Please use is_trivia instead")]
    pub fn ignore(&self) -> bool {
        self.is_trivia()
    }

    /// Returns `true` for tokens that carry no syntactic meaning:
    /// shebangs, comments, and whitespace.
    pub fn is_trivia(&self) -> bool {
        // `matches!` replaces the manual `match … => true, _ => false` form
        // (the file already uses `matches!` elsewhere).
        matches!(
            self,
            TokenType::Shebang { .. }
                | TokenType::SingleLineComment { .. }
                | TokenType::MultiLineComment { .. }
                | TokenType::Whitespace { .. }
        )
    }

    /// Returns the data-free [`TokenKind`] discriminant of this token type.
    pub fn kind(&self) -> TokenKind {
        match self {
            TokenType::Eof => TokenKind::Eof,
            TokenType::Identifier { .. } => TokenKind::Identifier,
            TokenType::MultiLineComment { .. } => TokenKind::MultiLineComment,
            TokenType::Number { .. } => TokenKind::Number,
            TokenType::Shebang { .. } => TokenKind::Shebang,
            TokenType::SingleLineComment { .. } => TokenKind::SingleLineComment,
            TokenType::StringLiteral { .. } => TokenKind::StringLiteral,
            TokenType::Symbol { .. } => TokenKind::Symbol,
            TokenType::Whitespace { .. } => TokenKind::Whitespace,
        }
    }

    /// Returns a whitespace token type made of `spaces` space characters.
    pub fn spaces(spaces: usize) -> Self {
        TokenType::Whitespace {
            characters: Cow::from(" ".repeat(spaces)),
        }
    }

    /// Returns a whitespace token type made of `tabs` tab characters.
    pub fn tabs(tabs: usize) -> Self {
        TokenType::Whitespace {
            characters: Cow::from("\t".repeat(tabs)),
        }
    }
}
/// A data-free counterpart to [`TokenType`], for when only the category matters.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TokenKind {
    /// End of file.
    Eof,
    /// An identifier.
    Identifier,
    /// A multi-line comment.
    MultiLineComment,
    /// A numeric literal.
    Number,
    /// A shebang line.
    Shebang,
    /// A single-line comment.
    SingleLineComment,
    /// A string literal.
    StringLiteral,
    /// A keyword or operator.
    Symbol,
    /// Whitespace.
    Whitespace,
}

/// A single token: its type plus the positions it starts and ends at in source.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub struct Token<'a> {
    // Position of the token's first character.
    pub(crate) start_position: Position,
    // Position just past the token (set to the cursor after consuming it in `tokens`).
    pub(crate) end_position: Position,
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub(crate) token_type: TokenType<'a>,
}
impl<'a> Token<'a> {
    /// Creates a token of the given type at the default (zeroed) position.
    pub fn new(token_type: TokenType<'a>) -> Token<'a> {
        let placeholder = Position::default();
        Token {
            start_position: placeholder,
            end_position: placeholder,
            token_type,
        }
    }

    /// The position the token starts at.
    pub fn start_position(&self) -> Position {
        self.start_position
    }

    /// The position the token ends at.
    pub fn end_position(&self) -> Position {
        self.end_position
    }

    /// The [`TokenType`] of this token.
    pub fn token_type(&self) -> &TokenType<'a> {
        &self.token_type
    }

    /// The [`TokenKind`] of this token, derived from its type.
    pub fn token_kind(&self) -> TokenKind {
        self.token_type.kind()
    }
}
impl<'a> fmt::Display for Token<'a> {
    /// Writes the token back out as it appeared in source.
    ///
    /// Writes straight into the formatter instead of building an intermediate
    /// `String` per call (the old code allocated via `to_string`/`format!`
    /// and then `fmt`-ed the result). Note: formatter width/fill flags are
    /// not applied; the token text is written verbatim.
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        use self::TokenType::*;

        match self.token_type() {
            Eof => Ok(()),
            Number { text } => formatter.write_str(text),
            Identifier { identifier } => formatter.write_str(identifier),
            MultiLineComment { blocks, comment } => {
                write!(formatter, "--[{0}[{1}]{0}]", "=".repeat(*blocks), comment)
            }
            Shebang { line } => formatter.write_str(line),
            SingleLineComment { comment } => write!(formatter, "--{}", comment),
            StringLiteral {
                literal,
                multi_line,
                quote_type,
            } => {
                if let Some(blocks) = multi_line {
                    write!(formatter, "[{0}[{1}]{0}]", "=".repeat(*blocks), literal)
                } else {
                    // Panics for `Brackets` via `StringLiteralQuoteType`'s
                    // Display, which is unreachable here: bracket strings
                    // always carry `multi_line = Some(_)`.
                    write!(formatter, "{0}{1}{0}", quote_type, literal)
                }
            }
            Symbol { symbol } => write!(formatter, "{}", symbol),
            Whitespace { characters } => formatter.write_str(characters),
        }
    }
}
impl<'a> PartialEq<Self> for Token<'a> {
    /// Two tokens are equal when they carry the same data at the same span.
    fn eq(&self, rhs: &Self) -> bool {
        self.token_type == rhs.token_type
            && self.start_position == rhs.start_position
            && self.end_position == rhs.end_position
    }
}

impl<'a> Eq for Token<'a> {}

impl<'a> Ord for Token<'a> {
    /// Tokens are ordered purely by where they start in the source.
    fn cmp(&self, other: &Self) -> Ordering {
        self.start_position.cmp(&other.start_position)
    }
}

impl<'a> PartialOrd for Token<'a> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}
impl<'ast> Visit<'ast> for Token<'ast> {
    /// Visits the token generically via `visit_token`, then dispatches to the
    /// kind-specific visitor hook.
    fn visit<V: Visitor<'ast>>(&self, visitor: &mut V) {
        visitor.visit_token(self);

        match self.token_kind() {
            // Eof and shebangs have no dedicated visitor hook here.
            TokenKind::Eof => {}
            TokenKind::Identifier => visitor.visit_identifier(self),
            TokenKind::MultiLineComment => visitor.visit_multi_line_comment(self),
            TokenKind::Number => visitor.visit_number(self),
            TokenKind::Shebang => {}
            TokenKind::SingleLineComment => visitor.visit_single_line_comment(self),
            TokenKind::StringLiteral => visitor.visit_string_literal(self),
            TokenKind::Symbol => visitor.visit_symbol(self),
            TokenKind::Whitespace => visitor.visit_whitespace(self),
        }
    }
}

impl<'ast> VisitMut<'ast> for Token<'ast> {
    /// Mutable counterpart of `visit`: the token is threaded through
    /// `visit_token` first, then through the kind-specific hook.
    fn visit_mut<V: VisitorMut<'ast>>(self, visitor: &mut V) -> Self {
        let token = visitor.visit_token(self);

        match token.token_kind() {
            // Eof and shebangs are returned unchanged.
            TokenKind::Eof => token,
            TokenKind::Identifier => visitor.visit_identifier(token),
            TokenKind::MultiLineComment => visitor.visit_multi_line_comment(token),
            TokenKind::Number => visitor.visit_number(token),
            TokenKind::Shebang => token,
            TokenKind::SingleLineComment => visitor.visit_single_line_comment(token),
            TokenKind::StringLiteral => visitor.visit_string_literal(token),
            TokenKind::Symbol => visitor.visit_symbol(token),
            TokenKind::Whitespace => visitor.visit_whitespace(token),
        }
    }
}
/// A token together with the trivia (whitespace, comments) that surrounds it.
#[derive(Clone, Debug, Owned)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub struct TokenReference<'a> {
    // Trivia appearing before the token.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub(crate) leading_trivia: Vec<Token<'a>>,
    // The token itself.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub(crate) token: Token<'a>,
    // Trivia appearing after the token.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub(crate) trailing_trivia: Vec<Token<'a>>,
}
impl<'a> TokenReference<'a> {
    /// Creates a token reference from explicit leading trivia, token, and
    /// trailing trivia.
    pub fn new(
        leading_trivia: Vec<Token<'a>>,
        token: Token<'a>,
        trailing_trivia: Vec<Token<'a>>,
    ) -> Self {
        Self {
            leading_trivia,
            token,
            trailing_trivia,
        }
    }

    /// Parses `text` as `[whitespace] symbol [whitespace]` and builds a token
    /// reference for it, with the whitespace captured as trivia.
    ///
    /// # Errors
    /// * [`TokenizerErrorType::InvalidSymbol`] when the non-whitespace middle
    ///   is not a known symbol.
    /// * [`TokenizerErrorType::UnexpectedToken`] when a non-whitespace
    ///   character follows the trailing whitespace.
    pub fn symbol(text: &str) -> Result<Self, TokenizerErrorType> {
        // Split off the ASCII-whitespace prefix.
        let after_leading = text.trim_start_matches(|c: char| c.is_ascii_whitespace());
        let leading = &text[..text.len() - after_leading.len()];

        // The symbol runs up to the next ASCII whitespace (or the end).
        let symbol_len = after_leading
            .find(|c: char| c.is_ascii_whitespace())
            .unwrap_or(after_leading.len());
        let (symbol_text, trailing) = after_leading.split_at(symbol_len);

        let symbol = Symbol::from_str(symbol_text)
            .map_err(|_| TokenizerErrorType::InvalidSymbol(symbol_text.to_owned()))?;

        // Everything after the symbol must be whitespace.
        if let Some(unexpected) = trailing.chars().find(|c| !c.is_ascii_whitespace()) {
            return Err(TokenizerErrorType::UnexpectedToken(unexpected));
        }

        Ok(Self {
            leading_trivia: vec![Token::new(TokenType::Whitespace {
                characters: Cow::Owned(leading.to_owned()),
            })],
            token: Token::new(TokenType::Symbol { symbol }),
            trailing_trivia: vec![Token::new(TokenType::Whitespace {
                characters: Cow::Owned(trailing.to_owned()),
            })],
        })
    }

    /// The token this reference wraps.
    pub fn token(&self) -> &Token<'a> {
        &self.token
    }

    /// An iterator over the trivia appearing before the token.
    pub fn leading_trivia(&self) -> impl Iterator<Item = &Token<'a>> {
        self.leading_trivia.iter()
    }

    /// An iterator over the trivia appearing after the token.
    pub fn trailing_trivia(&self) -> impl Iterator<Item = &Token<'a>> {
        self.trailing_trivia.iter()
    }

    /// Returns a copy of this reference with `token` swapped in; the trivia
    /// is cloned from `self`.
    pub fn with_token(&self, token: Token<'a>) -> Self {
        Self {
            leading_trivia: self.leading_trivia.clone(),
            trailing_trivia: self.trailing_trivia.clone(),
            token,
        }
    }
}
// Allows an `&TokenReference` to be borrowed as its inner `Token`
// (e.g. for keyed collection lookups).
impl<'a> std::borrow::Borrow<Token<'a>> for &TokenReference<'a> {
    fn borrow(&self) -> &Token<'a> {
        &**self
    }
}

// NOTE(review): `Deref` to `Token` lets all `Token` methods be called directly
// on a `TokenReference`; the comparison impls further down rely on this
// coercion (so trivia never takes part in comparisons).
impl<'a> std::ops::Deref for TokenReference<'a> {
    type Target = Token<'a>;

    fn deref(&self) -> &Self::Target {
        &self.token
    }
}
impl<'a> fmt::Display for TokenReference<'a> {
    /// Writes the token along with all of its surrounding trivia.
    ///
    /// Each piece is formatted directly into the formatter rather than
    /// allocating a `String` per token via `to_string` as before.
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        for trivia in &self.leading_trivia {
            write!(formatter, "{}", trivia)?;
        }

        write!(formatter, "{}", self.token)?;

        for trivia in &self.trailing_trivia {
            write!(formatter, "{}", trivia)?;
        }

        Ok(())
    }
}
// NOTE(review): all comparisons below delegate to the wrapped `Token` through
// `Deref`, so leading/trailing trivia does not participate in equality or
// ordering of token references.
impl<'a> PartialEq<Self> for TokenReference<'a> {
    fn eq(&self, other: &Self) -> bool {
        (**self).eq(other)
    }
}

impl<'a> Eq for TokenReference<'a> {}

impl<'a> Ord for TokenReference<'a> {
    fn cmp(&self, other: &Self) -> Ordering {
        (**self).cmp(&**other)
    }
}

impl<'a> PartialOrd for TokenReference<'a> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl<'ast> Visit<'ast> for TokenReference<'ast> {
    fn visit<V: Visitor<'ast>>(&self, visitor: &mut V) {
        // `self` appears to coerce to the inner `Token` here (via `Deref`),
        // so the token is reported to `visit_token` both now and again inside
        // `self.token.visit` below. NOTE(review): asymmetric with `VisitMut`,
        // which calls `visit_token_reference` instead — confirm intentional.
        visitor.visit_token(self);

        // Eof is only surfaced at the token-reference level, so hook it here.
        if matches!(self.token().token_kind(), TokenKind::Eof) {
            visitor.visit_eof(self);
        }

        self.leading_trivia.visit(visitor);
        self.token.visit(visitor);
        self.trailing_trivia.visit(visitor);
    }
}

impl<'ast> VisitMut<'ast> for TokenReference<'ast> {
    fn visit_mut<V: VisitorMut<'ast>>(self, visitor: &mut V) -> Self {
        let mut token_reference = visitor.visit_token_reference(self);

        if matches!(token_reference.token().token_kind(), TokenKind::Eof) {
            token_reference = visitor.visit_eof(token_reference);
        }

        // Trivia and the token itself are replaced by their visited versions.
        token_reference.leading_trivia = token_reference.leading_trivia.visit_mut(visitor);
        token_reference.token = token_reference.token.visit_mut(visitor);
        token_reference.trailing_trivia = token_reference.trailing_trivia.visit_mut(visitor);

        token_reference
    }
}
/// A position in source code.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub struct Position {
    // 0-based byte offset from the start of the source (see `tokens`).
    pub(crate) bytes: usize,
    // 1-based column within the current line, counted in characters.
    pub(crate) character: usize,
    // 1-based line number.
    pub(crate) line: usize,
}

impl Position {
    /// The byte offset from the beginning of the source.
    pub fn bytes(self) -> usize {
        self.bytes
    }

    /// The 1-based column (in characters) within the line.
    pub fn character(self) -> usize {
        self.character
    }

    /// The 1-based line number.
    pub fn line(self) -> usize {
        self.line
    }
}
impl Ord for Position {
    /// The byte offset alone fully determines ordering within one source.
    fn cmp(&self, other: &Self) -> Ordering {
        usize::cmp(&self.bytes, &other.bytes)
    }
}

impl PartialOrd for Position {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}
/// The result of one successful advancer call: how far to move the cursor
/// (measured in *characters*, not bytes — see the `advance!` macro inside
/// `tokens`) and the token type that was produced.
#[derive(Clone, Debug, PartialEq)]
struct TokenAdvancement<'a> {
    pub advance: usize,
    pub token_type: TokenType<'a>,
}
/// The quote style of a string literal.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub enum StringLiteralQuoteType {
    /// A bracketed (multi-line) string, e.g. `[[hello]]`.
    Brackets,
    /// A double-quoted string.
    Double,
    /// A single-quoted string.
    Single,
}

// The original `impl<'a>` declared a lifetime parameter that was never used;
// it has been dropped.
impl fmt::Display for StringLiteralQuoteType {
    /// Writes the quote character itself.
    ///
    /// # Panics
    /// Panics for [`StringLiteralQuoteType::Brackets`]: bracketed strings are
    /// rendered with their block count by `Token`'s `Display` impl, so this
    /// variant is never formatted directly.
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        match self {
            StringLiteralQuoteType::Brackets => unreachable!(),
            StringLiteralQuoteType::Double => formatter.write_str("\""),
            StringLiteralQuoteType::Single => formatter.write_str("'"),
        }
    }
}
/// Shared result type of the `advance_*` functions: `Ok(Some(_))` when a token
/// was matched, `Ok(None)` when this advancer simply doesn't apply here, and
/// `Err(_)` when the input is malformed (e.g. an unclosed string or comment).
type Advancement<'a> = Result<Option<TokenAdvancement<'a>>, TokenizerErrorType>;
/// Parses a single-line comment: `--` followed by the rest of the line.
#[inline]
fn parse_single_line_comment(code: &str) -> IResult<&str, &str> {
    // Consume everything up to, but not including, the line ending.
    let rest_of_line = take_till(|c: char| c == '\r' || c == '\n');
    preceded(tag("--"), rest_of_line)(code)
}

/// Parses the opener of a multi-line comment (`--[`, a run of `=`s, `[`),
/// returning the `=` run so the closer can be matched against it.
#[inline]
fn parse_multi_line_comment_start(code: &str) -> IResult<&str, &str> {
    let equals_run = take_while(|c: char| c == '=');
    delimited(tag("--["), equals_run, tag("["))(code)
}
/// Parses the body of a multi-line comment, given the run of `=` signs from
/// its opener. The recognized slice runs through (and includes) the closer.
#[inline]
fn parse_multi_line_comment_body<'a>(
    code: &'a str,
    block_count: &'a str,
) -> IResult<&'a str, &'a str> {
    recognize(many_till(
        anychar,
        // The closer is `]`, the same number of `=`s as the opener, then `]`.
        recognize(tuple((tag("]"), tag(block_count), tag("]")))),
    ))(code)
}

/// Tries to take a comment (multi-line first, then single-line) from the
/// front of `code`.
fn advance_comment(code: &str) -> Advancement {
    if let Ok((code, block_count)) = parse_multi_line_comment_start(code) {
        return match parse_multi_line_comment_body(code, block_count) {
            Ok((_, comment)) => {
                let blocks = block_count.chars().count();
                // The parsed body includes the `]=*]` closer; trim it off.
                // Byte arithmetic is safe because `]` and `=` are single-byte.
                let comment = &comment[..(comment.len() - "]]".len() - block_count.len())];

                Ok(Some(TokenAdvancement {
                    // Characters consumed: the body, the `=`s on both ends,
                    // and the fixed `--[[` / `]]` delimiters (6 chars).
                    advance: comment.chars().count() + blocks * 2 + "--[[]]".chars().count(),
                    token_type: TokenType::MultiLineComment {
                        blocks,
                        comment: Cow::from(comment),
                    },
                }))
            }

            // The opener matched, but no closer appeared before end of input.
            Err(_) => Err(TokenizerErrorType::UnclosedComment),
        };
    }

    match parse_single_line_comment(code) {
        Ok((_, comment)) => Ok(Some(TokenAdvancement {
            // `--` plus the comment text.
            advance: 2 + comment.chars().count(),
            token_type: TokenType::SingleLineComment {
                comment: Cow::from(comment),
            },
        })),

        Err(_) => Ok(None),
    }
}
/// Parses a hexadecimal literal: `0x`/`0X` plus hex digits. With the `roblox`
/// feature, `_` digit separators are also accepted.
fn parse_hex_number(code: &str) -> IResult<&str, &str> {
    recognize(pair(
        tag_no_case("0x"),
        // `#[cfg]` selects which digit matcher is compiled in.
        #[cfg(not(feature = "roblox"))]
        take_while1(|c: char| c.is_digit(16)),
        #[cfg(feature = "roblox")]
        take_while1(|c: char| c.is_digit(16) || c == '_'),
    ))(code)
}

/// Parses a run of decimal digits. (The historical "seperator" spelling is
/// kept as-is.)
#[cfg(not(feature = "roblox"))]
fn parse_digit_with_seperator(code: &str) -> IResult<&str, &str> {
    digit1(code)
}

/// Parses a run of decimal digits, allowing `_` separators after the first
/// digit (roblox feature).
#[cfg(feature = "roblox")]
fn parse_digit_with_seperator(code: &str) -> IResult<&str, &str> {
    recognize(pair(
        digit1,
        opt(take_while1(|c: char| c.is_digit(10) || c == '_')),
    ))(code)
}

/// Parses a fractional literal whose integer part may be omitted, with an
/// optional exponent: `.5`, `3.14`, `1.5e-3`.
fn parse_no_int_fractional_number(code: &str) -> IResult<&str, &str> {
    recognize(pair(
        opt(parse_digit_with_seperator),
        pair(
            pair(tag("."), parse_digit_with_seperator),
            opt(pair(
                pair(tag_no_case("e"), opt(alt((tag("-"), tag("+"))))),
                parse_digit_with_seperator,
            )),
        ),
    ))(code)
}

/// Parses an integer-led literal with optional fraction and exponent:
/// `123`, `123.45`, `1e5`, `2.5e+10`.
fn parse_basic_number(code: &str) -> IResult<&str, &str> {
    recognize(pair(
        parse_digit_with_seperator,
        pair(
            opt(pair(tag("."), parse_digit_with_seperator)),
            opt(pair(
                pair(tag_no_case("e"), opt(alt((tag("-"), tag("+"))))),
                parse_digit_with_seperator,
            )),
        ),
    ))(code)
}

/// Stub when the `roblox` feature is off: always fails so `alt` in
/// `parse_number` falls through to the next alternative.
#[cfg(not(feature = "roblox"))]
fn parse_roblox_number(_: &str) -> IResult<&str, &str> {
    Err(nom::Err::Error((
        "roblox feature not enabled",
        nom::error::ErrorKind::Alt,
    )))
}

/// Parses a binary literal (`0b1010`, `_` separators allowed) — roblox only.
#[cfg(feature = "roblox")]
fn parse_roblox_number(code: &str) -> IResult<&str, &str> {
    recognize(pair(
        tag_no_case("0b"),
        take_while1(|x: char| x == '0' || x == '1' || x == '_'),
    ))(code)
}

/// Parses any numeric literal. Order matters: the prefixed forms (`0b`,
/// `0x`) must be tried before plain decimal so `0x…` isn't lexed as `0`.
fn parse_number(code: &str) -> IResult<&str, &str> {
    alt((
        parse_roblox_number,
        parse_hex_number,
        parse_basic_number,
        parse_no_int_fractional_number,
    ))(code)
}
/// Tries to take a number token from the front of `code`.
/// A non-match is `Ok(None)`; numbers never produce a tokenizer error.
fn advance_number(code: &str) -> Advancement {
    if let Ok((_, text)) = parse_number(code) {
        Ok(Some(TokenAdvancement {
            advance: text.chars().count(),
            token_type: TokenType::Number {
                text: Cow::from(text),
            },
        }))
    } else {
        Ok(None)
    }
}
/// Parses an identifier: an ASCII letter or underscore, followed by any
/// number of ASCII alphanumerics/underscores.
#[inline]
fn parse_identifier(code: &str) -> IResult<&str, &str> {
    let head = take_while1(|c: char| c.is_ascii_alphabetic() || c == '_');
    let tail = take_while(|c: char| c.is_ascii_alphanumeric() || c == '_');
    recognize(pair(head, tail))(code)
}

/// Tries to take an identifier token from the front of `code`. Keywords are
/// never reached here: `tokens` runs `advance_symbol` first.
fn advance_identifier(code: &str) -> Advancement {
    if let Ok((_, name)) = parse_identifier(code) {
        Ok(Some(TokenAdvancement {
            advance: name.chars().count(),
            token_type: TokenType::Identifier {
                identifier: Cow::from(name),
            },
        }))
    } else {
        Ok(None)
    }
}
/// Parses a shebang line: `#!` through (and including) the line ending.
/// NOTE(review): requires an actual line ending, so a shebang on the final
/// line of input with no trailing newline does not match — confirm intended.
#[inline]
fn parse_shebang(code: &str) -> IResult<&str, &str> {
    recognize(pair(tag("#!"), many_till(anychar, line_ending)))(code)
}
/// Tries to take a shebang token (including its newline) from the front of
/// `code`.
fn advance_shebang(code: &str) -> Advancement {
    if let Ok((_, line)) = parse_shebang(code) {
        Ok(Some(TokenAdvancement {
            advance: line.chars().count(),
            token_type: TokenType::Shebang {
                line: Cow::from(line),
            },
        }))
    } else {
        Ok(None)
    }
}
/// Parses the opener of a multi-line string: `[`, a run of `=`s, `[`.
#[inline]
fn parse_multi_line_string_start(code: &str) -> IResult<&str, &str> {
    delimited(tag("["), take_while(|x: char| x == '='), tag("["))(code)
}

/// Parses the body of a multi-line string; the recognized slice includes the
/// `]=*]` closer matching `block_count`.
#[inline]
fn parse_multi_line_string_body<'a>(
    code: &'a str,
    block_count: &'a str,
) -> IResult<&'a str, &'a str> {
    recognize(many_till(
        anychar,
        recognize(tuple((tag("]"), tag(block_count), tag("]")))),
    ))(code)
}

/// Tries to take a string literal from the front of `code`: bracketed
/// multi-line strings first, then single/double quoted strings.
fn advance_quote(code: &str) -> Advancement {
    if let Ok((code, block_count)) = parse_multi_line_string_start(code) {
        return match parse_multi_line_string_body(code, block_count) {
            Ok((_, body)) => {
                let blocks = block_count.chars().count();
                // Trim the closer; byte math is safe since `]`/`=` are 1 byte.
                let body = &body[..(body.len() - "]]".len() - block_count.len())];

                Ok(Some(TokenAdvancement {
                    // Body chars + the `=`s on both ends + the `[[` / `]]`.
                    advance: body.chars().count() + blocks * 2 + "[[]]".chars().count(),
                    token_type: TokenType::StringLiteral {
                        multi_line: Some(blocks),
                        literal: Cow::from(body),
                        quote_type: StringLiteralQuoteType::Brackets,
                    },
                }))
            }

            Err(_) => Err(TokenizerErrorType::UnclosedString),
        };
    }

    let quote = if code.starts_with('"') {
        '"'
    } else if code.starts_with('\'') {
        '\''
    } else {
        return Ok(None);
    };

    // Scan for the closing quote, tracking both the character index (the
    // tokenizer advances in characters) and the byte index (slicing must use
    // byte offsets).
    let mut end = None;
    let mut escape = false;

    // `.skip(1)` steps over the opening quote.
    for (char_index, (byte_index, character)) in code.char_indices().enumerate().skip(1) {
        if character == '\\' {
            // Toggling handles backslash runs: `\\` cancels the escape.
            escape = !escape;
        } else if character == quote {
            if escape {
                escape = false;
            } else {
                end = Some((char_index, byte_index));
                break;
            }
        } else if (character == '\r' || character == '\n') && !escape {
            // A bare newline before the closing quote is an error; a `\`
            // immediately before the newline (line continuation) is allowed.
            return Err(TokenizerErrorType::UnclosedString);
        } else {
            escape = false;
        }
    }

    if let Some((char_index, byte_index)) = end {
        Ok(Some(TokenAdvancement {
            // +1 to include the closing quote.
            advance: char_index + 1,
            token_type: TokenType::StringLiteral {
                // The text between the quotes; escapes are left unprocessed.
                literal: Cow::from(&code[1..byte_index]),
                multi_line: None,
                quote_type: match quote {
                    '"' => StringLiteralQuoteType::Double,
                    '\'' => StringLiteralQuoteType::Single,
                    // `quote` is only ever set to `"` or `'` above.
                    _ => unreachable!(),
                },
            },
        }))
    } else {
        Err(TokenizerErrorType::UnclosedString)
    }
}
/// Tries to take a symbol (keyword or operator) token from the front of
/// `code`, using the `parse_symbol` parser generated by `symbols!`.
fn advance_symbol(code: &str) -> Advancement {
    if let Ok((_, matched)) = parse_symbol(code) {
        Ok(Some(TokenAdvancement {
            advance: matched.chars().count(),
            token_type: TokenType::Symbol {
                // `parse_symbol` only matches text `Symbol::from_str` accepts.
                symbol: Symbol::from_str(matched).unwrap(),
            },
        }))
    } else {
        Ok(None)
    }
}
/// Parses one whitespace token: an optional line ending followed by a run of
/// spaces/tabs, or a lone line ending. Each whitespace token therefore
/// contains at most one newline, which presumably keeps the per-character
/// line/column tracking in `tokens` simple.
#[inline]
fn parse_whitespace(code: &str) -> IResult<&str, &str> {
    alt((recognize(pair(opt(line_ending), space1)), line_ending))(code)
}
/// Tries to take a whitespace token from the front of `code`.
fn advance_whitespace(code: &str) -> Advancement {
    if let Ok((_, characters)) = parse_whitespace(code) {
        Ok(Some(TokenAdvancement {
            advance: characters.chars().count(),
            token_type: TokenType::Whitespace {
                characters: Cow::from(characters),
            },
        }))
    } else {
        Ok(None)
    }
}
/// An error produced while tokenizing, with the position it occurred at.
#[derive(Clone, Debug, PartialEq)]
#[cfg_attr(feature = "serde", derive(Deserialize, Serialize))]
pub struct TokenizerError {
    // What went wrong.
    error: TokenizerErrorType,
    // Where in the source the error was detected.
    position: Position,
}
impl fmt::Display for TokenizerError {
    /// Formats as `<description> at line <line>, column <column>`.
    ///
    /// Writes each case directly instead of allocating an intermediate
    /// `String` for every arm as the old code did; output is unchanged.
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        match &self.error {
            TokenizerErrorType::UnclosedComment => formatter.write_str("unclosed comment")?,
            TokenizerErrorType::UnclosedString => formatter.write_str("unclosed string")?,
            TokenizerErrorType::UnexpectedToken(character) => {
                write!(formatter, "unexpected character {}", character)?
            }
            TokenizerErrorType::InvalidSymbol(symbol) => {
                write!(formatter, "invalid symbol {}", symbol)?
            }
        }

        write!(
            formatter,
            " at line {}, column {}",
            self.position.line, self.position.character
        )
    }
}

impl std::error::Error for TokenizerError {}
pub fn tokens<'a>(code: &'a str) -> Result<Vec<Token<'a>>, TokenizerError> {
let mut tokens = Vec::new();
let mut position = Position {
bytes: 0,
character: 1,
line: 1,
};
let mut next_is_new_line = false;
macro_rules! advance {
($function:ident) => {
match $function(&code[position.bytes..]) {
Ok(Some(advancement)) => {
let start_position = position;
for character in code[position.bytes..].chars().take(advancement.advance) {
if next_is_new_line {
next_is_new_line = false;
position.line += 1;
position.character = 1;
}
if character == '\n' {
next_is_new_line = true;
} else {
position.character += 1;
}
position.bytes += character.len_utf8();
}
tokens.push(Token {
start_position,
end_position: position,
token_type: advancement.token_type,
});
continue;
}
Ok(None) => {}
Err(error) => {
return Err(TokenizerError { error, position });
}
};
};
}
for _ in 0..1 {
advance!(advance_shebang);
}
while code.bytes().count() > position.bytes {
advance!(advance_whitespace);
advance!(advance_comment);
advance!(advance_number);
advance!(advance_quote);
advance!(advance_symbol);
advance!(advance_identifier);
return Err(TokenizerError {
error: TokenizerErrorType::UnexpectedToken(
code.chars()
.nth(position.character - 1)
.expect("text overflow while giving unexpected token error"),
),
position,
});
}
tokens.push(Token {
start_position: position,
end_position: position,
token_type: TokenType::Eof,
});
Ok(tokens)
}
#[cfg(test)]
mod tests {
    use crate::tokenizer::*;
    use pretty_assertions::assert_eq;

    // Asserts that `$advancer` returns `$result` for `$code`, then
    // cross-checks the full tokenizer: on success, the first token from
    // `tokens($code)` must have the same token type; on error, `tokens` must
    // fail with the same error type.
    macro_rules! test_advancer {
        ($advancer:ident($code:tt), $result:expr) => {
            assert_eq!($advancer($code), $result);

            let result: Advancement = $result;
            match result {
                Ok(Some(token)) => {
                    let tokens = tokens($code).expect("couldn't tokenize");
                    let first_token = &tokens.get(0).expect("tokenized response is empty");
                    assert_eq!(*first_token.token_type(), token.token_type);
                }

                Err(advancement_error) => {
                    if let Err(TokenizerError { error, .. }) = tokens($code) {
                        assert_eq!(error, advancement_error);
                    }
                }

                _ => {}
            };
        };
    }

    #[test]
    fn test_advance_comment() {
        test_advancer!(
            advance_comment("-- hello world"),
            Ok(Some(TokenAdvancement {
                advance: 14,
                token_type: TokenType::SingleLineComment {
                    comment: Cow::from(" hello world"),
                },
            }))
        );

        test_advancer!(
            advance_comment("--[[ hello world ]]"),
            Ok(Some(TokenAdvancement {
                advance: 19,
                token_type: TokenType::MultiLineComment {
                    blocks: 0,
                    comment: Cow::from(" hello world "),
                },
            }))
        );

        test_advancer!(
            advance_comment("--[=[ hello world ]=]"),
            Ok(Some(TokenAdvancement {
                advance: 21,
                token_type: TokenType::MultiLineComment {
                    blocks: 1,
                    comment: Cow::from(" hello world "),
                },
            }))
        );
    }

    #[test]
    fn test_advance_numbers() {
        test_advancer!(
            advance_number("213"),
            Ok(Some(TokenAdvancement {
                advance: 3,
                token_type: TokenType::Number {
                    text: Cow::from("213"),
                },
            }))
        );

        test_advancer!(
            advance_number("123.45"),
            Ok(Some(TokenAdvancement {
                advance: 6,
                token_type: TokenType::Number {
                    text: Cow::from("123.45"),
                },
            }))
        );
    }

    // Binary literals only exist behind the `roblox` feature.
    #[test]
    #[cfg_attr(not(feature = "roblox"), ignore)]
    fn test_advance_binary_literals() {
        test_advancer!(
            advance_number("0b101"),
            Ok(Some(TokenAdvancement {
                advance: 5,
                token_type: TokenType::Number {
                    text: Cow::from("0b101"),
                },
            }))
        );
    }

    #[test]
    fn test_advance_identifier() {
        test_advancer!(
            advance_identifier("hello"),
            Ok(Some(TokenAdvancement {
                advance: 5,
                token_type: TokenType::Identifier {
                    identifier: Cow::from("hello"),
                },
            }))
        );

        test_advancer!(
            advance_identifier("hello world"),
            Ok(Some(TokenAdvancement {
                advance: 5,
                token_type: TokenType::Identifier {
                    identifier: Cow::from("hello"),
                },
            }))
        );

        test_advancer!(
            advance_identifier("hello___"),
            Ok(Some(TokenAdvancement {
                advance: 8,
                token_type: TokenType::Identifier {
                    identifier: Cow::from("hello___"),
                },
            }))
        );

        // Identifiers cannot begin with a digit.
        test_advancer!(advance_identifier("123"), Ok(None));
    }

    #[test]
    fn test_advance_symbols() {
        test_advancer!(
            advance_symbol("local"),
            Ok(Some(TokenAdvancement {
                advance: 5,
                token_type: TokenType::Symbol {
                    symbol: Symbol::Local
                },
            }))
        );
    }

    #[test]
    fn test_advance_whitespace() {
        test_advancer!(
            advance_whitespace("\t \n"),
            Ok(Some(TokenAdvancement {
                advance: 3,
                token_type: TokenType::Whitespace {
                    characters: Cow::from("\t "),
                },
            }))
        );

        test_advancer!(
            advance_whitespace("\thello"),
            Ok(Some(TokenAdvancement {
                advance: 1,
                token_type: TokenType::Whitespace {
                    characters: Cow::from("\t"),
                },
            }))
        );

        // Whitespace stops *before* a newline that follows spaces/tabs.
        test_advancer!(
            advance_whitespace("\t\t\nhello"),
            Ok(Some(TokenAdvancement {
                advance: 2,
                token_type: TokenType::Whitespace {
                    characters: Cow::from("\t\t"),
                },
            }))
        );
    }

    #[test]
    fn test_advance_quote() {
        test_advancer!(
            advance_quote("\"hello\""),
            Ok(Some(TokenAdvancement {
                advance: 7,
                token_type: TokenType::StringLiteral {
                    literal: Cow::from("hello"),
                    multi_line: None,
                    quote_type: StringLiteralQuoteType::Double,
                },
            }))
        );

        // A backslash-escaped newline is a legal line continuation.
        test_advancer!(
            advance_quote("\"hello\\\nworld\""),
            Ok(Some(TokenAdvancement {
                advance: 14,
                token_type: TokenType::StringLiteral {
                    literal: Cow::from("hello\\\nworld"),
                    multi_line: None,
                    quote_type: StringLiteralQuoteType::Double,
                },
            }))
        );

        test_advancer!(
            advance_quote("\"hello"),
            Err(TokenizerErrorType::UnclosedString)
        );
    }

    // A symbol must not greedily match inside a longer identifier
    // ("in" inside "index"), while multi-char operators still match whole.
    #[test]
    fn test_symbols_within_symbols() {
        test_advancer!(advance_symbol("index"), Ok(None));

        test_advancer!(
            advance_symbol("<="),
            Ok(Some(TokenAdvancement {
                advance: 2,
                token_type: TokenType::Symbol {
                    symbol: Symbol::LessThanEqual,
                },
            }))
        );
    }

    #[test]
    fn test_advance_shebang() {
        test_advancer!(
            advance_shebang("#!/usr/bin/env lua\n"),
            Ok(Some(TokenAdvancement {
                advance: 19,
                token_type: TokenType::Shebang {
                    line: "#!/usr/bin/env lua\n".into()
                }
            }))
        );

        // Shebangs must start at the very first byte.
        test_advancer!(advance_shebang(" #!/usr/bin/env lua\n"), Ok(None));
    }

    // A newline token ends on the same line it started on; the line counter
    // only bumps for the *following* token.
    #[test]
    fn test_new_line_on_same_line() {
        assert_eq!(
            tokens("\n").unwrap()[0],
            Token {
                start_position: Position {
                    bytes: 0,
                    character: 1,
                    line: 1,
                },

                end_position: Position {
                    bytes: 1,
                    character: 1,
                    line: 1,
                },

                token_type: TokenType::Whitespace {
                    characters: Cow::from("\n")
                },
            }
        );
    }

    // Regression inputs from fuzzing; only checks that tokenizing them
    // doesn't panic.
    #[test]
    fn test_fuzzer() {
        let _ = tokens("*ա");
        let _ = tokens("̹(");
        let _ = tokens("¹;");
    }
}