use crate::parser::{
error::ErrorContext,
lexing::{Keyword, TokenSpan},
};
use super::{
error::{LexingError, SpannedLexingError},
AttributeKeyword, Token, TokenKind,
};
/// A cursor over the input that hands out one token at a time.
pub(super) struct Tokenizer<'a> {
    /// Byte offset of `remaining_text` within `original_text`.
    current_index: usize,
    /// The part of the input that has not been consumed yet.
    remaining_text: &'a str,
    /// The full input, kept around for error reporting.
    original_text: &'a str,
}
impl<'a> Tokenizer<'a> {
pub(super) fn new(input: &'a str) -> Self {
Self {
current_index: 0,
remaining_text: input,
original_text: input,
}
}
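    /// Returns the next token, or `None` once the input is exhausted.
    /// Whitespace and block comments are skipped first; the returned span
    /// holds byte offsets into the original input.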
pub(super) fn next_token(&mut self) -> Result<Option<Token>, SpannedLexingError> {
self.skip_ignored_tokens();
if self.remaining_text.is_empty() {
Ok(None)
} else {
let start = self.current_index;
            let (token_kind, length) = self.get_next_tokenkind().map_err(|e| {
                let context = ErrorContext::from_index(start, self.original_text);
                SpannedLexingError { source: e, context }
            })?;
            self.chomp(length);
            let end = self.current_index;
Ok(Some(Token {
span: TokenSpan { start, end },
kind: token_kind,
}))
}
}
fn get_next_tokenkind(&mut self) -> Result<(TokenKind, usize), LexingError> {
let next = match self.remaining_text.chars().next() {
Some(c) => c,
None => return Err(LexingError::UnexpectedEOF),
};
        // Every arm reports the token plus its length in bytes.
        let (tok, length) = match next {
'(' => (TokenKind::CurvedBracketOpen, 1),
')' => (TokenKind::CurvedBracketClose, 1),
'{' => (TokenKind::CurlyBracketOpen, 1),
'}' => (TokenKind::CurlyBracketClose, 1),
'<' => (TokenKind::AngledBracketOpen, 1),
'>' => (TokenKind::AngledBracketClose, 1),
'[' => (TokenKind::SquareBracketOpen, 1),
']' => (TokenKind::SquareBracketClose, 1),
':' => (TokenKind::Colon, 1),
';' => (TokenKind::Semicolon, 1),
',' => (TokenKind::Comma, 1),
'#' => (TokenKind::PoundSign, 1),
'=' => (TokenKind::EqualsSign, 1),
'"' => tokenize_literal_string(self.remaining_text, "\"")?,
'r' => try_to_tokenize_raw_literal_string(self.remaining_text)?,
'-' => tokenize_arrow(self.remaining_text)?,
'/' => tokenize_comment(self.remaining_text)?,
c if c.is_alphabetic() => tokenize_ident(self.remaining_text)?,
'_' => tokenize_ident(self.remaining_text)?,
other => return Err(LexingError::UnknownCharacter(other)),
};
Ok((tok, length))
}
    /// Skips whitespace and block comments until neither makes progress.
    fn skip_ignored_tokens(&mut self) {
loop {
let ws = self.skip_whitespace();
let comments = self.skip_block_comment();
if ws + comments == 0 {
return;
}
}
}
    /// Consumes leading whitespace, returning the number of bytes skipped.
    fn skip_whitespace(&mut self) -> usize {
        // `take_while` reports an error when nothing matches, which here
        // just means there is no whitespace to skip.
        let skipped = match take_while(self.remaining_text, |ch| ch.is_whitespace()) {
            Ok((_, bytes_skipped)) => bytes_skipped,
            Err(_) => 0,
        };
        self.chomp(skipped);
        skipped
    }
    /// Consumes a `/* ... */` block comment if one starts here, returning
    /// the number of bytes skipped (0 if there is no block comment).
    fn skip_block_comment(&mut self) -> usize {
        let pairs = [("/*", "*/")];
        let src = self.remaining_text;
        for &(opener, terminator) in &pairs {
            if src.starts_with(opener) {
                let leftovers = skip_until(src, terminator);
                let skip = src.len() - leftovers.len();
                self.chomp(skip);
                return skip;
            }
        }
        0
    }
    /// Advances past the next `bytes` bytes of the remaining input. The
    /// count must land on a UTF-8 character boundary.
    fn chomp(&mut self, bytes: usize) {
        self.remaining_text = &self.remaining_text[bytes..];
        self.current_index += bytes;
    }
}
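/// A minimal driver sketch, not part of the original API: drains the
/// tokenizer into a `Vec`, stopping at the first lexing error. The name
/// `tokenize_all` is hypothetical.
#[allow(dead_code)]
fn tokenize_all(input: &str) -> Result<Vec<Token>, SpannedLexingError> {
    let mut tokenizer = Tokenizer::new(input);
    let mut tokens = Vec::new();
    while let Some(token) = tokenizer.next_token()? {
        tokens.push(token);
    }
    Ok(tokens)
}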
/// Reports whether `text` starts with a line break.
fn end_of_line(text: &str) -> bool {
    matches!(text.chars().next(), Some('\n' | '\r'))
}
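/// Tokenizes a `//` line comment. The returned length covers the slashes
/// and the comment text, but not the line break that ends it.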
fn tokenize_comment(text: &str) -> Result<(TokenKind, usize), LexingError> {
    if !text.starts_with("//") {
        return Err(LexingError::ExpectedComment);
    }
    let text = &text[2..];
    if text.is_empty() || end_of_line(text) {
        // An empty comment: only the two slashes are consumed; any line
        // break is skipped later as ordinary whitespace.
        Ok((TokenKind::Comment(String::new()), 2))
    } else {
        let (comment, bytes_read) = take_while(text, |ch| ch != '\n' && ch != '\r')?;
        let comment = comment.trim_end();
        Ok((TokenKind::Comment(comment.to_owned()), bytes_read + 2))
    }
}
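/// Distinguishes the three things an `r` can start: a raw string like
/// `r"no escapes"`, a hash-delimited raw string like `r#"has a " inside"#`,
/// or an ordinary identifier such as `rate`.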
fn try_to_tokenize_raw_literal_string(text: &str) -> Result<(TokenKind, usize), LexingError> {
    let text_without_r = &text[1..];
    match text_without_r.chars().next() {
        Some('#') => {
            let (hashes, bytes_read) = take_while(text_without_r, |ch| ch == '#')?;
            // The closing sequence is a quote followed by the same hashes.
            let delimiter = format!("\"{}", hashes);
            let (token, length) =
                tokenize_literal_string(&text_without_r[bytes_read..], &delimiter)?;
            Ok((token, length + 1 + bytes_read))
        }
        Some('"') => {
            let (token, length) = tokenize_literal_string(text_without_r, "\"")?;
            Ok((token, length + 1))
        }
        // Anything else, including a lone `r` at the end of the input,
        // is an ordinary identifier.
        _ => tokenize_ident(text),
    }
}
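/// Tokenizes a string literal. `text` must start with a quote; `delimiter`
/// is the full closing sequence, e.g. `"` for a plain string or `"##` for
/// the raw string `r##"..."##`.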
fn tokenize_literal_string(text: &str, delimiter: &str) -> Result<(TokenKind, usize), LexingError> {
    assert_eq!(&text[..1], "\"");
    let text_without_quote = &text[1..];
    if text_without_quote.starts_with(delimiter) {
        // An empty literal: the opening quote plus the whole delimiter.
        Ok((TokenKind::StringLiteral(String::new()), 1 + delimiter.len()))
    } else {
        // One predicate per delimiter character, matched on consecutive
        // characters of the input.
        let mut predicates: Vec<_> = delimiter.chars().map(|ch| move |ch2| ch2 == ch).collect();
        let (literal, bytes_read) =
            take_until_successive_match(text_without_quote, &mut predicates)?;
        Ok((TokenKind::StringLiteral(literal.to_owned()), bytes_read + 1))
    }
}
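/// Tokenizes a keyword, attribute keyword, or plain identifier.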
fn tokenize_ident(text: &str) -> Result<(TokenKind, usize), LexingError> {
    let (got, bytes_read) = take_while(text, |ch| ch == '_' || ch.is_alphanumeric())?;
let tokenkind = match got {
"mod" => TokenKind::Keyword(Keyword::r#mod),
"fn" => TokenKind::Keyword(Keyword::r#fn),
"struct" => TokenKind::Keyword(Keyword::r#struct),
"enum" => TokenKind::Keyword(Keyword::r#enum),
"derive" => TokenKind::AttributeKeyword(AttributeKeyword::derive),
"doc" => TokenKind::AttributeKeyword(AttributeKeyword::doc),
"error" => TokenKind::AttributeKeyword(AttributeKeyword::error),
other => TokenKind::Identifier(other.to_string()),
};
    Ok((tokenkind, bytes_read))
}
fn tokenize_arrow(text: &str) -> Result<(TokenKind, usize), LexingError> {
    // Only the two-character `->` is accepted; a lone `-` is an error.
    if text.starts_with("->") {
        Ok((TokenKind::Arrow, 2))
    } else {
        Err(LexingError::ExpectedArrow)
    }
}
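/// Takes characters from the front of `data` while `pred` holds, returning
/// the matched slice and its length in bytes. Matching nothing at all is
/// reported as an error rather than an empty slice.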
fn take_while<F>(data: &str, mut pred: F) -> Result<(&str, usize), LexingError>
where
    F: FnMut(char) -> bool,
{
    let mut bytes_taken = 0;
    for ch in data.chars() {
        if !pred(ch) {
            break;
        }
        bytes_taken += ch.len_utf8();
    }
    if bytes_taken == 0 {
        Err(LexingError::NoMatchesTaken)
    } else {
        Ok((&data[..bytes_taken], bytes_taken))
    }
}
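/// Scans `data` until every predicate matches on consecutive characters,
/// returning the text before that match plus the total number of bytes
/// consumed, delimiter included.
///
/// For example, matching the closing `"#` of a raw string with the
/// predicates [is `"`, is `#`] against `ab"#` finds the quote at byte 2
/// and the hash at byte 3, so the call returns `Ok(("ab", 4))`.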
fn take_until_successive_match<'a, F>(
    data: &'a str,
    preds: &mut [F],
) -> Result<(&'a str, usize), LexingError>
where
    F: FnMut(char) -> bool,
{
    assert!(!preds.is_empty(), "Predicates need to be provided");
    let mut iter = data.char_indices();
    while let Some((byte_index, ch)) = iter.next() {
        if !preds[0](ch) {
            continue;
        }
        // The first predicate matched; check the remaining predicates
        // against the characters that follow, without disturbing `iter`.
        let mut end = byte_index + ch.len_utf8();
        let mut lookahead = iter.clone();
        let mut matched = true;
        for pred in preds[1..].iter_mut() {
            match lookahead.next() {
                Some((_, next_ch)) => {
                    if pred(next_ch) {
                        end += next_ch.len_utf8();
                    } else {
                        // The candidate delimiter broke off early; resume
                        // the outer scan after the first matched character.
                        matched = false;
                        break;
                    }
                }
                // The input ended in the middle of a would-be delimiter.
                None => return Err(LexingError::RunawayQuote),
            }
        }
        if matched {
            if byte_index == 0 {
                return Err(LexingError::NoMatchesTaken);
            }
            return Ok((&data[..byte_index], end));
        }
    }
    // The delimiter never appeared: a runaway (unterminated) literal.
    Err(LexingError::RunawayQuote)
}
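/// Skips forward until just past the first occurrence of `pattern`; if the
/// pattern never appears (an unterminated block comment), the rest of the
/// input is consumed.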
fn skip_until<'a>(mut src: &'a str, pattern: &str) -> &'a str {
    while !src.is_empty() && !src.starts_with(pattern) {
        let next_char_size = src
            .chars()
            .next()
            .expect("the string is not empty")
            .len_utf8();
        src = &src[next_char_size..];
    }
    // When the pattern was never found, `src` is empty and there is
    // nothing left to slice off.
    src.get(pattern.len()..).unwrap_or("")
}
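// Sketch tests, not from the original file: they exercise the tokenizer on
// small inputs using only the types defined above. Pattern matching is used
// instead of `assert_eq!` so no extra derives are assumed.
#[cfg(test)]
mod tokenizer_sketch_tests {
    use super::*;

    fn kinds(input: &str) -> Vec<TokenKind> {
        let mut tokenizer = Tokenizer::new(input);
        let mut kinds = Vec::new();
        loop {
            match tokenizer.next_token() {
                Ok(Some(token)) => kinds.push(token.kind),
                Ok(None) => return kinds,
                Err(_) => panic!("expected the input to lex"),
            }
        }
    }

    #[test]
    fn lexes_a_comment_then_an_arrow() {
        let kinds = kinds("//note\n->");
        assert!(matches!(
            kinds.as_slice(),
            [TokenKind::Comment(c), TokenKind::Arrow] if c == "note"
        ));
    }

    #[test]
    fn lexes_a_raw_string_with_an_embedded_quote() {
        let kinds = kinds(r##"r#"a "quoted" word"#"##);
        assert!(matches!(
            kinds.as_slice(),
            [TokenKind::StringLiteral(s)] if s == r#"a "quoted" word"#
        ));
    }
}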