use std::{
collections::VecDeque,
iter::{DoubleEndedIterator, Peekable},
str::CharIndices,
};
/// A lexical token produced by [`Tokens`].
///
/// Every payload is a string slice (normally borrowed from the tokenized
/// input), so a token is a small, cheaply copyable value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Token<'a> {
    /// A delimited comment: `(opening delimiter, body, closing delimiter)`,
    /// e.g. `("/*", " text ", "*/")`.
    BlockComment(&'a str, &'a str, &'a str),
    /// An identifier: starts with an alphabetic character or `_` and
    /// continues with alphanumeric characters or `_`.
    Ident(&'a str),
    /// A comment running to the end of the line: `(marker, text)`, where the
    /// marker is the run of comment characters (e.g. `"//"`, `"##"`) and the
    /// text excludes the whitespace directly after the marker.
    LineComment(&'a str, &'a str),
    /// The text of a numeric literal, including any leading sign or
    /// `0b` / `0o` / `0x` prefix.
    Number(&'a str),
    /// A quoted string: `(opening quote(s), content, closing quote(s))`.
    String(&'a str, &'a str, &'a str),
    /// A single character that does not start any other kind of token. Also
    /// emitted for the first character of an unterminated string or comment.
    Symbol(&'a str),
}
/// Holds the source text to be tokenized.
///
/// The tokenizer is only a view over a borrowed string, so it is cheap to
/// copy and can be printed for diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct Tokenizer<'a> {
    /// The complete input text; tokens borrow their slices from it.
    content: &'a str,
}
impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer over `content` without copying it.
    pub fn new(content: &'a str) -> Self {
        Self { content }
    }

    /// Returns a fresh token iterator positioned at the start of the content.
    ///
    /// Each call starts over from the beginning; the returned iterator borrows
    /// the source text, not the tokenizer itself.
    pub fn tokens(&self) -> Tokens<'a> {
        let source = self.content;
        let unread = source.char_indices().peekable();
        Tokens {
            content: source,
            chars: unread,
            current_token_idx: 0,
            backlog: VecDeque::new(),
        }
    }
}
/// A fixed-capacity ring buffer that remembers the most recently pushed items.
///
/// The capacity is the length of the vector handed to [`Cycle::new`]; every
/// `push` overwrites the oldest slot. A zero-capacity buffer is valid and
/// simply retains nothing.
struct Cycle<T> {
    /// Backing storage; its length never changes after construction.
    items: Vec<T>,
    /// Slot that the next `push` overwrites, i.e. the position of the oldest
    /// item. Always `< items.len()` unless the buffer is empty.
    index: usize,
}

impl<T: Copy> Cycle<T> {
    /// Creates a buffer pre-filled with `starting_items`, which also fixes
    /// its capacity.
    fn new(starting_items: Vec<T>) -> Self {
        Cycle {
            items: starting_items,
            index: 0,
        }
    }

    /// Replaces the oldest item with `item`.
    ///
    /// On a zero-capacity buffer this is a no-op instead of an out-of-bounds
    /// panic.
    fn push(&mut self, item: T) {
        if let Some(slot) = self.items.get_mut(self.index) {
            *slot = item;
            self.bump_index();
        }
    }

    /// Advances the write position by one slot, wrapping at the end.
    fn bump_index(&mut self) {
        // Guard against `% 0`, which would panic for an empty buffer.
        if !self.items.is_empty() {
            self.index = (self.index + 1) % self.items.len();
        }
    }

    /// Returns the stored items ordered from oldest to newest.
    fn get_items(&self) -> Vec<T> {
        // Everything from the write position onward is older than what lies
        // before it, so the tail comes first.
        let (newer, older) = self.items.split_at(self.index);
        older.iter().chain(newer).copied().collect()
    }
}
/// Iterator state for walking a source string one token at a time.
///
/// Cloning yields an independent cursor at the same position, which allows
/// look-ahead without disturbing the original iterator.
#[derive(Debug, Clone)]
pub struct Tokens<'a> {
    /// Characters that were consumed and then pushed back; they are replayed
    /// (front first) before anything further is read from `chars`.
    backlog: VecDeque<(usize, char)>,
    /// The not-yet-read part of the input as `(byte index, char)` pairs.
    chars: Peekable<CharIndices<'a>>,
    /// The complete input text; token slices are taken from it.
    content: &'a str,
    /// Byte index in `content` of the first character of the token that is
    /// currently being built.
    current_token_idx: usize,
}
impl<'a> Tokens<'a> {
    /// Consumes and returns the next `(byte index, char)` pair.
    ///
    /// Characters that were pushed back onto the backlog are replayed before
    /// anything new is read from the underlying character iterator.
    fn advance(&mut self) -> Option<(usize, char)> {
        self.next_backlog().or_else(|| self.chars.next())
    }

    /// Consumes the next character and records its byte index as the start of
    /// the token being built. Returns `None` at end of input.
    fn start_new_token(&mut self) -> Option<char> {
        let (idx, ch) = self.advance()?;
        self.current_token_idx = idx;
        Some(ch)
    }

    /// Returns the pair that `advance` would yield next, without consuming it.
    fn peek(&mut self) -> Option<(usize, char)> {
        // Lazy fallback: the underlying iterator is only consulted when the
        // backlog is empty.
        self.peek_backlog().or_else(|| self.chars.peek().copied())
    }

    /// Removes and returns the first backlog entry, if any.
    fn next_backlog(&mut self) -> Option<(usize, char)> {
        self.backlog.pop_front()
    }

    /// Returns the first backlog entry without removing it.
    fn peek_backlog(&self) -> Option<(usize, char)> {
        self.backlog.front().copied()
    }

    /// Re-queues `new_chars` so that they are read again, in the given order,
    /// before anything already waiting in the backlog.
    fn push_backlog<I>(&mut self, new_chars: I)
    where
        I: Iterator<Item = (usize, char)> + DoubleEndedIterator,
    {
        // Pushing to the front in reverse keeps the original order.
        for entry in new_chars.rev() {
            self.backlog.push_front(entry);
        }
    }

    /// Byte index at which the current token starts.
    fn token_start(&self) -> usize {
        self.current_token_idx
    }

    /// Skips whitespace and returns the byte index of the first
    /// non-whitespace character (or the content length at end of input).
    fn eat_whitespace(&mut self) -> usize {
        self.take_if(&mut |ch: char| ch.is_whitespace())
    }

    /// Like `eat_whitespace`, but stops in front of `'\n'` and `'\r'` so the
    /// caller stays on the current line.
    fn eat_non_newline_whitespace(&mut self) -> usize {
        self.take_if(&mut |ch: char| ch != '\n' && ch != '\r' && ch.is_whitespace())
    }

    /// Consumes characters for as long as `cond` accepts them.
    ///
    /// Returns the byte index of the first rejected character, which is left
    /// unconsumed, or the content length if the input ran out first.
    fn take_if<F>(&mut self, cond: &mut F) -> usize
    where
        F: FnMut(char) -> bool,
    {
        while let Some((idx, ch)) = self.peek() {
            if !cond(ch) {
                return idx;
            }
            self.advance();
        }
        self.content.len()
    }

    /// Runs `take_if` and returns everything from the start of the current
    /// token up to the point where it stopped.
    fn take_if_slice<F>(&mut self, cond: &mut F) -> &'a str
    where
        F: FnMut(char) -> bool,
    {
        let end = self.take_if(cond);
        self.slice_from_token_start(end)
    }

    /// The content between the start of the current token and byte `end`.
    fn slice_from_token_start(&self, end: usize) -> &'a str {
        self.slice(self.token_start(), end)
    }

    /// The content between byte offsets `start` (inclusive) and `end`
    /// (exclusive). Panics if either is out of range or not on a character
    /// boundary.
    fn slice(&self, start: usize, end: usize) -> &'a str {
        &self.content[start..end]
    }

    /// Tries to read a block comment that opens with `start_sequence` and
    /// closes with `end_sequence`.
    ///
    /// The caller must already have consumed `start_sequence[0]` as the first
    /// character of the current token. Outcomes:
    ///
    /// * the opener and a closer are found: `Token::BlockComment(opener,
    ///   body, closer)`;
    /// * the opener is incomplete, or no closer exists: every character after
    ///   the first is pushed back for re-tokenizing and the first character
    ///   is returned as a `Token::Symbol`.
    ///
    /// Panics if `start_sequence` is empty.
    fn block_comment(
        &mut self,
        start_sequence: &[char],
        end_sequence: &[char],
    ) -> Option<Token<'a>> {
        let token_start = self.token_start();
        let first_char_end = token_start + start_sequence[0].len_utf8();
        // Byte offset just past the part of the opener matched so far.
        let mut matched_end = first_char_end;
        for &expected in &start_sequence[1..] {
            match self.peek() {
                Some((_, ch)) if ch == expected => {
                    self.advance();
                    matched_end += ch.len_utf8();
                }
                _ => {
                    // Re-queue the matched characters with their real byte
                    // offsets. They directly follow the first character, so
                    // they start at `first_char_end` (not at `token_start`,
                    // which would make the next token slice the wrong text).
                    let requeued = self
                        .slice(first_char_end, matched_end)
                        .char_indices()
                        .map(move |(offset, ch)| (first_char_end + offset, ch));
                    self.push_backlog(requeued);
                    return Some(Token::Symbol(self.slice(token_start, first_char_end)));
                }
            }
        }
        let opener = self.slice(token_start, matched_end);
        match self.take_block(matched_end, end_sequence) {
            Ok((body, closer)) => Some(Token::BlockComment(opener, body, closer)),
            Err(token) => Some(token),
        }
    }

    /// Consumes input up to and including the first occurrence of
    /// `end_sequence`.
    ///
    /// `content_idx` is the byte index where the block's body begins. On
    /// success returns `Ok((body, closer))`. If the input ends first, every
    /// character after the first one of the current token is pushed back for
    /// re-tokenizing and `Err(Token::Symbol(first char))` is returned.
    fn take_block(
        &mut self,
        content_idx: usize,
        end_sequence: &[char],
    ) -> Result<(&'a str, &'a str), Token<'a>> {
        // Sliding window over the most recently consumed characters; the
        // filler is a character that none of the closers used in this file
        // contain.
        let mut recent = Cycle::new(vec!['@'; end_sequence.len()]);
        let mut until_closed = |ch: char| {
            // The check runs before `ch` is recorded, so consumption stops at
            // the first character after the closer.
            let closed = recent.get_items() == end_sequence;
            if !closed {
                recent.push(ch);
            }
            !closed
        };
        let end = self.take_if(&mut until_closed);
        if recent.get_items() == end_sequence {
            let closer_len: usize = end_sequence.iter().map(|ch| ch.len_utf8()).sum();
            let closer_start = end - closer_len;
            Ok((
                self.slice(content_idx, closer_start),
                self.slice(closer_start, end),
            ))
        } else {
            // Every opener used by the callers in this file begins with a
            // one-byte character, hence `+ 1`.
            let backlog_start = self.token_start() + 1;
            let requeued = self
                .slice(backlog_start, end)
                .char_indices()
                .map(move |(offset, ch)| (backlog_start + offset, ch));
            self.push_backlog(requeued);
            Err(Token::Symbol(self.slice_from_token_start(backlog_start)))
        }
    }
}
impl<'a> Iterator for Tokens<'a> {
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
self.eat_whitespace();
match self.start_new_token() {
Some(ch) if ch.is_alphabetic() || ch == '_' => Some(Token::Ident(
self.take_if_slice(&mut |ch| ch.is_alphanumeric() || ch == '_'),
)),
Some('0') => match self.peek() {
Some((_, 'b')) => {
self.advance();
Some(Token::Number(self.take_if_slice(&mut |ch| {
ch == '1' || ch == '0' || ch == '_'
})))
}
Some((_, 'o')) => {
self.advance();
Some(Token::Number(self.take_if_slice(&mut |ch| match ch {
'0'..='7' | '_' => true,
_ => false,
})))
}
Some((_, 'x')) => {
self.advance();
Some(Token::Number(self.take_if_slice(&mut |ch| {
ch.is_ascii_hexdigit() || ch == '_'
})))
}
_ => Some(Token::Number(self.take_if_slice(&mut numeric_closure()))),
},
Some(ch) if ch == '-' || ch == '+' => match self.peek() {
Some((_, ch)) if ch.is_numeric() => {
Some(Token::Number(self.take_if_slice(&mut numeric_closure())))
}
Some((_, '-')) if ch == '-' => {
let symbol = self.take_if_slice(&mut |ch| ch == '-');
let comment_start = self.eat_non_newline_whitespace();
let comment_end = self.take_if(&mut |ch| ch != '\r' && ch != '\n');
let comment = self.slice(comment_start, comment_end);
Some(Token::LineComment(symbol, comment))
}
_ => Some(Token::Symbol(
&self.content[self.token_start()..self.token_start() + 1],
)),
},
Some(ch) if ch.is_numeric() => {
Some(Token::Number(self.take_if_slice(&mut numeric_closure())))
}
Some('/') => match self.peek() {
Some((_, '/')) => {
let symbol = self.take_if_slice(&mut |ch| ch == '/');
let comment_start = self.eat_non_newline_whitespace();
let comment_end = self.take_if(&mut |ch| ch != '\r' && ch != '\n');
let comment = self.slice(comment_start, comment_end);
Some(Token::LineComment(symbol, comment))
}
Some((_, '*')) => self.block_comment(&vec!['/', '*'], &vec!['*', '/']),
_ => Some(Token::Symbol(
self.slice_from_token_start(self.token_start() + 1),
)),
},
Some('{') => match self.peek() {
Some((_, '-')) => self.block_comment(&vec!['{', '-'], &vec!['-', '}']),
_ => Some(Token::Symbol(
self.slice_from_token_start(self.token_start() + 1),
)),
},
Some('(') => match self.peek() {
Some((_, '*')) => self.block_comment(&vec!['(', '*'], &vec!['*', ')']),
_ => Some(Token::Symbol(
self.slice_from_token_start(self.token_start() + 1),
)),
},
Some('<') => self.block_comment(&vec!['<', '!', '-', '-'], &vec!['-', '-', '>']),
Some('#') => {
let symbol = self.take_if_slice(&mut |ch| ch == '#');
let comment_start = self.eat_non_newline_whitespace();
let comment_end = self.take_if(&mut |ch| ch != '\r' && ch != '\n');
let comment = self.slice(comment_start, comment_end);
Some(Token::LineComment(symbol, comment))
}
Some('%') => {
let symbol = self.take_if_slice(&mut |ch| ch == '%');
let comment_start = self.eat_non_newline_whitespace();
let comment_end = self.take_if(&mut |ch| ch != '\r' && ch != '\n');
let comment = self.slice(comment_start, comment_end);
Some(Token::LineComment(symbol, comment))
}
Some(quote_char @ '"') | Some(quote_char @ '\'') | Some(quote_char @ '`') => {
let symbol = self.take_if_slice(&mut |ch| ch == quote_char);
match symbol.len() {
1 => {
let mut is_escaped = false;
let mut string_closure = |ch: char| {
let should_take = !((ch == quote_char && !is_escaped) || ch == '\n');
is_escaped = ch == '\\' && !is_escaped;
should_take
};
let string_end = self.take_if(&mut string_closure);
let string_content = self.slice(self.token_start() + 1, string_end);
match self.peek() {
Some((_, ch)) if ch == quote_char => {
self.advance();
Some(Token::String(
self.slice_from_token_start(self.token_start() + 1),
string_content,
self.slice(string_end, string_end + 1),
))
}
_ => {
let backlog_start = self.token_start() + 1;
let chars_to_backlog = string_content
.char_indices()
.map(|(idx, ch)| (idx + backlog_start, ch));
self.push_backlog(chars_to_backlog);
Some(Token::Symbol(self.slice_from_token_start(backlog_start)))
}
}
}
2 => Some(Token::String(
self.slice_from_token_start(self.token_start() + 1),
"",
self.slice(self.token_start() + 1, self.token_start() + 2),
)),
_ => {
let string_indicator = vec![quote_char; symbol.len()];
match self.take_block(
self.token_start() + string_indicator.len(),
&string_indicator,
) {
Ok((content, end_indicator)) => Some(Token::String(
self.slice_from_token_start(self.token_start() + symbol.len()),
content,
end_indicator,
)),
Err(token) => Some(token),
}
}
}
}
Some(ch) if ch.is_ascii_punctuation() => Some(Token::Symbol(
&self.content[self.token_start()..self.token_start() + 1],
)),
Some(ch) => Some(Token::Symbol(
self.slice_from_token_start(self.token_start() + ch.len_utf8()),
)),
None => None,
}
}
}
/// Builds a stateful predicate for the characters of a decimal literal.
///
/// The predicate accepts numeric characters and `_` any number of times, and
/// `.` only the first time it is offered; everything else is rejected. Each
/// call returns a fresh predicate that has not yet seen a `.`.
fn numeric_closure() -> Box<dyn FnMut(char) -> bool> {
    let mut decimal_point_used = false;
    Box::new(move |candidate: char| {
        if candidate.is_numeric() || candidate == '_' {
            return true;
        }
        if candidate == '.' && !decimal_point_used {
            decimal_point_used = true;
            return true;
        }
        false
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    use Token::*;

    /// Tokenizes `sample` completely and collects the tokens.
    fn lex<'a>(sample: &'a str) -> Vec<Token<'a>> {
        Tokenizer::new(sample).tokens().collect()
    }

    #[test]
    fn idents_symbols() {
        let sample = r#"
fn main() {
let x_x2 = 京y;
let _ = 4;
println!("{}", x_x2);
}
"#;
        let expected = vec![
            Ident("fn"),
            Ident("main"),
            Symbol("("),
            Symbol(")"),
            Symbol("{"),
            Ident("let"),
            Ident("x_x2"),
            Symbol("="),
            Ident("京y"),
            Symbol(";"),
            Ident("let"),
            Ident("_"),
            Symbol("="),
            Number("4"),
            Symbol(";"),
            Ident("println"),
            Symbol("!"),
            Symbol("("),
            String("\"", "{}", "\""),
            Symbol(","),
            Ident("x_x2"),
            Symbol(")"),
            Symbol(";"),
            Symbol("}"),
        ];
        assert_eq!(lex(sample), expected)
    }

    #[test]
    fn numbers() {
        let sample = r#"
1;
1_000;
-1;
-1_000;
1.5;
.1.5;
1.1.4;
0b1010;
0o700;
0xFFFFFFFFFFFFFFFFF;
"#;
        let expected = vec![
            Number("1"),
            Symbol(";"),
            Number("1_000"),
            Symbol(";"),
            Number("-1"),
            Symbol(";"),
            Number("-1_000"),
            Symbol(";"),
            Number("1.5"),
            Symbol(";"),
            Symbol("."),
            Number("1.5"),
            Symbol(";"),
            Number("1.1"),
            Symbol("."),
            Number("4"),
            Symbol(";"),
            Number("0b1010"),
            Symbol(";"),
            Number("0o700"),
            Symbol(";"),
            Number("0xFFFFFFFFFFFFFFFFF"),
            Symbol(";"),
        ];
        assert_eq!(lex(sample), expected)
    }

    #[test]
    fn line_comment() {
        let sample = r#"
// this is a line comment
/// this is also one
//
--Another line
## Python here
% anotha one
"#;
        let expected = vec![
            LineComment("//", "this is a line comment"),
            LineComment("///", "this is also one"),
            LineComment("//", ""),
            LineComment("--", "Another line"),
            LineComment("##", "Python here"),
            LineComment("%", "anotha one"),
        ];
        assert_eq!(lex(sample), expected)
    }

    #[test]
    fn string() {
        let sample = r#"
"Hello, World"
'Heyyy, single quotes'
`Back ticks`
"#;
        let expected = vec![
            String("\"", "Hello, World", "\""),
            String("'", "Heyyy, single quotes", "'"),
            String("`", "Back ticks", "`"),
        ];
        assert_eq!(lex(sample), expected)
    }

    #[test]
    fn string_multiline() {
        // The content of a multi-line string is whitespace-sensitive, so the
        // sample spells out its whitespace instead of relying on how a raw
        // string happens to be indented in this file.
        let sample = "\n\"\"\" Hey there\n this is a multiliner\"\"\"\n";
        let expected = vec![String(
            "\"\"\"",
            " Hey there\n this is a multiliner",
            "\"\"\"",
        )];
        assert_eq!(lex(sample), expected);
    }

    #[test]
    fn string_multiline_other() {
        let sample = r#"
''' hey single quotes '''
``` hey backticks ```
"#;
        let expected = vec![
            String("'''", " hey single quotes ", "'''"),
            String("```", " hey backticks ", "```"),
        ];
        assert_eq!(lex(sample), expected);
    }

    #[test]
    fn string_unterminated_multiline() {
        let sample = r#"
"""
"#;
        let expected = vec![Symbol("\""), String("\"", "", "\"")];
        assert_eq!(lex(sample), expected);
    }

    #[test]
    fn incomplete_string() {
        let sample = r#"
"Hello
10
"#;
        let expected = vec![Symbol("\""), Ident("Hello"), Number("10")];
        assert_eq!(lex(sample), expected)
    }

    #[test]
    fn escaped_quote() {
        let sample = r#"
"Hello\" World"
"#;
        let expected = vec![String("\"", "Hello\\\" World", "\"")];
        assert_eq!(lex(sample), expected)
    }

    #[test]
    fn mismatched_string_identifiers() {
        let sample = r#"
"Hello World'
"#;
        let expected = vec![Symbol("\""), Ident("Hello"), Ident("World"), Symbol("'")];
        assert_eq!(lex(sample), expected)
    }

    #[test]
    fn block_comment() {
        // Comment bodies are whitespace-sensitive; see `string_multiline`.
        let sample = "\n/* Comment Here */\n/* */\n/**/\n/*\n * Multi line*/\n";
        let expected = vec![
            BlockComment("/*", " Comment Here ", "*/"),
            BlockComment("/*", " ", "*/"),
            BlockComment("/*", "", "*/"),
            BlockComment("/*", "\n * Multi line", "*/"),
        ];
        assert_eq!(lex(sample), expected);
    }

    #[test]
    fn other_block_comments() {
        let sample = r#"
{-comment-}
(*block*)
"#;
        let expected = vec![
            BlockComment("{-", "comment", "-}"),
            BlockComment("(*", "block", "*)"),
        ];
        assert_eq!(lex(sample), expected);
    }

    #[test]
    fn html_comment() {
        // Comment bodies are whitespace-sensitive; see `string_multiline`.
        let sample = "\n<!-- Comment Here-->\n<!-- \n Multi line\n Comment\n -->\n<!---->\n<!-- -->\n";
        let expected = vec![
            BlockComment("<!--", " Comment Here", "-->"),
            BlockComment(
                "<!--",
                " \n Multi line\n Comment\n ",
                "-->",
            ),
            BlockComment("<!--", "", "-->"),
            BlockComment("<!--", " ", "-->"),
        ];
        assert_eq!(lex(sample), expected);
    }

    #[test]
    fn unterminated_html_comment() {
        let sample = r#"
<!-- hey
"#;
        let expected = vec![Symbol("<"), Symbol("!"), LineComment("--", "hey")];
        assert_eq!(lex(sample), expected);
    }

    #[test]
    fn unterminated_html_comment2() {
        let sample = r#"
< let x
"#;
        let expected = vec![Symbol("<"), Ident("let"), Ident("x")];
        assert_eq!(lex(sample), expected);
    }

    #[test]
    fn unterminated_html_comment3() {
        let sample = r#"<"#;
        let expected = vec![Symbol("<")];
        assert_eq!(lex(sample), expected);
    }

    #[test]
    fn unterminated_block_comment() {
        let sample = r#"
/* let x
"#;
        let expected = vec![Symbol("/"), Symbol("*"), Ident("let"), Ident("x")];
        assert_eq!(lex(sample), expected);
    }

    #[test]
    fn random_chars() {
        let sample = r#"
→
"#;
        let expected = vec![Symbol("→")];
        assert_eq!(lex(sample), expected);
    }

    #[test]
    fn nested_backlog() {
        // An unterminated comment containing an unterminated string: both
        // fallbacks push characters back, one inside the other.
        let sample = r#"
/* `helloworldwhat
let x = 5
"#;
        let expected = vec![
            Symbol("/"),
            Symbol("*"),
            Symbol("`"),
            Ident("helloworldwhat"),
            Ident("let"),
            Ident("x"),
            Symbol("="),
            Number("5"),
        ];
        assert_eq!(lex(sample), expected);
    }

    #[test]
    fn test_escaped_string() {
        let sample = r#"
"Hello \"World"
"Hello World\\"
"Hello World\" x
"#;
        let expected = vec![
            String("\"", "Hello \\\"World", "\""),
            String("\"", "Hello World\\\\", "\""),
            Symbol("\""),
            Ident("Hello"),
            Ident("World"),
            Symbol("\\"),
            Symbol("\""),
            Ident("x"),
        ];
        assert_eq!(lex(sample), expected);
    }
}