use crate::compiler::parser::ast::Span;
use crate::compiler::parser::lexer::TokenType::{
AmpAmp, Bang, BangBang, BangEquals, Colon, Comma, Dot, Eof, Equals, EqualsEquals, Error, False,
Function, GreaterThan, Identifier, LeftCurly, LeftParen, LessThan, Match, Minus, Number,
PipePipe, Plus, Return, RightArrow, RightCurly, RightParen, Semicolon, Slash, Star,
StringLiteral, True, Type,
};
use std::str::Chars;
/// The kind of a lexical token produced by `TokenStream`.
#[derive(Debug, PartialEq, Copy, Clone)]
pub enum TokenType {
// Single-character operators and punctuation.
Equals,
LeftParen,
RightParen,
Comma,
Bang,
Plus,
Minus,
Star,
Slash,
LeftCurly,
RightCurly,
GreaterThan,
LessThan,
Semicolon,
Colon,
Dot,
// Two-character operators (matched greedily before their one-char prefixes).
EqualsEquals,
BangEquals,
AmpAmp,
PipePipe,
RightArrow,
BangBang,
// Literals and identifiers. `True`/`False` are produced by re-tagging an
// identifier whose text is exactly "true"/"false" (see `TokenStream::keyword`).
True,
False,
Number,
StringLiteral,
Identifier,
// Keywords, also produced by re-tagging identifiers.
Function,
Type,
Match,
Return,
// Sentinels: unrecognized input, and end of input (also used for an
// unterminated string literal).
Error,
Eof,
}
/// A single lexed token; its text borrows from the input source.
#[derive(Debug, Clone)]
pub struct Token<'input> {
// Slice of the source text covered by this token. For `StringLiteral`
// tokens the surrounding double quotes are excluded.
pub value: &'input str,
// Kind of the token.
pub t_type: TokenType,
// 1-based line number where the token starts.
pub line: usize,
// 1-based column (offset within the line) where the token starts.
pub offset_in_line: usize,
}
impl<'input> Token<'input> {
    /// Converts this token's recorded position and text length into a
    /// `Span` for diagnostics.
    pub(crate) fn span(&self) -> Span {
        let length = self.value.len();
        Span {
            line: self.line,
            offset: self.offset_in_line,
            length,
        }
    }
}
/// A hand-written, on-demand lexer; call `next()` repeatedly to pull tokens.
pub struct TokenStream<'input> {
// Full source text; token values are slices into it.
content: &'input str,
// Character iterator over `content`; feeds `current_char`.
chars: Chars<'input>,
// Character currently being looked at; `None` once the input is exhausted.
current_char: Option<char>,
// Offset of `current_char` within `content`. NOTE(review): this is bumped
// by 1 per character in `advance()` but used as a *byte* index when slicing
// in `make_token` — these disagree for multi-byte UTF-8 characters; confirm
// the input is expected to be ASCII.
current_char_offset: usize,
// Set once the input has been fully consumed.
finished: bool,
// Start position of the token currently being scanned (offset, 1-based
// line, and 1-based column), recorded by `next()` before scanning.
current_token_start_offset: usize,
current_token_start_line: usize,
current_token_start_in_line: usize,
// Current 1-based line number.
global_line_count: usize,
// Offset where the current line starts; used to compute token columns.
last_line_start_global_offset: usize,
}
impl<'input> TokenStream<'input> {
    /// Creates a stream over `content`, primed so that `current_char` holds
    /// the first character (or `None` for empty input).
    pub fn from(content: &'input str) -> TokenStream<'input> {
        let mut result = TokenStream {
            content,
            chars: content.chars(),
            current_char: None,
            current_char_offset: 0,
            finished: false,
            current_token_start_offset: 0,
            current_token_start_line: 1,
            current_token_start_in_line: 1,
            global_line_count: 1,
            last_line_start_global_offset: 0,
        };
        result.current_char = result.chars.next();
        result
    }

    /// Records a consumed line break. Must be called *after* `advance()` has
    /// stepped past the break so `last_line_start_global_offset` is the
    /// offset of the first character of the new line. Every `\n` and `\r`
    /// counts as its own break, so `\r\n` bumps the line count twice (the
    /// unit tests rely on this).
    fn increment_line(&mut self) {
        self.global_line_count += 1;
        self.last_line_start_global_offset = self.current_char_offset;
    }

    /// Consumes the current character and loads the next one, setting
    /// `finished` once the input runs out.
    ///
    /// Fix: the offset previously grew by a constant 1 per character, which
    /// produces non-char-boundary byte indices for multi-byte UTF-8 input and
    /// makes the slices in `make_token` panic (identifiers accept any Unicode
    /// alphanumeric continuation character). The offset now grows by the
    /// UTF-8 byte length of the consumed character; for ASCII input the
    /// behavior is unchanged.
    fn advance(&mut self) {
        if self.finished {
            self.current_char = None;
            return;
        }
        if let Some(c) = self.current_char {
            self.current_char_offset += c.len_utf8();
        }
        self.current_char = self.chars.next();
        if self.current_char.is_none() {
            self.finished = true;
        }
    }

    /// Returns the current character without consuming it. `'\0'` doubles as
    /// the end-of-input sentinel, so a literal NUL byte in the source is
    /// treated as end of input.
    fn peek(&self) -> char {
        self.current_char.unwrap_or('\0')
    }

    /// Builds a token of kind `t_type` covering the bytes from the recorded
    /// token start up to (not including) the current position.
    fn make_token(&self, t_type: TokenType) -> Token<'input> {
        Token {
            value: &self.content[self.current_token_start_offset..self.current_char_offset],
            t_type,
            line: self.current_token_start_line,
            offset_in_line: self.current_token_start_in_line,
        }
    }

    /// Builds a `StringLiteral` token whose value excludes the surrounding
    /// double quotes (both quotes are ASCII, so trimming one byte per side
    /// is always on a char boundary).
    fn make_string_token(&self) -> Token<'input> {
        Token {
            value: &self.content[self.current_token_start_offset + 1..self.current_char_offset - 1],
            t_type: StringLiteral,
            line: self.current_token_start_line,
            offset_in_line: self.current_token_start_in_line,
        }
    }

    /// Scans the rest of a string literal; the opening quote has already been
    /// consumed. Strings may span multiple lines. If the input ends before a
    /// closing quote, an `Eof` token carrying the unterminated text
    /// (including the opening quote) is returned.
    fn string(&mut self) -> Token<'input> {
        loop {
            match self.peek() {
                '\0' => {
                    return self.make_token(Eof);
                }
                '\n' | '\r' => {
                    // Fix: advance *before* recording the break, matching
                    // `skip_whitespace`. The previous order left
                    // `last_line_start_global_offset` pointing at the break
                    // itself, so columns of tokens following a multi-line
                    // string were off by one.
                    self.advance();
                    self.increment_line();
                }
                '\"' => {
                    self.advance();
                    return self.make_string_token();
                }
                _ => {
                    self.advance();
                }
            }
        }
    }

    /// Skips spaces, tabs, and line breaks (updating line bookkeeping) and
    /// stops at the first non-whitespace character or at end of input.
    fn skip_whitespace(&mut self) {
        loop {
            match self.peek() {
                '\t' | ' ' => {
                    self.advance();
                }
                '\r' | '\n' => {
                    self.advance();
                    self.increment_line();
                }
                '\0' => {
                    // Covers empty input, where `current_char` starts as
                    // `None` but `advance()` never ran to set `finished`.
                    self.finished = true;
                    break;
                }
                _ => {
                    return;
                }
            }
        }
    }

    /// Re-tags an `Identifier` token as a keyword when its text is an exact
    /// match; anything else (e.g. "truea") stays an identifier.
    fn keyword(token: &mut Token) {
        match token.value {
            "function" => token.t_type = Function,
            "type" => token.t_type = Type,
            "false" => token.t_type = False,
            "match" => token.t_type = Match,
            "return" => token.t_type = Return,
            "true" => token.t_type = True,
            _ => {}
        }
    }

    /// Scans an identifier. The first character was validated by `next()`
    /// (ASCII letter or `_`); continuation characters may be any Unicode
    /// alphanumeric or `_`.
    fn identifier(&mut self) -> Token<'input> {
        while !self.finished && (self.peek().is_alphanumeric() || self.peek() == '_') {
            self.advance();
        }
        let mut token = self.make_token(Identifier);
        Self::keyword(&mut token);
        token
    }

    /// Scans an unsigned integer literal (ASCII digits only).
    fn number(&mut self) -> Token<'input> {
        while !self.finished && self.peek().is_ascii_digit() {
            self.advance();
        }
        self.make_token(Number)
    }

    /// Produces the next token, or `None` once the input is exhausted.
    ///
    /// Two-character operators (`==`, `!=`, `!!`, `->`, `&&`, `||`) are
    /// matched greedily before their one-character prefixes; a lone `&` or
    /// `|` lexes as an `Error` token, as does any unrecognized character.
    pub(crate) fn next(&mut self) -> Option<Token<'input>> {
        self.skip_whitespace();
        if self.finished {
            return None;
        }
        // Record where this token starts (offset, line, 1-based column)
        // so `make_token` can build its span.
        self.current_token_start_offset = self.current_char_offset;
        self.current_token_start_line = self.global_line_count;
        self.current_token_start_in_line =
            self.current_token_start_offset - self.last_line_start_global_offset + 1;
        let current_char = self.peek();
        if current_char.is_ascii_digit() {
            return Some(self.number());
        }
        if current_char.is_ascii_alphabetic() || current_char == '_' {
            return Some(self.identifier());
        }
        self.advance();
        match current_char {
            '=' => {
                if self.peek() == '=' {
                    self.advance();
                    return Some(self.make_token(EqualsEquals));
                }
                Some(self.make_token(Equals))
            }
            '!' => {
                if self.peek() == '=' {
                    self.advance();
                    return Some(self.make_token(BangEquals));
                }
                if self.peek() == '!' {
                    self.advance();
                    return Some(self.make_token(BangBang));
                }
                Some(self.make_token(Bang))
            }
            '-' => {
                if self.peek() == '>' {
                    self.advance();
                    return Some(self.make_token(RightArrow));
                }
                Some(self.make_token(Minus))
            }
            '&' => {
                if self.peek() == '&' {
                    self.advance();
                    return Some(self.make_token(AmpAmp));
                }
                Some(self.make_token(Error))
            }
            '|' => {
                if self.peek() == '|' {
                    self.advance();
                    return Some(self.make_token(PipePipe));
                }
                Some(self.make_token(Error))
            }
            '(' => Some(self.make_token(LeftParen)),
            ')' => Some(self.make_token(RightParen)),
            ',' => Some(self.make_token(Comma)),
            '"' => Some(self.string()),
            '+' => Some(self.make_token(Plus)),
            '*' => Some(self.make_token(Star)),
            '/' => Some(self.make_token(Slash)),
            '{' => Some(self.make_token(LeftCurly)),
            '}' => Some(self.make_token(RightCurly)),
            '>' => Some(self.make_token(GreaterThan)),
            '<' => Some(self.make_token(LessThan)),
            ';' => Some(self.make_token(Semicolon)),
            ':' => Some(self.make_token(Colon)),
            '.' => Some(self.make_token(Dot)),
            _ => Some(self.make_token(Error)),
        }
    }
}
// Unit tests for the lexer. `test_token` compares line, kind, and column for
// every token, and compares `value` only for value-bearing kinds
// (Identifier, StringLiteral, Number), so expected operator tokens may carry
// any placeholder value.
#[cfg(test)]
mod tests {
use crate::compiler::parser::lexer::Token;
use crate::compiler::parser::lexer::TokenStream;
use crate::compiler::parser::lexer::TokenType::{
Bang, BangEquals, Eof, Equals, EqualsEquals, Error, GreaterThan, Identifier, Minus, Number,
StringLiteral, True,
};
// Asserts that `scanned` matches `expected_token` (value checked only for
// Identifier/StringLiteral/Number tokens).
fn test_token(expected_token: &Token, scanned: &Token) {
assert_eq!(expected_token.line, scanned.line);
assert_eq!(expected_token.t_type, scanned.t_type);
assert_eq!(expected_token.offset_in_line, scanned.offset_in_line);
if expected_token.t_type == Identifier
|| expected_token.t_type == StringLiteral
|| expected_token.t_type == Number
{
assert_eq!(expected_token.value, scanned.value);
}
}
// Drives skip_whitespace directly; note that '\r' and '\n' each count as a
// separate line break, so "\n\r" advances the line count by two.
#[test]
fn test_whitespace() {
let input = "\n\t a\n\rbcd\r \n";
let mut scanner = TokenStream::from(input);
scanner.skip_whitespace();
assert_eq!(scanner.global_line_count, 2);
assert_eq!(scanner.peek(), 'a');
scanner.advance();
scanner.skip_whitespace();
assert_eq!(scanner.global_line_count, 4);
assert_eq!(scanner.peek(), 'b');
scanner.advance();
scanner.skip_whitespace();
assert_eq!(scanner.global_line_count, 4);
assert_eq!(scanner.peek(), 'c');
scanner.advance();
scanner.skip_whitespace();
assert_eq!(scanner.global_line_count, 4);
assert_eq!(scanner.peek(), 'd');
scanner.advance();
scanner.skip_whitespace();
assert_eq!(scanner.global_line_count, 6);
assert_eq!(scanner.peek(), '\0');
assert_eq!(scanner.finished, true);
}
// Keywords are only recognized on exact identifier matches: "t", "fa",
// "lse", "truea", and "fals" all stay identifiers; only "true" becomes True.
#[test]
fn test_true_false() {
let input = "t true fa\nlse truea fals\n\r123";
let mut scanner = TokenStream::from(input);
test_token(
&Token {
value: "t",
t_type: Identifier,
line: 1,
offset_in_line: 1,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "true",
t_type: True,
line: 1,
offset_in_line: 3,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "fa",
t_type: Identifier,
line: 1,
offset_in_line: 8,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "lse",
t_type: Identifier,
line: 2,
offset_in_line: 1,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "truea",
t_type: Identifier,
line: 2,
offset_in_line: 5,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "fals",
t_type: Identifier,
line: 2,
offset_in_line: 11,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "123",
t_type: Number,
line: 4,
offset_in_line: 1,
},
&scanner.next().unwrap(),
);
assert!(scanner.next().is_none());
}
// Greedy matching: "==" beats "=", "!=" beats "!"; "=!==" lexes as
// Equals, BangEquals, Equals.
#[test]
fn test_two_character_operators() {
let input = " == 2== =!==\n=\n! =";
let mut scanner = TokenStream::from(input);
test_token(
&Token {
value: "==",
t_type: EqualsEquals,
offset_in_line: 2,
line: 1,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "2",
t_type: Number,
offset_in_line: 5,
line: 1,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "==",
t_type: EqualsEquals,
offset_in_line: 6,
line: 1,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "=",
t_type: Equals,
offset_in_line: 9,
line: 1,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "!=",
t_type: BangEquals,
offset_in_line: 10,
line: 1,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "=",
t_type: Equals,
offset_in_line: 12,
line: 1,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "=",
t_type: Equals,
offset_in_line: 1,
line: 2,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "!",
t_type: Bang,
offset_in_line: 1,
line: 3,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "=",
t_type: Equals,
offset_in_line: 3,
line: 3,
},
&scanner.next().unwrap(),
);
assert!(scanner.next().is_none());
}
// A two-char operator split across a line break falls back to its
// one-char pieces ("-\n>" is Minus then GreaterThan); a lone '|' is Error.
#[test]
fn test_end_in_the_middle_of_two_character_token() {
let input = "-\n> =\n= | |";
let mut scanner = TokenStream::from(input);
test_token(
&Token {
value: "-",
t_type: Minus,
offset_in_line: 1,
line: 1,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: ">",
t_type: GreaterThan,
offset_in_line: 1,
line: 2,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "=",
t_type: Equals,
offset_in_line: 3,
line: 2,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "=",
t_type: Equals,
offset_in_line: 1,
line: 3,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "",
t_type: Error,
offset_in_line: 3,
line: 3,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "",
t_type: Error,
offset_in_line: 5,
line: 3,
},
&scanner.next().unwrap(),
);
assert!(&scanner.next().is_none());
}
// String literal values exclude the surrounding quotes.
#[test]
fn test_string_simple() {
let input = "\"this\" \"another\"";
let mut scanner = TokenStream::from(input);
test_token(
&Token {
value: "this",
t_type: StringLiteral,
offset_in_line: 1,
line: 1,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "another",
t_type: StringLiteral,
offset_in_line: 8,
line: 1,
},
&scanner.next().unwrap(),
);
assert!(&scanner.next().is_none());
}
// Strings may span lines; an unterminated string yields an Eof token.
#[test]
fn test_string_multiline_and_eof() {
let input = " \"multi\nline\"\n\"\n";
let mut scanner = TokenStream::from(input);
test_token(
&Token {
value: "multi\nline",
t_type: StringLiteral,
offset_in_line: 2,
line: 1,
},
&scanner.next().unwrap(),
);
test_token(
&Token {
value: "\"",
t_type: Eof,
offset_in_line: 1,
line: 3,
},
&scanner.next().unwrap(),
);
assert!(scanner.next().is_none());
}
}