use super::Result;
use super::error::Error;
use super::error::SyntaxError;
use super::token::Token;
use super::token::TokenType::{self, *};
use std::iter::Peekable;
use std::slice::SliceIndex;
use std::str::CharIndices;
/// Token source with a single token of lookahead, layered on top of a [`Lexer`].
#[derive(Debug)]
pub(super) struct TokenStream<'a> {
/// Lexer that produces tokens on demand.
lexer: Lexer<'a>,
/// Token already lexed by `peek` but not yet handed out by `next`, if any.
lookahead: Option<Token>,
}
/// Character-level scanner that turns source text into [`Token`]s.
#[derive(Debug)]
pub(super) struct Lexer<'a> {
/// Byte offset just past the most recently consumed character (0 before any input is read).
pos: usize,
/// Byte offsets at which lines start: begins as `[0]` and gains the offset following each
/// consumed `'\n'`.
linebreaks: Vec<usize>,
/// Peekable `(byte offset, char)` iterator over `source`.
iter: Peekable<CharIndices<'a>>,
/// The complete text being lexed.
source: &'a str,
}
impl<'a> TokenStream<'a> {
    /// Creates a stream over `source`. No token is lexed until `next` or `peek` is called.
    #[must_use]
    pub(super) fn new(source: &'a str) -> Self {
        let lexer = Lexer::new(source);
        Self {
            lexer,
            lookahead: None,
        }
    }

    /// Consumes and returns the next token.
    ///
    /// A token buffered by an earlier `peek` is handed out first; otherwise a fresh token is
    /// requested from the lexer.
    ///
    /// # Errors
    /// Returns whatever error the lexer reports for the next token.
    #[hotpath::measure]
    pub(super) fn next(&mut self) -> Result<Token> {
        self.lookahead
            .take()
            .map_or_else(|| self.lexer.next_token(), Ok)
    }

    /// Returns a reference to the next token without consuming it.
    ///
    /// # Errors
    /// Returns whatever error the lexer reports; in that case nothing is buffered.
    #[hotpath::measure]
    pub(super) fn peek(&mut self) -> Result<&Token> {
        // Reuse the buffered token when present, otherwise lex one now. The slot is still
        // empty if lexing fails, because `take` left `None` behind.
        let upcoming = match self.lookahead.take() {
            Some(buffered) => buffered,
            None => self.lexer.next_token()?,
        };
        Ok(&*self.lookahead.insert(upcoming))
    }

    /// Returns the type of the next token without consuming it.
    ///
    /// # Errors
    /// Propagates any error from `peek`.
    pub(super) fn peek_type(&mut self) -> Result<TokenType> {
        self.peek().map(|token| token.typ)
    }

    /// Reports whether the next token has type `expected_type`, without consuming it.
    ///
    /// # Errors
    /// Propagates any error from `peek`.
    pub(super) fn check_type(&mut self, expected_type: TokenType) -> Result<bool> {
        let actual_type = self.peek_type()?;
        Ok(actual_type == expected_type)
    }

    /// Consumes and returns the next token only if it has type `expected_type`.
    ///
    /// Returns `Ok(None)`, leaving the stream untouched, when the type differs.
    ///
    /// # Errors
    /// Propagates any error from `peek` or `next`.
    pub(super) fn try_pop(&mut self, expected_type: TokenType) -> Result<Option<Token>> {
        if !self.check_type(expected_type)? {
            return Ok(None);
        }
        self.next().map(Some)
    }

    /// Converts the byte offset `pos` into a `(line, column)` pair via the lexer's line table.
    #[must_use]
    pub(super) fn line_and_column(&self, pos: usize) -> (usize, usize) {
        let lexer = &self.lexer;
        lexer.line_and_col(pos)
    }

    /// Returns the start offset of the buffered lookahead token, or the lexer's current
    /// offset when nothing is buffered.
    #[must_use]
    pub(super) fn pos(&self) -> usize {
        self.lookahead
            .as_ref()
            .map_or(self.lexer.pos, |token| token.start)
    }

    /// Returns the slice of the source text selected by `index`.
    ///
    /// # Panics
    /// Panics under the same conditions as indexing a `str` with `index`.
    #[must_use]
    pub(super) fn substring(&self, index: impl SliceIndex<str, Output = str>) -> &'a str {
        let text: &'a str = self.lexer.source;
        &text[index]
    }
}
impl<'a> Lexer<'a> {
/// Creates a lexer positioned at the start of `source`.
#[must_use]
pub(super) fn new(source: &'a str) -> Self {
    Self {
        pos: 0,
        // The first line always starts at byte offset 0.
        linebreaks: vec![0],
        iter: source.char_indices().peekable(),
        source,
    }
}
/// Lexes and returns the next token, skipping whitespace and comments.
///
/// Returns an `EndOfFile` token once the input is exhausted. An opening parenthesis that is
/// preceded by a line break (since the previous token) is reported as `LParenLineStart`
/// instead of `LParen`; line breaks that end a line comment or occur inside a block comment
/// count as well.
///
/// Comments are skipped in a loop rather than by re-entering this function, so the
/// "line break seen" state survives any number of consecutive comments and the call depth
/// does not grow with the number of comments.
///
/// # Errors
/// Returns `SyntaxError::InvalidCharacter` for a character that cannot start a token, and
/// propagates the errors of the string, number and operator helpers.
///
/// # Panics
/// Panics when `[` is directly followed by `=` or `[`, because long strings are not
/// supported yet.
#[hotpath::measure]
pub(super) fn next_token(&mut self) -> Result<Token> {
    // True once a line break has been crossed since the previous token, whether in plain
    // whitespace or while skipping a comment.
    let mut starts_line = false;
    loop {
        starts_line |= self.consume_whitespace();
        let tok_start = self.pos;
        let Some(first_char) = self.next_char() else {
            return Ok(self.end_of_file());
        };
        let tok_type = match first_char {
            '+' => Plus,
            '*' => Star,
            '/' => Slash,
            '%' => Mod,
            '^' => Caret,
            '#' => Hash,
            ';' => Semi,
            ':' => Colon,
            ',' => Comma,
            '(' if starts_line => LParenLineStart,
            '(' => LParen,
            ')' => RParen,
            '{' => LCurly,
            '}' => RCurly,
            ']' => RSquare,
            '.' => self.peek_dot(tok_start)?,
            '=' | '<' | '>' | '~' => self.peek_equals(tok_start, first_char)?,
            '-' => {
                if self.try_next('-') {
                    // A comment produces no token: skip it and lex whatever follows.
                    starts_line |= self.skip_comment();
                    continue;
                }
                Minus
            }
            '\'' | '\"' => self.lex_string(first_char, tok_start)?,
            '[' => {
                if let Some('=' | '[') = self.peek_char() {
                    panic!("Long strings are not supported yet.");
                } else {
                    LSquare
                }
            }
            '0'..='9' => self.lex_full_number(tok_start, first_char)?,
            'a'..='z' | 'A'..='Z' | '_' => self.lex_word(first_char),
            _ => return Err(self.error(SyntaxError::InvalidCharacter(first_char))),
        };
        // NOTE: `as` truncates silently if a single token spans more than `u32::MAX` bytes.
        let len = (self.pos - tok_start) as u32;
        return Ok(Token {
            typ: tok_type,
            start: tok_start,
            len,
        });
    }
}

/// Skips the rest of a comment whose leading `--` has already been consumed.
///
/// `--[[` opens a block comment that runs to the next `]]`, or to the end of input when it
/// is never closed. Anything else (including `--[` not followed by a second `[`, and
/// therefore also leveled brackets such as `--[==[`) is a line comment that runs through
/// the next `'\n'` or to the end of input.
///
/// Returns `true` if at least one line break was consumed while skipping.
fn skip_comment(&mut self) -> bool {
    if self.try_next('[') && self.try_next('[') {
        let mut crossed_newline = false;
        while let Some(c) = self.next_char() {
            match c {
                '\n' => crossed_newline = true,
                // The guard consumes the second `]` only when it is actually there.
                ']' if self.try_next(']') => break,
                _ => {}
            }
        }
        return crossed_newline;
    }
    while let Some(c) = self.next_char() {
        if c == '\n' {
            return true;
        }
    }
    false
}
/// Returns the next character without consuming it, or `None` at end of input.
#[must_use]
fn peek_char(&mut self) -> Option<char> {
    let &(_, upcoming) = self.iter.peek()?;
    Some(upcoming)
}
/// Consumes and returns the next character, or `None` at end of input.
///
/// Advances `pos` to the byte just past the character and, for `'\n'`, records that offset
/// as the start of a new line.
fn next_char(&mut self) -> Option<char> {
    let (offset, ch) = self.iter.next()?;
    self.pos = offset + ch.len_utf8();
    if ch == '\n' {
        self.linebreaks.push(self.pos);
    }
    Some(ch)
}
/// Consumes a run of ASCII whitespace and reports whether it contained a `'\n'`.
fn consume_whitespace(&mut self) -> bool {
    let mut saw_newline = false;
    while let Some(space) = self.peek_char().filter(char::is_ascii_whitespace) {
        saw_newline |= space == '\n';
        self.next_char();
    }
    saw_newline
}
/// Consumes the next character only if it equals `expected`; returns whether it did.
fn try_next(&mut self, expected: char) -> bool {
    let matched = self.peek_char() == Some(expected);
    if matched {
        self.next_char();
    }
    matched
}
#[must_use]
fn error(&self, kind: SyntaxError) -> Error {
let (line_num, column) = self.line_and_col(self.pos);
Error::new(kind, line_num, column)
}
/// Classifies a token that began with an already-consumed `.`.
///
/// Yields `DotDotDot` for `...`, `DotDot` for `..`, `LiteralNumber` when a digit follows
/// (a number with no integer part), and `Dot` otherwise. `tok_start` is forwarded to the
/// number-lexing helper.
///
/// # Errors
/// Propagates any error from lexing the fractional number.
fn peek_dot(&mut self, tok_start: usize) -> Result<TokenType> {
    if self.try_next('.') {
        let dots = if self.try_next('.') { DotDotDot } else { DotDot };
        return Ok(dots);
    }
    if self.peek_char().is_some_and(|c| c.is_ascii_digit()) {
        self.next_char();
        self.lex_number_after_decimal(tok_start)?;
        return Ok(LiteralNumber);
    }
    Ok(Dot)
}
fn peek_equals(&mut self, _tok_start: usize, first_char: char) -> Result<TokenType> {
if self.try_next('=') {
let typ = match first_char {
'=' => Equal,
'~' => NotEqual,
'<' => LessEqual,
'>' => GreaterEqual,
_ => panic!("peek_equals was called with first_char = {first_char}"),
};
Ok(typ)
} else {
match first_char {
'=' => Ok(Assign),
'<' => Ok(Less),
'>' => Ok(Greater),
'~' => Err(self.error(SyntaxError::InvalidCharacter(first_char))),
_ => panic!("peek_equals was called with first_char = {first_char}"),
}
}
}
/// Lexes the remainder of a quoted string whose opening `quote` has already been consumed.
///
/// The string ends at the next unescaped `quote`. A backslash causes the character after it
/// to be skipped without inspection, so an escaped quote or an escaped line break does not
/// end the string.
///
/// # Errors
/// Returns `SyntaxError::UnclosedString` when an unescaped line break or the end of input
/// is reached first. The line break is deliberately left unconsumed so that the error is
/// located on the line containing the string rather than at the start of the following line.
fn lex_string(&mut self, quote: char, _tok_start: usize) -> Result<TokenType> {
    while let Some(c) = self.peek_char() {
        if c == '\n' {
            // Do not consume the newline: `error` reports the current offset, and consuming
            // it would move that offset onto the next line.
            return Err(self.error(SyntaxError::UnclosedString));
        }
        self.next_char();
        if c == quote {
            return Ok(LiteralString);
        }
        if c == '\\' {
            // Skip the escaped character, whatever it is.
            self.next_char();
        }
    }
    Err(self.error(SyntaxError::UnclosedString))
}
/// Lexes a numeric literal whose first digit `first_char` has already been consumed.
///
/// A `0x`/`0X` prefix yields `LiteralHexNumber`, which consists of the prefix and one or
/// more hex digits and stops at the first non-hex character. Anything else yields
/// `LiteralNumber`: digits, an optional `.` with optional fraction digits, and an optional
/// exponent.
///
/// # Errors
/// Returns `SyntaxError::BadNumber` when no hex digit follows the hex prefix, and
/// propagates any error from the exponent helper.
fn lex_full_number(&mut self, tok_start: usize, first_char: char) -> Result<TokenType> {
    if first_char == '0' && (self.try_next('x') || self.try_next('X')) {
        // At least one hex digit must follow the prefix.
        match self.next_char() {
            Some(c) if c.is_ascii_hexdigit() => {}
            _ => return Err(self.error(SyntaxError::BadNumber)),
        }
        // The loop only stops on a non-hex character or end of input, so no further
        // validation of the following character is possible or needed here.
        while self.peek_char().is_some_and(|c| c.is_ascii_hexdigit()) {
            self.next_char();
        }
        return Ok(LiteralHexNumber);
    }

    self.lex_digits();
    if self.try_next('.') {
        // Fraction digits are optional; the helper consumes zero or more of them and then
        // the exponent.
        self.lex_number_after_decimal(tok_start)?;
    } else {
        self.lex_exponent(tok_start)?;
    }
    Ok(LiteralNumber)
}
/// Lexes the part of a number that follows the decimal point: zero or more digits and then
/// an optional exponent.
///
/// # Errors
/// Propagates any error from the exponent helper.
fn lex_number_after_decimal(&mut self, tok_start: usize) -> Result<()> {
    // Fraction digits first, then whatever exponent follows them.
    self.lex_digits();
    self.lex_exponent(tok_start)?;
    Ok(())
}
/// Consumes a (possibly empty) run of ASCII decimal digits.
fn lex_digits(&mut self) {
    while self.peek_char().is_some_and(|ch| ch.is_ascii_digit()) {
        self.next_char();
    }
}
fn lex_exponent(&mut self, _tok_start: usize) -> Result<()> {
if self.try_next('E') || self.try_next('e') {
if let Some(c) = self.peek_char()
&& (c == '+' || c == '-')
{
self.next_char();
}
self.lex_digits();
}
match self.peek_char() {
Some(c) if c.is_ascii_hexdigit() => Err(self.error(SyntaxError::BadNumber)),
_ => Ok(()),
}
}
/// Lexes an identifier or keyword whose first character `first_char` has just been
/// consumed, and returns the matching keyword type or `Identifier`.
///
/// The word is read as a slice of `source` instead of being copied into a temporary
/// `String`, so no allocation happens per word.
fn lex_word(&mut self, first_char: char) -> TokenType {
    // `pos` sits just past `first_char`, so stepping back by its encoded length gives the
    // byte offset at which the word starts.
    let word_start = self.pos - first_char.len_utf8();
    while let Some(c) = self.peek_char() {
        if c.is_ascii_alphanumeric() || c == '_' {
            self.next_char();
        } else {
            break;
        }
    }
    keyword_match(&self.source[word_start..self.pos])
}
/// Converts the byte offset `pos` into a 1-based `(line, column)` pair.
///
/// The column is counted in bytes from the start of the line. Only line breaks consumed so
/// far are known, so an offset beyond them is attributed to the last known line.
///
/// # Panics
/// Panics if the line table is empty, which cannot happen for a lexer built with `new`.
#[must_use]
fn line_and_col(&self, pos: usize) -> (usize, usize) {
    // `linebreaks` is strictly increasing (offsets are pushed as input is consumed), so the
    // number of line starts at or before `pos` is found by binary search; that count is the
    // 1-based line number.
    let line_num = self
        .linebreaks
        .partition_point(|&line_start| line_start <= pos);
    let line_start = *self.linebreaks[..line_num]
        .last()
        .expect("lexer always stores the first line start");
    (line_num, pos - line_start + 1)
}
/// Builds the zero-length `EndOfFile` token located at the lexer's current offset.
#[must_use]
const fn end_of_file(&self) -> Token {
    let start = self.pos;
    Token {
        typ: TokenType::EndOfFile,
        start,
        len: 0,
    }
}
}
/// Maps a word to its keyword token type, or to `Identifier` when it is not a keyword.
///
/// Matching is exact and case-sensitive.
#[must_use]
fn keyword_match(s: &str) -> TokenType {
    // Keywords are pure ASCII, so comparing raw bytes is equivalent to comparing the text.
    match s.as_bytes() {
        b"and" => And,
        b"break" => Break,
        b"do" => Do,
        b"else" => Else,
        b"elseif" => ElseIf,
        b"end" => End,
        b"false" => False,
        b"for" => For,
        b"function" => Function,
        b"if" => If,
        b"in" => In,
        b"local" => Local,
        b"nil" => Nil,
        b"not" => Not,
        b"or" => Or,
        b"repeat" => Repeat,
        b"return" => Return,
        b"then" => Then,
        b"true" => True,
        b"until" => Until,
        b"while" => While,
        _ => Identifier,
    }
}
#[cfg(test)]
mod tests {
use super::*;
/// Lexes `input` to the end and asserts that the produced tokens equal `tokens`
/// (given as `(type, start, len)` triples) and that the recorded line starts equal `lines`.
fn check(input: &str, tokens: &[(TokenType, usize, u32)], lines: &[usize]) {
let mut lexer = Lexer::new(input);
let mut tokens = tokens
.iter()
.map(|&(typ, start, len)| Token { typ, start, len });
loop {
let actual = lexer.next_token().unwrap();
// The terminating `EndOfFile` token is not part of the expected list.
if actual.typ == TokenType::EndOfFile {
break;
}
let expected = tokens.next().unwrap();
assert_eq!(expected, actual);
}
// Every expected token must have been produced.
assert!(tokens.next().is_none());
assert_eq!(lines, lexer.linebreaks.as_slice());
}
/// Like `check`, for input without line breaks: only the initial line start is expected.
fn check_line(input: &str, tokens: &[(TokenType, usize, u32)]) {
check(input, tokens, &[0]);
}
// A lone decimal number.
#[test]
fn test_lexer01() {
let tokens = &[(LiteralNumber, 0, 2)];
check_line("50", tokens);
}
// Identifier, number and keyword separated by spaces.
#[test]
fn test_lexer02() {
let input = "hi 4 false";
let tokens = &[(Identifier, 0, 2), (LiteralNumber, 3, 1), (False, 5, 5)];
check_line(input, tokens);
}
// Digits inside a word belong to the identifier.
#[test]
fn test_lexer03() {
let input = "hi5";
let tokens = &[(Identifier, 0, 3)];
check_line(input, tokens);
}
// A binary operator between two numbers.
#[test]
fn test_lexer04() {
let input = "5 + 5";
let tokens = &[(LiteralNumber, 0, 1), (Plus, 2, 1), (LiteralNumber, 4, 1)];
check_line(input, tokens);
}
// Keyword operator and a trailing semicolon.
#[test]
fn test_lexer05() {
let input = "print 5 or 6;";
let tokens = &[
(Identifier, 0, 5),
(LiteralNumber, 6, 1),
(Or, 8, 2),
(LiteralNumber, 11, 1),
(Semi, 12, 1),
];
check_line(input, tokens);
}
// Assignment of a table constructor.
#[test]
fn test_lexer06() {
let input = "t = {x = 3}";
let tokens = &[
(Identifier, 0, 1),
(Assign, 2, 1),
(LCurly, 4, 1),
(Identifier, 5, 1),
(Assign, 7, 1),
(LiteralNumber, 9, 1),
(RCurly, 10, 1),
];
check_line(input, tokens);
}
// A hex literal stops at the first non-hex character; the rest lexes as an identifier.
#[test]
fn test_lexer07() {
let input = "0x5rad";
let tokens = &[(LiteralHexNumber, 0, 3), (Identifier, 3, 3)];
check_line(input, tokens);
}
// Table constructor with a trailing comma.
#[test]
fn test_lexer08() {
let input = "print {x = 5,}";
let tokens = &[
(Identifier, 0, 5),
(LCurly, 6, 1),
(Identifier, 7, 1),
(Assign, 9, 1),
(LiteralNumber, 11, 1),
(Comma, 12, 1),
(RCurly, 13, 1),
];
check_line(input, tokens);
}
// Two lines: a line start is recorded after each newline.
#[test]
fn test_lexer09() {
let input = "print()\nsome_other_function(an_argument)\n";
let tokens = &[
(Identifier, 0, 5),
(LParen, 5, 1),
(RParen, 6, 1),
(Identifier, 8, 19),
(LParen, 27, 1),
(Identifier, 28, 11),
(RParen, 39, 1),
];
let linebreaks = &[0, 8, 41];
check(input, tokens, linebreaks);
}
// Blank lines are recorded as line starts too.
#[test]
fn test_lexer10() {
let input = "\n\n2\n456\n";
let tokens = &[(LiteralNumber, 2, 1), (LiteralNumber, 4, 3)];
let linebreaks = &[0, 1, 2, 4, 8];
check(input, tokens, linebreaks);
}
// Line comments, at the start of a line and after code, produce no tokens.
#[test]
fn test_lexer11() {
let input = "-- basic test\nprint('hi' --comment\n )\n";
let tokens = &[
(Identifier, 14, 5),
(LParen, 19, 1),
(LiteralString, 20, 4),
(RParen, 36, 1),
];
let linebreaks = &[0, 14, 35, 38];
check(input, tokens, linebreaks);
}
// An opening parenthesis that is first on its line is reported as `LParenLineStart`.
#[test]
fn test_lexer12() {
let input = "print()\n(some_other_function)(an_argument)\n";
let tokens = &[
(Identifier, 0, 5),
(LParen, 5, 1),
(RParen, 6, 1),
(LParenLineStart, 8, 1),
(Identifier, 9, 19),
(RParen, 28, 1),
(LParen, 29, 1),
(Identifier, 30, 11),
(RParen, 41, 1),
];
let linebreaks = &[0, 8, 43];
check(input, tokens, linebreaks);
}
}