use std::{cell::Cell, vec};
pub mod token;
/// Lexer over a borrowed source string that produces [`token::Token`] values.
///
/// The cursor is stored in `Cell`s so that every method can take `&self`;
/// [`Lexer::peek`] relies on this to rewind the cursor after a look-ahead.
pub struct Lexer<'a> {
    /// Source text being tokenised.
    input: &'a str,
    /// Length of `input` in bytes; the cursor is at the end once it reaches this value.
    size: Cell<usize>,
    /// Byte offset of the next unread character. Only ever advanced by whole
    /// characters, so it always sits on a `char` boundary of `input`.
    location: Cell<usize>,
}
// NOTE(review): blanket allow kept from the original code; presumably parts of
// the lexer API are not used by the rest of the crate yet -- confirm and narrow.
#[allow(dead_code, unused_variables)]
impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the start of `input`.
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            size: input.len().into(),
            location: 0.into(),
        }
    }

    /// Lexes tokens until the cursor reaches the end of the input.
    ///
    /// No `Eof` token is appended when the input ends directly after a token.
    /// When whitespace trails the last token, that whitespace is lexed as one
    /// final `Eof` token.
    pub fn parse_tokens(&self) -> Vec<token::Token> {
        let mut tokens = vec![];
        while !self.at_end() {
            tokens.push(self.next_token());
        }
        tokens
    }

    /// Returns `true` once every character of the input has been consumed.
    fn at_end(&self) -> bool {
        self.location.get() >= self.size.get()
    }

    /// Returns the not-yet-consumed tail of the input.
    fn remaining(&self) -> &'a str {
        // `get` (rather than indexing) cannot panic; the fallback is only
        // reachable if the cursor invariant were ever broken.
        self.input.get(self.location.get()..).unwrap_or("")
    }

    /// Consumes and returns the next character, or `'\0'` at the end of the
    /// input (in which case the cursor does not move).
    fn read_char(&self) -> char {
        match self.remaining().chars().next() {
            Some(ch) => {
                // Advance by the encoded width so the cursor stays on a char boundary.
                self.location.set(self.location.get() + ch.len_utf8());
                ch
            }
            None => '\0',
        }
    }

    /// Consumes one character and returns `tok`; used for tokens whose last
    /// character is still under the cursor.
    fn advance_with(&self, tok: token::Token) -> token::Token {
        self.read_char();
        tok
    }

    /// Consumes any run of whitespace (including newlines) under the cursor.
    fn skip_whitespace(&self) {
        while self.peek_char().is_whitespace() {
            self.read_char();
        }
    }

    /// Skips leading whitespace, then consumes and returns the next token.
    ///
    /// Returns `Eof` (without moving the cursor) when only whitespace was left.
    /// Characters that start no known token are consumed and returned as
    /// `Invalid`; this includes a NUL character embedded in the input, so the
    /// cursor always makes progress unless the input is exhausted.
    pub fn next_token(&self) -> token::Token {
        self.skip_whitespace();
        // Checked explicitly because `peek_char` reports end-of-input as '\0',
        // which is indistinguishable from a real NUL in the source text.
        if self.at_end() {
            return token::Token::Eof;
        }
        let ch = self.peek_char();
        match ch {
            ';' => self.advance_with(token::Token::Semicolon),
            ',' => self.advance_with(token::Token::Comma),
            ':' => self.advance_with(token::Token::Colon),
            '=' => self.advance_with(token::Token::Equal),
            '+' => self.advance_with(token::Token::Plus),
            '-' => self.advance_with(token::Token::Minus),
            '*' => self.advance_with(token::Token::Asterisk),
            '/' => self.advance_with(token::Token::Slash),
            '(' => self.advance_with(token::Token::LParen),
            ')' => self.advance_with(token::Token::RParen),
            '<' => {
                self.read_char();
                match self.peek_char() {
                    '=' => self.advance_with(token::Token::LessEqual),
                    '>' => self.advance_with(token::Token::NotEqual),
                    _ => token::Token::Less,
                }
            }
            '>' => {
                self.read_char();
                if self.peek_char() == '=' {
                    self.advance_with(token::Token::GreaterEqual)
                } else {
                    token::Token::Greater
                }
            }
            '"' => match self.read_quoted('"') {
                Ok(literal) => token::Token::String(literal),
                Err(partial) => token::Token::InvalidLit(partial),
            },
            '\'' => self.read_char_literal(),
            c if c.is_alphabetic() => Self::keyword_or_identifier(self.read_identifier()),
            c if c.is_ascii_digit() => self.read_number(),
            other => {
                self.read_char();
                token::Token::Invalid(other)
            }
        }
    }

    /// Reads a literal delimited by `quote`; the cursor must be on the opening quote.
    ///
    /// Returns `Ok` with the literal text including both quotes, or `Err` with
    /// everything consumed so far when the input ends before the closing quote.
    fn read_quoted(&self, quote: char) -> Result<String, String> {
        let mut literal = String::new();
        literal.push(self.read_char()); // opening quote
        while !self.at_end() {
            let ch = self.read_char();
            literal.push(ch);
            if ch == quote {
                return Ok(literal);
            }
        }
        Err(literal)
    }

    /// Reads a single-quoted character literal.
    ///
    /// Only literals of exactly three characters (quote, value, quote) become
    /// `Char`; anything else, including an unterminated literal, is `InvalidLit`.
    fn read_char_literal(&self) -> token::Token {
        match self.read_quoted('\'') {
            Ok(literal) => {
                let parts: Vec<char> = literal.chars().collect();
                match parts.as_slice() {
                    [_, value, _] => token::Token::Char(*value),
                    _ => token::Token::InvalidLit(literal),
                }
            }
            Err(partial) => token::Token::InvalidLit(partial),
        }
    }

    /// Maps a lexed word to its keyword token, or to `Identifier` when it is
    /// not a keyword. Keywords are matched in lower case only.
    fn keyword_or_identifier(ident: String) -> token::Token {
        match ident.as_str() {
            "print" => token::Token::Print,
            "let" => token::Token::Let,
            "if" => token::Token::If,
            "then" => token::Token::Then,
            "else" => token::Token::Else,
            "for" => token::Token::For,
            "to" => token::Token::To,
            "next" => token::Token::Next,
            "goto" => token::Token::Goto,
            "gosub" => token::Token::Gosub,
            "return" => token::Token::Return,
            "end" => token::Token::End,
            "rem" => token::Token::Rem,
            _ => token::Token::Identifier(ident),
        }
    }

    /// Consumes the longest run of alphanumeric characters and underscores.
    fn read_identifier(&self) -> String {
        let mut ident = String::new();
        loop {
            let ch = self.peek_char();
            if !(ch.is_alphanumeric() || ch == '_') {
                break;
            }
            ident.push(self.read_char());
        }
        ident
    }

    /// Consumes a run of ASCII digits and converts it to an `Integer` token.
    ///
    /// A digit run that does not fit in `i64` is returned as `InvalidLit`
    /// holding the digits, instead of panicking on source-text input.
    fn read_number(&self) -> token::Token {
        let mut digits = String::new();
        while self.peek_char().is_ascii_digit() {
            digits.push(self.read_char());
        }
        match digits.parse::<i64>() {
            Ok(value) => token::Token::Integer(value),
            Err(_) => token::Token::InvalidLit(digits),
        }
    }

    /// Returns the next token without consuming it (leading whitespace is
    /// not consumed either): the cursor is restored after the look-ahead.
    pub fn peek(&self) -> token::Token {
        let saved = self.location.get();
        let token = self.next_token();
        self.location.set(saved);
        token
    }

    /// Returns the next character without consuming it, or `'\0'` at the end
    /// of the input.
    fn peek_char(&self) -> char {
        self.remaining().chars().next().unwrap_or('\0')
    }
}
#[cfg(test)]
mod test {
    use crate::lexer::{Lexer, token};
    use ntest::timeout;

    /// Runs a fresh lexer over `source` and collects every token it yields.
    fn lex_all(source: &str) -> Vec<token::Token> {
        Lexer::new(source).parse_tokens()
    }

    /// Peeking a character repeatedly must not move the cursor.
    #[test]
    #[timeout(100)]
    fn test_peek_char() {
        let lexer = Lexer::new("Alphabet");
        for _ in 0..3 {
            assert_eq!(lexer.peek_char(), 'A');
        }
        assert_eq!(lexer.read_char(), 'A');
        assert_eq!(lexer.peek_char(), 'l');
    }

    /// Peeking a token must return the same token that is consumed next.
    #[test]
    #[timeout(100)]
    fn test_peek_token() {
        let lexer = Lexer::new("print \"HELLO\";");
        for _ in 0..2 {
            assert_eq!(lexer.peek(), token::Token::Print);
        }
        assert_eq!(lexer.next_token(), token::Token::Print);
        assert_eq!(lexer.peek(), token::Token::String("\"HELLO\"".to_string()));
        assert_eq!(
            lexer.next_token(),
            token::Token::String("\"HELLO\"".to_string())
        );
        assert_eq!(lexer.next_token(), token::Token::Semicolon);
    }

    /// Peeking looks past leading whitespace but consumes nothing.
    #[test]
    #[timeout(100)]
    fn test_peek_token_skips_whitespace_without_consuming() {
        let lexer = Lexer::new(" <= 10");
        for _ in 0..2 {
            assert_eq!(lexer.peek(), token::Token::LessEqual);
        }
        assert_eq!(lexer.next_token(), token::Token::LessEqual);
        assert_eq!(lexer.next_token(), token::Token::Integer(10));
    }

    /// Reading yields each character in order, then `'\0'` once exhausted.
    #[test]
    #[timeout(100)]
    fn test_read_char() {
        let lexer = Lexer::new("Alphabet");
        for expected in ['A', 'l', 'p', 'h', 'a', 'b', 'e', 't'] {
            assert_eq!(lexer.read_char(), expected);
        }
        for _ in 0..3 {
            assert_eq!(lexer.read_char(), '\0');
        }
    }

    #[test]
    #[timeout(100)]
    fn test_main_fn() {
        let tokens = lex_all("main()");
        assert_eq!(
            tokens[..3],
            [
                token::Token::Identifier("main".to_string()),
                token::Token::LParen,
                token::Token::RParen,
            ]
        );
    }

    #[test]
    #[timeout(100)]
    fn test_next_token() {
        let lexer = Lexer::new("ALPHA + GAMMA");
        let expected = [
            token::Token::Identifier("ALPHA".to_string()),
            token::Token::Plus,
            token::Token::Identifier("GAMMA".to_string()),
        ];
        for want in expected {
            assert_eq!(lexer.next_token(), want);
        }
    }

    #[test]
    #[timeout(100)]
    fn test_parse_tokens() {
        let tokens = lex_all("ALPHA + GAMMA");
        assert_eq!(
            tokens[..3],
            [
                token::Token::Identifier("ALPHA".to_string()),
                token::Token::Plus,
                token::Token::Identifier("GAMMA".to_string()),
            ]
        );
    }

    #[test]
    #[timeout(100)]
    fn test_parse_separators() {
        let tokens = lex_all("A,B:C;");
        assert_eq!(
            tokens[..6],
            [
                token::Token::Identifier("A".to_string()),
                token::Token::Comma,
                token::Token::Identifier("B".to_string()),
                token::Token::Colon,
                token::Token::Identifier("C".to_string()),
                token::Token::Semicolon,
            ]
        );
    }

    #[test]
    #[timeout(100)]
    fn test_parse_comparison_operators() {
        let tokens = lex_all("= < > <= >= <>");
        assert_eq!(
            tokens[..6],
            [
                token::Token::Equal,
                token::Token::Less,
                token::Token::Greater,
                token::Token::LessEqual,
                token::Token::GreaterEqual,
                token::Token::NotEqual,
            ]
        );
    }

    /// String tokens keep their surrounding double quotes.
    #[test]
    #[timeout(100)]
    fn test_string_token() {
        let tokens = lex_all("\"ABCD\" \"Sheep\" \"LITERAL\"");
        assert_eq!(
            tokens[..3],
            [
                token::Token::String(String::from("\"ABCD\"")),
                token::Token::String(String::from("\"Sheep\"")),
                token::Token::String(String::from("\"LITERAL\"")),
            ]
        );
    }

    /// A character literal holds exactly one character; longer ones are invalid.
    #[test]
    #[timeout(100)]
    fn test_char_lit_token() {
        let tokens = lex_all("'a' 'ab'");
        assert_eq!(
            tokens[..2],
            [
                token::Token::Char('a'),
                token::Token::InvalidLit("'ab'".to_string()),
            ]
        );
    }

    /// An unterminated string literal is reported with the text read so far.
    #[test]
    #[timeout(100)]
    fn test_invalid_str_literal() {
        let source = "\"aaaaaaaa";
        let tokens = lex_all(source);
        assert_eq!(tokens[0], token::Token::InvalidLit(source.to_string()));
    }

    /// An unterminated character literal is reported with the text read so far.
    #[test]
    #[timeout(100)]
    fn test_invalid_ch_literal() {
        let source = "'aaaaaaaa";
        let tokens = lex_all(source);
        assert_eq!(tokens[0], token::Token::InvalidLit(source.to_string()));
    }

    #[test]
    #[timeout(100)]
    fn test_parse_keywords() {
        let tokens = lex_all("print let if then else for to next goto gosub return end rem");
        assert_eq!(
            tokens[..13],
            [
                token::Token::Print,
                token::Token::Let,
                token::Token::If,
                token::Token::Then,
                token::Token::Else,
                token::Token::For,
                token::Token::To,
                token::Token::Next,
                token::Token::Goto,
                token::Token::Gosub,
                token::Token::Return,
                token::Token::End,
                token::Token::Rem,
            ]
        );
    }

    /// A number directly followed by a letter splits into two tokens.
    #[test]
    #[timeout(100)]
    fn test_invalid_num() {
        let tokens = lex_all("65e");
        assert_eq!(
            tokens[..2],
            [
                token::Token::Integer(65),
                token::Token::Identifier("e".to_string()),
            ]
        );
    }

    /// Newlines between statements are skipped like any other whitespace.
    #[test]
    #[timeout(100)]
    fn lex_example_code() {
        let tokens = lex_all("1;\n2;");
        assert_eq!(
            tokens[..4],
            [
                token::Token::Integer(1),
                token::Token::Semicolon,
                token::Token::Integer(2),
                token::Token::Semicolon,
            ]
        );
    }

    /// Lexes the `print.bsc` fixture read from disk.
    #[test]
    #[timeout(100)]
    fn test_print() {
        let source = std::fs::read_to_string("./tests/input/print.bsc").unwrap();
        let tokens = lex_all(&source);
        assert_eq!(
            tokens[..3],
            [
                token::Token::Print,
                token::Token::String("\"HELLO WORLD\"".to_string()),
                token::Token::Semicolon,
            ]
        );
    }
}