// bcomp 0.1.0 - Docs.rs

//! The lexer takes the program's source code and turns it into tokens.
//!
//! The lexer is the first module called at entry, and is the first thing to touch the input source code.
//! TODO: Fill out the rest of the required Rust documentation, such as examples.

use std::{cell::Cell, vec};

// We'll need to establish some kind of token system
pub mod token;

/// Lexer struct
///
/// We need a lifetime for the input string.
/// I wonder if it would be better to create a shared reference the main program is responsible for deleting
/// Maybe with like RC<str> or something
pub struct Lexer<'a> {
    input: &'a str,
    size: Cell<usize>,
    location: Cell<usize>,
}

#[allow(dead_code, unused_variables)]
impl<'a> Lexer<'a> {
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            size: input.chars().count().into(),
            location: 0.into(),
        }
    }

    /// Parses the tokens of the input into a vector of tokens
    pub fn parse_tokens(&self) -> Vec<token::Token> {
        let mut tokens = vec![];

        while self.location.get() < self.size.get() {
            let tok = self.next_token();
            tokens.push(tok);
        }

        tokens
    }

    /// Reads the next token and increments the current location
    ///
    /// If used while the location is out of bounds, the function returns 0x00
    fn read_char(&self) -> char {
        if self.location.get() < self.size.get() {
            let char = self.input.chars().nth(self.location.get()).unwrap();
            self.location.set(self.location.get() + 1);
            char // returns char
        } else {
            '\0'
        }
    }

    /// Read the next token
    ///
    #[expect(
        clippy::too_many_lines,
        reason = "token dispatch is intentionally kept together while the lexer is small"
    )]
    pub fn next_token(&self) -> token::Token {
        // Skip whitespace
        while self.peek_char().is_whitespace() {
            self.read_char();
        }
        while self.peek_char() == '\n' {
            self.read_char();
        }

        let ch = self.peek_char();

        match ch {
            // Example: single-character tokens
            ';' => {
                self.read_char();
                token::Token::Semicolon
            }
            ',' => {
                self.read_char();
                token::Token::Comma
            }
            ':' => {
                self.read_char();
                token::Token::Colon
            }
            '=' => {
                self.read_char();
                token::Token::Equal
            }
            '<' => {
                self.read_char();
                match self.peek_char() {
                    '=' => {
                        self.read_char();
                        token::Token::LessEqual
                    }
                    '>' => {
                        self.read_char();
                        token::Token::NotEqual
                    }
                    _ => token::Token::Less,
                }
            }
            '>' => {
                self.read_char();
                if self.peek_char() == '=' {
                    self.read_char();
                    token::Token::GreaterEqual
                } else {
                    token::Token::Greater
                }
            }
            '+' => {
                self.read_char();
                token::Token::Plus
            }
            '-' => {
                self.read_char();
                token::Token::Minus
            }
            '*' => {
                self.read_char();
                token::Token::Asterisk
            }
            '/' => {
                self.read_char();
                token::Token::Slash
            }
            '(' => {
                self.read_char();
                token::Token::LParen
            }
            ')' => {
                self.read_char();
                token::Token::RParen
            }
            '\0' => token::Token::Eof,
            '\"' => {
                let mut str_lit_char = self.read_char();
                let mut str_literal: String = String::new();
                str_literal.push(str_lit_char);
                str_lit_char = self.read_char();
                while str_lit_char != '\"' {
                    str_literal.push(str_lit_char);
                    if self.size == self.location {
                        return token::Token::InvalidLit(str_literal);
                    }
                    str_lit_char = self.read_char();
                }
                str_literal.push(str_lit_char);
                token::Token::String(str_literal)
            }
            '\'' => {
                let mut ch_lit_char = self.read_char();
                let mut ch_literal: String = String::new();
                ch_literal.push(ch_lit_char);
                ch_lit_char = self.read_char();
                while ch_lit_char != '\'' {
                    ch_literal.push(ch_lit_char);
                    if self.size == self.location {
                        return token::Token::InvalidLit(ch_literal);
                    }
                    ch_lit_char = self.read_char();
                }
                ch_literal.push(ch_lit_char);
                if ch_literal.chars().count() > 3 || ch_literal.chars().count() < 3 {
                    return token::Token::InvalidLit(ch_literal);
                }
                token::Token::Char(ch_literal.chars().nth(1).unwrap())
            }
            _ => {
                // Example: identifier or number
                if ch.is_alphabetic() {
                    let ident: String = self.read_identifier();
                    //checks if the token is a keyword
                    match ident.as_str() {
                        "print" => token::Token::Print,
                        "let" => token::Token::Let,
                        "if" => token::Token::If,
                        "then" => token::Token::Then,
                        "else" => token::Token::Else,
                        "for" => token::Token::For,
                        "to" => token::Token::To,
                        "next" => token::Token::Next,
                        "goto" => token::Token::Goto,
                        "gosub" => token::Token::Gosub,
                        "return" => token::Token::Return,
                        "end" => token::Token::End,
                        "rem" => token::Token::Rem,
                        _ => token::Token::Identifier(ident),
                    }
                } else if ch.is_numeric() {
                    let num = self.read_number();
                    token::Token::Integer(num) // ARCHER: For now we'll just support integers, you should fix this
                } else {
                    self.read_char();
                    token::Token::Invalid(ch)
                }
            }
        }
    }

    /// Reads an identifier (e.g., variable/function name)
    fn read_identifier(&self) -> String {
        let mut ident = String::new();
        while self.peek_char().is_alphanumeric() || self.peek_char() == '_' {
            ident.push(self.read_char());
        }
        ident
    }

    /// Reads a number (integer only for now)
    fn read_number(&self) -> i64 {
        let mut num = String::new();
        while self.peek_char().is_numeric() {
            num.push(self.read_char());
        }
        num.parse().unwrap()
    }

    /// Take a look at the next token without incrementing the location.
    pub fn peek(&self) -> token::Token {
        let location = self.location.get();
        let token = self.next_token();
        self.location.set(location);
        token
    }

    /// Take a look at the next character without incrimenting the location
    ///
    /// If used while the location is out of bounds, the function returns 0x00
    fn peek_char(&self) -> char {
        if self.location.get() < self.size.get() {
            self.input.chars().nth(self.location.get()).unwrap()
        } else {
            '\0'
        }
    }
}

#[cfg(test)]
mod test {
    use crate::lexer::{Lexer, token::Token};
    use ntest::timeout;

    /// Checks that `tokens` begins with `expected`, one position at a time.
    ///
    /// Tokens after the expected prefix are ignored.
    fn assert_starts_with(tokens: &[Token], expected: &[Token]) {
        for (index, want) in expected.iter().enumerate() {
            assert_eq!(&tokens[index], want);
        }
    }

    /// Peeking at a character must not move the cursor; reading must.
    #[test]
    #[timeout(100)]
    fn test_peek_char() {
        let lexer = Lexer::new("Alphabet");

        for _ in 0..3 {
            assert_eq!(lexer.peek_char(), 'A');
        }
        assert_eq!(lexer.read_char(), 'A');

        assert_eq!(lexer.peek_char(), 'l');
    }

    /// Peeking at a token must leave it available for the following `next_token`.
    #[test]
    #[timeout(100)]
    fn test_peek_token() {
        let lexer = Lexer::new("print \"HELLO\";");
        let hello = Token::String(String::from("\"HELLO\""));

        assert_eq!(lexer.peek(), Token::Print);
        assert_eq!(lexer.peek(), Token::Print);
        assert_eq!(lexer.next_token(), Token::Print);
        assert_eq!(lexer.peek(), hello);
        assert_eq!(lexer.next_token(), hello);
        assert_eq!(lexer.next_token(), Token::Semicolon);
    }

    /// Leading whitespace skipped during a peek must not count as consumed.
    #[test]
    #[timeout(100)]
    fn test_peek_token_skips_whitespace_without_consuming() {
        let lexer = Lexer::new("   <= 10");

        for _ in 0..2 {
            assert_eq!(lexer.peek(), Token::LessEqual);
        }
        assert_eq!(lexer.next_token(), Token::LessEqual);
        assert_eq!(lexer.next_token(), Token::Integer(10));
    }

    /// Reading walks the input in order and yields NUL once it is exhausted.
    #[test]
    #[timeout(100)]
    fn test_read_char() {
        let lexer = Lexer::new("Alphabet");

        for expected in "Alphabet".chars() {
            assert_eq!(lexer.read_char(), expected);
        }
        for _ in 0..3 {
            assert_eq!(lexer.read_char(), '\0');
        }
    }

    /// An identifier directly followed by parentheses splits into three tokens.
    #[test]
    #[timeout(100)]
    fn test_main_fn() {
        let lexer = Lexer::new("main()");
        let tokens = lexer.parse_tokens();

        assert_starts_with(
            &tokens,
            &[
                Token::Identifier(String::from("main")),
                Token::LParen,
                Token::RParen,
            ],
        );
    }

    /// `next_token` yields identifiers and operators one call at a time.
    #[test]
    #[timeout(100)]
    fn test_next_token() {
        let lexer = Lexer::new("ALPHA + GAMMA");

        assert_eq!(lexer.next_token(), Token::Identifier(String::from("ALPHA")));
        assert_eq!(lexer.next_token(), Token::Plus);
        assert_eq!(lexer.next_token(), Token::Identifier(String::from("GAMMA")));
    }

    /// `parse_tokens` produces the same sequence as repeated `next_token` calls.
    #[test]
    #[timeout(100)]
    fn test_parse_tokens() {
        let lexer = Lexer::new("ALPHA + GAMMA");
        let tokens = lexer.parse_tokens();

        assert_starts_with(
            &tokens,
            &[
                Token::Identifier(String::from("ALPHA")),
                Token::Plus,
                Token::Identifier(String::from("GAMMA")),
            ],
        );
    }

    /// Comma, colon and semicolon separate identifiers without needing whitespace.
    #[test]
    #[timeout(100)]
    fn test_parse_separators() {
        let lexer = Lexer::new("A,B:C;");
        let tokens = lexer.parse_tokens();

        assert_starts_with(
            &tokens,
            &[
                Token::Identifier(String::from("A")),
                Token::Comma,
                Token::Identifier(String::from("B")),
                Token::Colon,
                Token::Identifier(String::from("C")),
                Token::Semicolon,
            ],
        );
    }

    /// One- and two-character comparison operators are told apart.
    #[test]
    #[timeout(100)]
    fn test_parse_comparison_operators() {
        let lexer = Lexer::new("= < > <= >= <>");
        let tokens = lexer.parse_tokens();

        assert_starts_with(
            &tokens,
            &[
                Token::Equal,
                Token::Less,
                Token::Greater,
                Token::LessEqual,
                Token::GreaterEqual,
                Token::NotEqual,
            ],
        );
    }

    /// String tokens keep their surrounding double quotes.
    #[test]
    #[timeout(100)]
    fn test_string_token() {
        let lexer = Lexer::new("\"ABCD\" \"Sheep\" \"LITERAL\"");
        let tokens = lexer.parse_tokens();

        assert_starts_with(
            &tokens,
            &[
                Token::String("\"ABCD\"".to_string()),
                Token::String("\"Sheep\"".to_string()),
                Token::String("\"LITERAL\"".to_string()),
            ],
        );
    }

    /// A one-character literal is a `Char`; a longer one is an invalid literal.
    #[test]
    #[timeout(100)]
    fn test_char_lit_token() {
        let lexer = Lexer::new("'a' 'ab'");
        let tokens = lexer.parse_tokens();

        assert_starts_with(
            &tokens,
            &[Token::Char('a'), Token::InvalidLit(String::from("'ab'"))],
        );
    }

    /// A string with no closing quote is reported as an invalid literal.
    #[test]
    #[timeout(100)]
    fn test_invalid_str_literal() {
        let lexer = Lexer::new("\"aaaaaaaa");
        let tokens = lexer.parse_tokens();

        let unterminated = Token::InvalidLit(String::from("\"aaaaaaaa"));
        assert_eq!(tokens[0], unterminated);
    }

    /// A character literal with no closing quote is reported as an invalid literal.
    #[test]
    #[timeout(100)]
    fn test_invalid_ch_literal() {
        let lexer = Lexer::new("'aaaaaaaa");
        let tokens = lexer.parse_tokens();

        let unterminated = Token::InvalidLit(String::from("\'aaaaaaaa"));
        assert_eq!(tokens[0], unterminated);
    }

    /// Every reserved word maps to its dedicated token.
    #[test]
    #[timeout(100)]
    fn test_parse_keywords() {
        let lexer = Lexer::new("print let if then else for to next goto gosub return end rem");
        let tokens = lexer.parse_tokens();

        assert_starts_with(
            &tokens,
            &[
                Token::Print,
                Token::Let,
                Token::If,
                Token::Then,
                Token::Else,
                Token::For,
                Token::To,
                Token::Next,
                Token::Goto,
                Token::Gosub,
                Token::Return,
                Token::End,
                Token::Rem,
            ],
        );
    }

    /// Digits followed by a letter lex as an integer and then an identifier.
    #[test]
    #[timeout(100)]
    fn test_invalid_num() {
        let lexer = Lexer::new("65e");
        let tokens = lexer.parse_tokens();

        assert_starts_with(
            &tokens,
            &[Token::Integer(65), Token::Identifier(String::from("e"))],
        );
    }

    /// Statements on separate lines lex as one continuous token stream.
    #[test]
    #[timeout(100)]
    fn lex_example_code() {
        let lexer = Lexer::new("1;\n2;");
        let tokens = lexer.parse_tokens();

        assert_starts_with(
            &tokens,
            &[
                Token::Integer(1),
                Token::Semicolon,
                Token::Integer(2),
                Token::Semicolon,
            ],
        );
    }

    /// Lexes the print fixture from disk and checks its first three tokens.
    #[test]
    #[timeout(100)]
    fn test_print() {
        // The fixture is read at test time; a missing or unreadable file fails the test here.
        let source = std::fs::read_to_string("./tests/input/print.bsc").unwrap();
        let lexer = Lexer::new(&source);
        let tokens = lexer.parse_tokens();

        assert_starts_with(
            &tokens,
            &[
                Token::Print,
                Token::String(String::from("\"HELLO WORLD\"")),
                Token::Semicolon,
            ],
        );
    }
}