dark_vm/
lexer.rs

//! The Lexer struct tokenizes the input and returns a VecDeque of Tokens.
//! The lexer may return an error early if it cannot lex a specific character.
3//!
4//! The lexer must be the first thing that is invoked because it generates the tokens necessary for the VM.
5//!
6//! # Example
7//! ```
8//! # fn run() -> Result<(), Error> {
9//! let contents = "push 1";
10//! let tokens = Lexer::default().lex(contents)?;
11//! # Ok(())
12//! # }
13//! ```
14
15use crate::{
16    errors::{error::Error, error_kind::ErrorKind},
17    tokens::{token::Token, token_kind::TokenKind},
18};
19
20use std::{collections::VecDeque, iter::Peekable, str::Chars};
21
/// Turns source text into a queue of tokens.
///
/// The lexer keeps a running count of the characters it has consumed so that every token and
/// error it produces can carry a position in the input. A default-constructed lexer starts
/// at position 0.
#[derive(Debug, Default)]
pub struct Lexer {
    /// Number of characters consumed so far. It is incremented once per character read, so
    /// the first character of the input is reported at position 1 for a fresh lexer.
    current_position: usize,
}
26
27impl Lexer {
28    /// This function lexes the input and returns either a VecDeque of tokens or an error.
29    /// The return value of this function may change to returning a vector of errors.
30    ///
31    /// # Arguments
32    /// * `contents` - The contents to lex. This may come from a file or from the REPL.
33    pub fn lex(&mut self, contents: &str) -> Result<VecDeque<Token>, Error> {
34        let mut iter = contents.chars().peekable();
35        let mut tokens = VecDeque::new();
36        while let Some(ch) = iter.next() {
37            self.current_position += 1;
38
39            // If the current character is a whitespace or a comment, handle it, and continue lexing.
40            if ch.is_ascii_whitespace() || self.handle_comments(ch, &mut iter) {
41                continue;
42            }
43
44            // Identify what the character is and try to lex as much of it as possible.
45            match ch {
46                '0'..='9' | '-' => tokens.push_back(self.make_number(ch, &mut iter)?),
47                '\'' | '"' => tokens.push_back(self.make_string(ch, &mut iter)?),
48                '@' => tokens.push_back(self.make_label(&mut iter)?),
49                letter if ch.is_ascii_alphabetic() || ch == '_' => {
50                    tokens.push_back(self.make_word(letter, &mut iter))
51                }
52                _ => {
53                    return Err(Error::new(
54                        ErrorKind::UnknownCharacter,
55                        self.current_position,
56                    ))
57                }
58            }
59        }
60
61        Ok(tokens)
62    }
63
64    /// This function produces an int, a float, or an error.
65    ///
66    /// # Arguments
67    /// * `digit` - The first character of the number. This may also be a negative sign.
68    /// * `iter` - The iterator which contains all of the characters.
69    fn make_number(&mut self, digit: char, iter: &mut Peekable<Chars>) -> Result<Token, Error> {
70        let initial_point = self.current_position;
71        let mut number = digit.to_string();
72        let mut has_decimal_point = false;
73        while let Some(ch) = iter.peek() {
74            // After the value of the character has been identified, it is important to remember to advance the iterator.
75            // Otherwise, an infinite loop will be generated.
76            if ch.is_ascii_digit() {
77                number.push(self.advance(iter));
78            } else if ch == &'.' && !has_decimal_point {
79                number.push(self.advance(iter));
80                has_decimal_point = true;
81            } else {
82                break;
83            }
84        }
85
86        // If it does not have a decimal point, it must be an integer.
87        if !has_decimal_point {
88            if let Ok(value) = number.parse() {
89                Ok(Token::new(TokenKind::IntegerLiteral(value), initial_point))
90            } else {
91                Err(Error::new(
92                    ErrorKind::InvalidNumberFormat,
93                    self.current_position,
94                ))
95            }
96        } else if let Ok(value) = number.parse() {
97            Ok(Token::new(TokenKind::FloatLiteral(value), initial_point))
98        } else {
99            Err(Error::new(
100                ErrorKind::InvalidNumberFormat,
101                self.current_position,
102            ))
103        }
104    }
105
106    /// This function produces an instruction, identifier, a special value, or a boolean. This funtion always succeeds because a word is always an identifier.
107    ///
108    /// # Arguments
109    /// * `letter` - The first letter of the word.
110    /// * `iter` - The iterator which contains all of the characters.
111    fn make_word(&mut self, letter: char, iter: &mut Peekable<Chars>) -> Token {
112        let initial_point = self.current_position;
113        let mut word = letter.to_string();
114        while let Some(ch) = iter.peek() {
115            if ch.is_ascii_whitespace() {
116                self.advance(iter);
117                break;
118            } else {
119                word.push(self.advance(iter));
120            }
121        }
122
123        // This probably could be written using a match statement.
124        match word.to_ascii_lowercase().as_str() {
125            "void" => Token::new(TokenKind::Void, initial_point),
126            "any" => Token::new(TokenKind::Any, initial_point),
127            "true" => Token::new(TokenKind::BooleanLiteral(true), initial_point),
128            "false" => Token::new(TokenKind::BooleanLiteral(false), initial_point),
129            "end" => Token::new(TokenKind::End, initial_point),
130            instr @ _ => {
131                if let Some(instruction) = TokenKind::is_instruction(instr) {
132                    Token::new(instruction, initial_point)
133                } else {
134                    Token::new(TokenKind::Identifier(word), initial_point)
135                }
136            }
137        }
138    }
139
140    /// This function produces a string or an error.
141    ///
142    /// # Arguments
143    /// * `beginning_of_string` - The first opening quote used to begin the string. This could be ' or ".
144    /// * `iter` - The iterator which contains all of the characters.
145    fn make_string(
146        &mut self,
147        beginning_of_string: char,
148        iter: &mut Peekable<Chars>,
149    ) -> Result<Token, Error> {
150        let initial_point = self.current_position;
151        let mut string = String::new();
152        let mut is_terminated = false;
153        while let Some(ch) = iter.peek() {
154            if ch == &beginning_of_string {
155                self.advance(iter);
156                is_terminated = true;
157                break;
158            } else {
159                string.push(self.advance(iter));
160            }
161        }
162
163        // If the string does not end with the same quote used to open it, the function returns an error.
164        if !is_terminated {
165            Err(Error::new(ErrorKind::UnterminatedString, initial_point))
166        } else {
167            Ok(Token::new(TokenKind::StringLiteral(string), initial_point))
168        }
169    }
170
171    /// This function produces a label or an error.
172    ///
173    /// # Arguments
174    /// * `iter` - The iterator which contains all of the characters.
175    fn make_label(&mut self, iter: &mut Peekable<Chars>) -> Result<Token, Error> {
176        let initial_point = self.current_position;
177        let mut label = String::new();
178        while let Some(ch) = iter.peek() {
179            if ch.is_ascii_whitespace() {
180                break;
181            } else {
182                label.push(self.advance(iter));
183            }
184        }
185
186        if label.is_empty() {
187            Err(Error::new(ErrorKind::InvalidLabelName, initial_point))
188        } else {
189            Ok(Token::new(TokenKind::Label(label), initial_point))
190        }
191    }
192
193    /// This function handles comments. This function returns whether or not it found a commment and handled it.
194    ///
195    /// # Arguments
196    /// * `ch` - The current character the lexer is looking at.
197    /// * `iter` - The iterator which contains all of the characters.
198    fn handle_comments(&mut self, ch: char, iter: &mut Peekable<Chars>) -> bool {
199        if ch == '-' {
200            match iter.peek() {
201                Some('-') => {
202                    self.handle_single_line_comments(iter);
203                    true
204                }
205                Some('!') => {
206                    self.handle_multi_line_comments(iter);
207                    true
208                }
209                _ => false,
210            }
211        } else {
212            false
213        }
214    }
215
216    /// This function handles single line comments.
217    ///
218    /// # Arguments
219    /// * `iter` - The iterator which contains all of the characters.
220    fn handle_single_line_comments(&mut self, iter: &mut Peekable<Chars>) {
221        self.advance(iter);
222        for c in iter {
223            self.current_position += 1;
224            if c == '\n' {
225                break;
226            }
227        }
228    }
229
230    /// This function handles multiline comments.
231    ///
232    /// # Arguments
233    /// * `iter` - The iterator which contains all of the characters.
234    fn handle_multi_line_comments(&mut self, iter: &mut Peekable<Chars>) {
235        self.advance(iter);
236        while let Some(c) = iter.next() {
237            self.current_position += 1;
238            if c == '!' {
239                if let Some('-') = iter.peek() {
240                    self.advance(iter);
241                    break;
242                }
243            }
244        }
245    }
246
247    /// This function increments the current position and returns the next character.
248    /// The bounds check was already performed by the loops, so there is no need to return an option.
249    ///
250    /// # Arguments
251    /// * `iter` - The iterator which contains all of the characters.
252    fn advance(&mut self, iter: &mut Peekable<Chars>) -> char {
253        self.current_position += 1;
254        iter.next().unwrap()
255    }
256}