dark_vm/lexer.rs
//! The Lexer struct tokenizes the input and returns a VecDeque of Tokens.
//! Lexing stops early and returns an error if the lexer encounters a character it cannot lex.
3//!
4//! The lexer must be the first thing that is invoked because it generates the tokens necessary for the VM.
5//!
6//! # Example
7//! ```
8//! # fn run() -> Result<(), Error> {
9//! let contents = "push 1";
10//! let tokens = Lexer::default().lex(contents)?;
11//! # Ok(())
12//! # }
13//! ```
14
15use crate::{
16 errors::{error::Error, error_kind::ErrorKind},
17 tokens::{token::Token, token_kind::TokenKind},
18};
19
20use std::{collections::VecDeque, iter::Peekable, str::Chars};
21
/// Turns source text into a queue of tokens.
///
/// The only state is a running character count, which is used to stamp a position on every
/// token and error that the lexer produces. Create one with `Lexer::default()`.
#[derive(Debug, Default)]
pub struct Lexer {
    /// Number of characters consumed so far. It is incremented once per character read, so the
    /// first character of the input has position 1.
    current_position: usize,
}
26
27impl Lexer {
28 /// This function lexes the input and returns either a VecDeque of tokens or an error.
29 /// The return value of this function may change to returning a vector of errors.
30 ///
31 /// # Arguments
32 /// * `contents` - The contents to lex. This may come from a file or from the REPL.
33 pub fn lex(&mut self, contents: &str) -> Result<VecDeque<Token>, Error> {
34 let mut iter = contents.chars().peekable();
35 let mut tokens = VecDeque::new();
36 while let Some(ch) = iter.next() {
37 self.current_position += 1;
38
39 // If the current character is a whitespace or a comment, handle it, and continue lexing.
40 if ch.is_ascii_whitespace() || self.handle_comments(ch, &mut iter) {
41 continue;
42 }
43
44 // Identify what the character is and try to lex as much of it as possible.
45 match ch {
46 '0'..='9' | '-' => tokens.push_back(self.make_number(ch, &mut iter)?),
47 '\'' | '"' => tokens.push_back(self.make_string(ch, &mut iter)?),
48 '@' => tokens.push_back(self.make_label(&mut iter)?),
49 letter if ch.is_ascii_alphabetic() || ch == '_' => {
50 tokens.push_back(self.make_word(letter, &mut iter))
51 }
52 _ => {
53 return Err(Error::new(
54 ErrorKind::UnknownCharacter,
55 self.current_position,
56 ))
57 }
58 }
59 }
60
61 Ok(tokens)
62 }
63
64 /// This function produces an int, a float, or an error.
65 ///
66 /// # Arguments
67 /// * `digit` - The first character of the number. This may also be a negative sign.
68 /// * `iter` - The iterator which contains all of the characters.
69 fn make_number(&mut self, digit: char, iter: &mut Peekable<Chars>) -> Result<Token, Error> {
70 let initial_point = self.current_position;
71 let mut number = digit.to_string();
72 let mut has_decimal_point = false;
73 while let Some(ch) = iter.peek() {
74 // After the value of the character has been identified, it is important to remember to advance the iterator.
75 // Otherwise, an infinite loop will be generated.
76 if ch.is_ascii_digit() {
77 number.push(self.advance(iter));
78 } else if ch == &'.' && !has_decimal_point {
79 number.push(self.advance(iter));
80 has_decimal_point = true;
81 } else {
82 break;
83 }
84 }
85
86 // If it does not have a decimal point, it must be an integer.
87 if !has_decimal_point {
88 if let Ok(value) = number.parse() {
89 Ok(Token::new(TokenKind::IntegerLiteral(value), initial_point))
90 } else {
91 Err(Error::new(
92 ErrorKind::InvalidNumberFormat,
93 self.current_position,
94 ))
95 }
96 } else if let Ok(value) = number.parse() {
97 Ok(Token::new(TokenKind::FloatLiteral(value), initial_point))
98 } else {
99 Err(Error::new(
100 ErrorKind::InvalidNumberFormat,
101 self.current_position,
102 ))
103 }
104 }
105
106 /// This function produces an instruction, identifier, a special value, or a boolean. This funtion always succeeds because a word is always an identifier.
107 ///
108 /// # Arguments
109 /// * `letter` - The first letter of the word.
110 /// * `iter` - The iterator which contains all of the characters.
111 fn make_word(&mut self, letter: char, iter: &mut Peekable<Chars>) -> Token {
112 let initial_point = self.current_position;
113 let mut word = letter.to_string();
114 while let Some(ch) = iter.peek() {
115 if ch.is_ascii_whitespace() {
116 self.advance(iter);
117 break;
118 } else {
119 word.push(self.advance(iter));
120 }
121 }
122
123 // This probably could be written using a match statement.
124 match word.to_ascii_lowercase().as_str() {
125 "void" => Token::new(TokenKind::Void, initial_point),
126 "any" => Token::new(TokenKind::Any, initial_point),
127 "true" => Token::new(TokenKind::BooleanLiteral(true), initial_point),
128 "false" => Token::new(TokenKind::BooleanLiteral(false), initial_point),
129 "end" => Token::new(TokenKind::End, initial_point),
130 instr @ _ => {
131 if let Some(instruction) = TokenKind::is_instruction(instr) {
132 Token::new(instruction, initial_point)
133 } else {
134 Token::new(TokenKind::Identifier(word), initial_point)
135 }
136 }
137 }
138 }
139
140 /// This function produces a string or an error.
141 ///
142 /// # Arguments
143 /// * `beginning_of_string` - The first opening quote used to begin the string. This could be ' or ".
144 /// * `iter` - The iterator which contains all of the characters.
145 fn make_string(
146 &mut self,
147 beginning_of_string: char,
148 iter: &mut Peekable<Chars>,
149 ) -> Result<Token, Error> {
150 let initial_point = self.current_position;
151 let mut string = String::new();
152 let mut is_terminated = false;
153 while let Some(ch) = iter.peek() {
154 if ch == &beginning_of_string {
155 self.advance(iter);
156 is_terminated = true;
157 break;
158 } else {
159 string.push(self.advance(iter));
160 }
161 }
162
163 // If the string does not end with the same quote used to open it, the function returns an error.
164 if !is_terminated {
165 Err(Error::new(ErrorKind::UnterminatedString, initial_point))
166 } else {
167 Ok(Token::new(TokenKind::StringLiteral(string), initial_point))
168 }
169 }
170
171 /// This function produces a label or an error.
172 ///
173 /// # Arguments
174 /// * `iter` - The iterator which contains all of the characters.
175 fn make_label(&mut self, iter: &mut Peekable<Chars>) -> Result<Token, Error> {
176 let initial_point = self.current_position;
177 let mut label = String::new();
178 while let Some(ch) = iter.peek() {
179 if ch.is_ascii_whitespace() {
180 break;
181 } else {
182 label.push(self.advance(iter));
183 }
184 }
185
186 if label.is_empty() {
187 Err(Error::new(ErrorKind::InvalidLabelName, initial_point))
188 } else {
189 Ok(Token::new(TokenKind::Label(label), initial_point))
190 }
191 }
192
193 /// This function handles comments. This function returns whether or not it found a commment and handled it.
194 ///
195 /// # Arguments
196 /// * `ch` - The current character the lexer is looking at.
197 /// * `iter` - The iterator which contains all of the characters.
198 fn handle_comments(&mut self, ch: char, iter: &mut Peekable<Chars>) -> bool {
199 if ch == '-' {
200 match iter.peek() {
201 Some('-') => {
202 self.handle_single_line_comments(iter);
203 true
204 }
205 Some('!') => {
206 self.handle_multi_line_comments(iter);
207 true
208 }
209 _ => false,
210 }
211 } else {
212 false
213 }
214 }
215
216 /// This function handles single line comments.
217 ///
218 /// # Arguments
219 /// * `iter` - The iterator which contains all of the characters.
220 fn handle_single_line_comments(&mut self, iter: &mut Peekable<Chars>) {
221 self.advance(iter);
222 for c in iter {
223 self.current_position += 1;
224 if c == '\n' {
225 break;
226 }
227 }
228 }
229
230 /// This function handles multiline comments.
231 ///
232 /// # Arguments
233 /// * `iter` - The iterator which contains all of the characters.
234 fn handle_multi_line_comments(&mut self, iter: &mut Peekable<Chars>) {
235 self.advance(iter);
236 while let Some(c) = iter.next() {
237 self.current_position += 1;
238 if c == '!' {
239 if let Some('-') = iter.peek() {
240 self.advance(iter);
241 break;
242 }
243 }
244 }
245 }
246
247 /// This function increments the current position and returns the next character.
248 /// The bounds check was already performed by the loops, so there is no need to return an option.
249 ///
250 /// # Arguments
251 /// * `iter` - The iterator which contains all of the characters.
252 fn advance(&mut self, iter: &mut Peekable<Chars>) -> char {
253 self.current_position += 1;
254 iter.next().unwrap()
255 }
256}