lexer/lib.rs
1//! # Lexer module
2//!
3//! The lexer module is responsible for tokenising input strings. The lexer supports
4//! various token types such as identifiers, numbers, strings, and operators. The lexer
5//! uses a cursor-based approach to iterate over the input string and extract tokens.
6//!
7//! The lexer is implemented as a struct called `Lexer`, which provides methods for
8//! tokenising input strings into individual tokens. The `Lexer` struct contains an
9//! iterator over the characters of the input string, and uses this iterator to extract
10//! tokens from the input.
11//!
12//! The `Lexer` struct provides a method called `next_token`, which advances the lexer to
13//! the next token in the input stream and returns the token. This method is essentially a
14//! large switch statement, containing branches corresponding to every token type. The
15//! `next_token` method skips any whitespace and comments before identifying the next token.
16//!
17//! The token is represented by a `Token` struct, which contains information about its kind
18//! (e.g., identifier, operator, literal) and its span in the input stream.
19//!
20//! The lexer module is used by the parser to tokenise the input string before parsing it
21//! into an abstract syntax tree (AST).
22use std::str::Chars;
23
24use shared::span::Span;
25use token::{Token, TokenKind};
26
/// The version of the lexer crate, captured from the `CARGO_PKG_VERSION`
/// environment variable at compile time.
///
/// This is `None` when the crate is built without Cargo (e.g. by invoking
/// `rustc` directly), since the variable is read with `option_env!`.
pub const VERSION: Option<&str> = std::option_env!("CARGO_PKG_VERSION");
29
30#[cfg(test)]
31mod test;
32
33pub mod token;
34
/// Lexer for tokenising input strings.
///
/// The `Lexer` provides methods for tokenising input strings into individual tokens.
/// It supports various token types such as identifiers, numbers, strings, and operators.
/// The `Lexer` uses a cursor-based approach to iterate over the input string and extract
/// tokens.
pub struct Lexer<'lexer> {
    /// Represents the input for the lexer.
    ///
    /// The `input` field is of type `Chars<'lexer>`, which is an iterator over the
    /// characters of a string. Using `Chars` instead of just a raw string allows for
    /// iteration over the string one character at a time. Notably, it supports
    /// unicode characters and characters of unusual length.
    input: Chars<'lexer>,
    /// The character currently under the cursor; `'\0'` marks the end of input.
    chr: char,
    /// Index of the current character, counted in characters (not bytes) — it is
    /// incremented once per `read_char()` call and is used to build token spans.
    position: usize,
}
52
53impl<'lexer> Lexer<'lexer> {
54 /// Creates a new Lexer instance.
55 ///
56 /// # Arguments
57 ///
58 /// * `input` - The input string to be tokenised.
59 pub fn new(input: &'lexer str) -> Self {
60 let mut lexer = Lexer {
61 input: input.chars(),
62 chr: char::from(0),
63 position: 0,
64 };
65 lexer.read_char();
66 // We set position to 0 here because `read_char()` increments position to 1,
67 // but we want to start the index at 0 for consistency.
68 lexer.position = 0;
69 lexer
70 }
71
72 /// Reads the next character from the input stream and updates the lexer's internal
73 /// state.
74 fn read_char(&mut self) {
75 match self.input.next() {
76 Some(chr) => {
77 self.chr = chr;
78 self.position += 1
79 }
80 None => {
81 // '\0' indicates the end of the file.
82 // If we are already at the end of the file, there is no need to update the
83 // character or increment the position.
84 if self.chr != '\0' {
85 self.chr = '\0';
86 self.position += 1
87 }
88 }
89 }
90 }
91
92 /// Returns the next character in the input stream without consuming it.
93 ///
94 /// # Returns
95 ///
96 /// The next character in the input stream, or `'\0'` if the end of the stream has
97 /// been reached.
98 fn peek_char(&mut self) -> char {
99 // Clones the iterator to peek ahead without advancing it.
100 match self.input.clone().next() {
101 Some(chr) => chr,
102 None => '\0',
103 }
104 }
105
106 /// Advances the lexer to the next token in the input stream and returns the token.
107 ///
108 /// This function is essentially a large switch statement, containing branches
109 /// corresponding to every token type. This function skips any whitespace and
110 /// comments before identifying the next token. The token is represented by a
111 /// `Token` struct, which contains information about its kind (e.g., identifier,
112 /// operator, literal) and its span in the input stream.
113 ///
114 /// # Returns
115 ///
116 /// The next token in the sequence, and will continue to return an Eof token once the
117 /// end is reached.
118 pub fn next_token(&mut self) -> Token {
119 let start_position = self.position;
120
121 // Skip over any whitespace, comments, and newlines.
122 match self.skip_garbage() {
123 Ok(encountered_newline) => {
124 // If we encountered a newline character (`\n`), we return a NewLine token.
125 if encountered_newline {
126 return Token {
127 span: Span::new(self.position - 1, self.position),
128 kind: TokenKind::NewLine,
129 };
130 }
131 }
132 // The only type of error that can be returned is an unterminated multi-line
133 // comment, so we can safely unwrap the error and return the corresponding
134 // token.
135 Err(_) => {
136 return Token {
137 span: Span::new(start_position, self.position),
138 kind: TokenKind::UnterminatedComment,
139 };
140 }
141 }
142
143 let start_position = self.position;
144
145 // Determine what type of token we are dealing with.
146 let token_kind = match self.chr {
147 // Single character symbols
148 '+' => TokenKind::Plus,
149 '-' => TokenKind::Minus,
150 '*' => TokenKind::Mult,
151 '/' => TokenKind::Div,
152 '%' => TokenKind::Mod,
153
154 ',' => TokenKind::Comma,
155 ';' => TokenKind::Semicolon,
156 ':' => TokenKind::Colon,
157
158 '(' => TokenKind::LParen,
159 ')' => TokenKind::RParen,
160 '{' => TokenKind::LCurly,
161 '}' => TokenKind::RCurly,
162 '[' => TokenKind::LBracket,
163 ']' => TokenKind::RBracket,
164
165 '\0' => TokenKind::Eof,
166 '\n' => TokenKind::NewLine,
167
168 // Potentially double character symbols
169 '=' => {
170 if self.peek_char() == '=' {
171 self.read_char();
172 TokenKind::Eq
173 } else {
174 TokenKind::Assign
175 }
176 }
177 '!' => {
178 if self.peek_char() == '=' {
179 self.read_char();
180 TokenKind::NotEq
181 } else {
182 TokenKind::Illegal(self.chr.to_string())
183 }
184 }
185 '<' => {
186 if self.peek_char() == '=' {
187 self.read_char();
188 TokenKind::LtEq
189 } else {
190 TokenKind::Lt
191 }
192 }
193 '>' => {
194 if self.peek_char() == '=' {
195 self.read_char();
196 TokenKind::GtEq
197 } else {
198 TokenKind::Gt
199 }
200 }
201
202 // The following rules return immediately, because the size of the token is not fixed,
203 // so is handled separately.
204
205 // String literals
206 '"' => {
207 match self.read_string() {
208 Ok(string) => {
209 return Token {
210 span: Span {
211 start: start_position,
212 end: self.position,
213 },
214 kind: TokenKind::String(string),
215 }
216 }
217 // An error represents an unterminated string.
218 Err(_) => {
219 return Token {
220 span: Span {
221 start: start_position,
222 end: self.position,
223 },
224 kind: TokenKind::UnterminatedString,
225 }
226 }
227 };
228 }
229
230 // Else, we a dealing with a keyword, identifier, number of an illegal character.
231 _ => {
232 if is_valid_ident_start(self.chr) {
233 let ident = self.read_identifier();
234 return Token {
235 span: Span {
236 start: start_position,
237 end: self.position,
238 },
239 kind: TokenKind::lookup_ident(&ident),
240 };
241 } else if is_digit(self.chr) {
242 return self.read_number();
243 } else {
244 TokenKind::Illegal(self.chr.to_string())
245 }
246 }
247 };
248
249 // Since every branch advances the character by at least one, we move the function out
250 // here to simplify the syntax.
251 self.read_char();
252
253 return Token {
254 span: Span {
255 start: start_position,
256 end: self.position,
257 },
258 kind: token_kind,
259 };
260 }
261
262 /// Reads a string literal (including quotes) from the current position in the input.
263 ///
264 /// # Returns
265 ///
266 /// A `String` containing the contents of the string literal or an `Err(())` if the
267 /// string was not terminated.
268 fn read_string(&mut self) -> Result<String, ()> {
269 let mut string = String::new();
270 // Read opening '"'
271 self.read_char();
272
273 // Read string contents
274 while !(self.chr == '"') {
275 if self.chr == '\0' {
276 return Err(());
277 }
278 string.push(self.chr);
279 self.read_char();
280 }
281
282 // Read closing '"'
283 self.read_char();
284
285 Ok(string)
286 }
287
288 /// Reads an identifier starting from the current character position.
289 ///
290 /// # Returns
291 ///
292 /// A `String` representing the identifier extracted from the input.
293 fn read_identifier(&mut self) -> String {
294 let mut ident = String::new();
295
296 while is_valid_ident_continue(self.chr) {
297 ident.push(self.chr);
298 self.read_char();
299 }
300 ident
301 }
302
303 /// Reads a number from the current position in the input and constructs a `Token`.
304 ///
305 /// This function reads a sequence of digits as an integer. If a decimal point is
306 /// encountered, it continues to read the fractional part, constructing a
307 /// floating-point number.
308 ///
309 /// # Returns
310 ///
311 /// A `Token` representing either an integer or a floating-point number, depending on
312 /// the input.
313 fn read_number(&mut self) -> Token {
314 let mut number = String::new();
315 self._read_int(&mut number);
316
317 // If we encounter a decimal point, we continue to read the fractional part.
318 if self.chr == '.' {
319 number.push(self.chr);
320 self.read_char();
321 self._read_int(&mut number);
322 return Token {
323 span: Span {
324 start: self.position - number.len(),
325 end: self.position,
326 },
327 kind: TokenKind::Float(number),
328 };
329 } else {
330 return Token {
331 span: Span {
332 start: self.position - number.len(),
333 end: self.position,
334 },
335 kind: TokenKind::Int(number),
336 };
337 }
338 }
339
340 /// Reads and appends digits to a given string from the current position in the input.
341 ///
342 /// # Arguments
343 ///
344 /// * `number` - A mutable reference to a `String` where the digits are appended.
345 fn _read_int(&mut self, number: &mut String) {
346 while is_digit(self.chr) {
347 number.push(self.chr);
348 self.read_char();
349 }
350 }
351
352 /// Skips over a single-line comment (`//`) in the current input.
353 ///
354 /// It reads characters until it reaches the end of the line or the end of the input.
355 ///
356 /// Assumes that the current character (`self.chr`) is the first slash.
357 fn skip_comment(&mut self) {
358 if self.chr == '/' && self.peek_char() == '/' {
359 // Read the '//'
360 self.read_char();
361 self.read_char();
362
363 // Read the comment till the end of the line, or the end of the input.
364 loop {
365 self.read_char();
366 if self.chr == '\n' {
367 self.read_char();
368 break;
369 }
370 if self.chr == '\0' {
371 break;
372 }
373 }
374 }
375 }
376
377 /// Skips over a multi-line comment (`/* ... */`) in the current input.
378 ///
379 /// Assumes that the current character (`self.chr`) is the first slash.
380 ///
381 /// # Returns
382 ///
383 /// An `Ok(())` if the multi-line comment was successfully skipped, or an
384 /// `Err(())` error if the comment was not terminated.
385 fn skip_multi_comment(&mut self) -> Result<(), ()> {
386 // Consume the opening '/*'
387 if self.chr == '/' && self.peek_char() == '*' {
388 self.read_char();
389 self.read_char();
390 } else {
391 return Ok(());
392 }
393 // Consume the comment
394 while !(self.chr == '*' && self.peek_char() == '/') {
395 self.read_char();
396 if self.chr == '\0' {
397 return Err(());
398 };
399 }
400 // Consume the closing '*/'
401 self.read_char();
402 self.read_char();
403
404 Ok(())
405 }
406
407 /// Skips over any whitespace characters, comments, and newlines in the current input.
408 ///
409 /// # Returns
410 ///
411 /// A `Result` containing a `bool` indicating whether a newline character was
412 /// encountered. If an error occurs, it returns an `Err(())`.
413 fn skip_garbage(&mut self) -> Result<bool, ()> {
414 // We store whether we encountered a newline because the lexer does
415 // count newlines, however it only needs to know if it encountered one,
416 // not how many it encountered.
417 //
418 // Note: The parser does depend on this functionality, so don't remove it. :)
419 let mut encountered_newline = false;
420 while matches!(self.chr, ' ' | '\t' | '\n' | '\r' | '/') {
421 match self.chr {
422 // Skip whitespace
423 ' ' | '\t' => self.skip_whitespace(),
424 // Skip newlines
425 '\n' | '\r' => {
426 encountered_newline = true;
427 self.read_char();
428 }
429 // Skip comments
430 '/' => match self.peek_char() {
431 '/' => self.skip_comment(),
432 '*' => self.skip_multi_comment()?,
433 _ => break,
434 },
435 // The while statement above ensures that there can be no other pattern, but we need
436 // to handle it in this match statement to satisfy the compiler.
437 _ => unreachable!(),
438 }
439 }
440 return Ok(encountered_newline);
441 }
442
443 /// Skips over any whitespace characters in the current input.
444 fn skip_whitespace(&mut self) {
445 while matches!(self.chr, ' ' | '\t') {
446 self.read_char();
447 }
448 }
449}
450
/// Serves as a source of truth for the definition of what an identifier can start with.
///
/// An identifier may begin with an ASCII letter (`a`-`z`, `A`-`Z`) or an underscore.
fn is_valid_ident_start(chr: char) -> bool {
    matches!(chr, 'a'..='z' | 'A'..='Z' | '_')
}
455
/// Serves as a source of truth for the definition of what an identifier can continue
/// with.
///
/// After the first character, an identifier may contain ASCII letters, ASCII digits,
/// and underscores. (`is_ascii_alphanumeric` already accepts digits, so no separate
/// digit check is needed — the previous `|| is_digit(chr)` clause was redundant.)
fn is_valid_ident_continue(chr: char) -> bool {
    chr.is_ascii_alphanumeric() || chr == '_'
}
461
/// Serves as a source of truth for the definition of a 'digit'.
///
/// Only the ASCII digits `'0'` through `'9'` qualify; Unicode digits are rejected.
fn is_digit(chr: char) -> bool {
    ('0'..='9').contains(&chr)
}