lexer/
lib.rs

1//! # Lexer module
2//!
3//! The lexer module is responsible for tokenising input strings. The lexer supports
4//! various token types such as identifiers, numbers, strings, and operators. The lexer
5//! uses a cursor-based approach to iterate over the input string and extract tokens.
6//!
7//! The lexer is implemented as a struct called `Lexer`, which provides methods for
8//! tokenising input strings into individual tokens. The `Lexer` struct contains an
9//! iterator over the characters of the input string, and uses this iterator to extract
10//! tokens from the input.
11//!
12//! The `Lexer` struct provides a method called `next_token`, which advances the lexer to
13//! the next token in the input stream and returns the token. This method is essentially a
14//! large switch statement, containing branches corresponding to every token type. The
15//! `next_token` method skips any whitespace and comments before identifying the next token.
16//!
17//! The token is represented by a `Token` struct, which contains information about its kind
18//! (e.g., identifier, operator, literal) and its span in the input stream.
19//!
20//! The lexer module is used by the parser to tokenise the input string before parsing it
21//! into an abstract syntax tree (AST).
22use std::str::Chars;
23
24use shared::span::Span;
25use token::{Token, TokenKind};
26
/// The version of the lexer crate, captured from Cargo at compile time.
///
/// This is `None` if the `CARGO_PKG_VERSION` environment variable was not set
/// when the crate was compiled (e.g. when building without Cargo).
pub const VERSION: Option<&str> = std::option_env!("CARGO_PKG_VERSION");
29
// Unit tests for the lexer; compiled only under `cargo test`.
#[cfg(test)]
mod test;

/// Token definitions (`Token`, `TokenKind`) produced by the lexer.
pub mod token;
34
/// Lexer for tokenising input strings.
///
/// The `Lexer` provides methods for tokenising input strings into individual tokens.
/// It supports various token types such as identifiers, numbers, strings, and operators.
/// The `Lexer` uses a cursor-based approach to iterate over the input string and extract
/// tokens.
pub struct Lexer<'lexer> {
    /// Represents the input for the lexer.
    ///
    /// The `input` field is of type `Chars<'lexer>`, which is an iterator over the
    /// characters of a string. Using `Chars` instead of just a raw string allows for
    /// iteration over the string one character at a time. Notably, it supports
    /// unicode characters and characters of unusual length.
    input: Chars<'lexer>,
    /// The character currently under the cursor; `'\0'` once the end of the
    /// input has been reached.
    chr: char,
    /// The cursor position, counted in characters (not bytes) from the start of
    /// the input. Used to build token spans.
    position: usize,
}
52
53impl<'lexer> Lexer<'lexer> {
54    /// Creates a new Lexer instance.
55    ///
56    /// # Arguments
57    ///
58    /// * `input` - The input string to be tokenised.
59    pub fn new(input: &'lexer str) -> Self {
60        let mut lexer = Lexer {
61            input: input.chars(),
62            chr: char::from(0),
63            position: 0,
64        };
65        lexer.read_char();
66        // We set position to 0 here because `read_char()` increments position to 1,
67        // but we want to start the index at 0 for consistency.
68        lexer.position = 0;
69        lexer
70    }
71
72    /// Reads the next character from the input stream and updates the lexer's internal
73    /// state.
74    fn read_char(&mut self) {
75        match self.input.next() {
76            Some(chr) => {
77                self.chr = chr;
78                self.position += 1
79            }
80            None => {
81                // '\0' indicates the end of the file.
82                // If we are already at the end of the file, there is no need to update the
83                // character or increment the position.
84                if self.chr != '\0' {
85                    self.chr = '\0';
86                    self.position += 1
87                }
88            }
89        }
90    }
91
92    /// Returns the next character in the input stream without consuming it.
93    ///
94    /// # Returns
95    ///
96    /// The next character in the input stream, or `'\0'` if the end of the stream has
97    /// been reached.
98    fn peek_char(&mut self) -> char {
99        // Clones the iterator to peek ahead without advancing it.
100        match self.input.clone().next() {
101            Some(chr) => chr,
102            None => '\0',
103        }
104    }
105
106    /// Advances the lexer to the next token in the input stream and returns the token.
107    ///
108    /// This function is essentially a large switch statement, containing branches
109    /// corresponding to every token type. This function skips any whitespace and
110    /// comments before identifying the next token. The token is represented by a
111    /// `Token` struct, which contains information about its kind (e.g., identifier,
112    /// operator, literal) and its span in the input stream.
113    ///
114    /// # Returns
115    ///
116    /// The next token in the sequence, and will continue to return an Eof token once the
117    /// end is reached.
118    pub fn next_token(&mut self) -> Token {
119        let start_position = self.position;
120
121        // Skip over any whitespace, comments, and newlines.
122        match self.skip_garbage() {
123            Ok(encountered_newline) => {
124                // If we encountered a newline character (`\n`), we return a NewLine token.
125                if encountered_newline {
126                    return Token {
127                        span: Span::new(self.position - 1, self.position),
128                        kind: TokenKind::NewLine,
129                    };
130                }
131            }
132            // The only type of error that can be returned is an unterminated multi-line
133            // comment, so we can safely unwrap the error and return the corresponding
134            // token.
135            Err(_) => {
136                return Token {
137                    span: Span::new(start_position, self.position),
138                    kind: TokenKind::UnterminatedComment,
139                };
140            }
141        }
142
143        let start_position = self.position;
144
145        // Determine what type of token we are dealing with.
146        let token_kind = match self.chr {
147            // Single character symbols
148            '+' => TokenKind::Plus,
149            '-' => TokenKind::Minus,
150            '*' => TokenKind::Mult,
151            '/' => TokenKind::Div,
152            '%' => TokenKind::Mod,
153
154            ',' => TokenKind::Comma,
155            ';' => TokenKind::Semicolon,
156            ':' => TokenKind::Colon,
157
158            '(' => TokenKind::LParen,
159            ')' => TokenKind::RParen,
160            '{' => TokenKind::LCurly,
161            '}' => TokenKind::RCurly,
162            '[' => TokenKind::LBracket,
163            ']' => TokenKind::RBracket,
164
165            '\0' => TokenKind::Eof,
166            '\n' => TokenKind::NewLine,
167
168            // Potentially double character symbols
169            '=' => {
170                if self.peek_char() == '=' {
171                    self.read_char();
172                    TokenKind::Eq
173                } else {
174                    TokenKind::Assign
175                }
176            }
177            '!' => {
178                if self.peek_char() == '=' {
179                    self.read_char();
180                    TokenKind::NotEq
181                } else {
182                    TokenKind::Illegal(self.chr.to_string())
183                }
184            }
185            '<' => {
186                if self.peek_char() == '=' {
187                    self.read_char();
188                    TokenKind::LtEq
189                } else {
190                    TokenKind::Lt
191                }
192            }
193            '>' => {
194                if self.peek_char() == '=' {
195                    self.read_char();
196                    TokenKind::GtEq
197                } else {
198                    TokenKind::Gt
199                }
200            }
201
202            // The following rules return immediately, because the size of the token is not fixed,
203            // so is handled separately.
204
205            // String literals
206            '"' => {
207                match self.read_string() {
208                    Ok(string) => {
209                        return Token {
210                            span: Span {
211                                start: start_position,
212                                end: self.position,
213                            },
214                            kind: TokenKind::String(string),
215                        }
216                    }
217                    // An error represents an unterminated string.
218                    Err(_) => {
219                        return Token {
220                            span: Span {
221                                start: start_position,
222                                end: self.position,
223                            },
224                            kind: TokenKind::UnterminatedString,
225                        }
226                    }
227                };
228            }
229
230            // Else, we a dealing with a keyword, identifier, number of an illegal character.
231            _ => {
232                if is_valid_ident_start(self.chr) {
233                    let ident = self.read_identifier();
234                    return Token {
235                        span: Span {
236                            start: start_position,
237                            end: self.position,
238                        },
239                        kind: TokenKind::lookup_ident(&ident),
240                    };
241                } else if is_digit(self.chr) {
242                    return self.read_number();
243                } else {
244                    TokenKind::Illegal(self.chr.to_string())
245                }
246            }
247        };
248
249        // Since every branch advances the character by at least one, we move the function out
250        // here to simplify the syntax.
251        self.read_char();
252
253        return Token {
254            span: Span {
255                start: start_position,
256                end: self.position,
257            },
258            kind: token_kind,
259        };
260    }
261
262    /// Reads a string literal (including quotes) from the current position in the input.
263    ///
264    /// # Returns
265    ///
266    /// A `String` containing the contents of the string literal or an `Err(())` if the
267    /// string was not terminated.
268    fn read_string(&mut self) -> Result<String, ()> {
269        let mut string = String::new();
270        // Read opening '"'
271        self.read_char();
272
273        // Read string contents
274        while !(self.chr == '"') {
275            if self.chr == '\0' {
276                return Err(());
277            }
278            string.push(self.chr);
279            self.read_char();
280        }
281
282        // Read closing '"'
283        self.read_char();
284
285        Ok(string)
286    }
287
288    /// Reads an identifier starting from the current character position.
289    ///
290    /// # Returns
291    ///
292    /// A `String` representing the identifier extracted from the input.
293    fn read_identifier(&mut self) -> String {
294        let mut ident = String::new();
295
296        while is_valid_ident_continue(self.chr) {
297            ident.push(self.chr);
298            self.read_char();
299        }
300        ident
301    }
302
303    /// Reads a number from the current position in the input and constructs a `Token`.
304    ///
305    /// This function reads a sequence of digits as an integer. If a decimal point is
306    /// encountered, it continues to read the fractional part, constructing a
307    /// floating-point number.
308    ///
309    /// # Returns
310    ///
311    /// A `Token` representing either an integer or a floating-point number, depending on
312    /// the input.
313    fn read_number(&mut self) -> Token {
314        let mut number = String::new();
315        self._read_int(&mut number);
316
317        // If we encounter a decimal point, we continue to read the fractional part.
318        if self.chr == '.' {
319            number.push(self.chr);
320            self.read_char();
321            self._read_int(&mut number);
322            return Token {
323                span: Span {
324                    start: self.position - number.len(),
325                    end: self.position,
326                },
327                kind: TokenKind::Float(number),
328            };
329        } else {
330            return Token {
331                span: Span {
332                    start: self.position - number.len(),
333                    end: self.position,
334                },
335                kind: TokenKind::Int(number),
336            };
337        }
338    }
339
340    /// Reads and appends digits to a given string from the current position in the input.
341    ///
342    /// # Arguments
343    ///
344    /// * `number` - A mutable reference to a `String` where the digits are appended.
345    fn _read_int(&mut self, number: &mut String) {
346        while is_digit(self.chr) {
347            number.push(self.chr);
348            self.read_char();
349        }
350    }
351
352    /// Skips over a single-line comment (`//`) in the current input.
353    ///
354    /// It reads characters until it reaches the end of the line or the end of the input.
355    ///
356    /// Assumes that the current character (`self.chr`) is the first slash.
357    fn skip_comment(&mut self) {
358        if self.chr == '/' && self.peek_char() == '/' {
359            // Read the '//'
360            self.read_char();
361            self.read_char();
362
363            // Read the comment till the end of the line, or the end of the input.
364            loop {
365                self.read_char();
366                if self.chr == '\n' {
367                    self.read_char();
368                    break;
369                }
370                if self.chr == '\0' {
371                    break;
372                }
373            }
374        }
375    }
376
377    /// Skips over a multi-line comment (`/* ... */`) in the current input.
378    ///
379    /// Assumes that the current character (`self.chr`) is the first slash.
380    ///
381    /// # Returns
382    ///
383    /// An `Ok(())` if the multi-line comment was successfully skipped, or an
384    /// `Err(())` error if the comment was not terminated.
385    fn skip_multi_comment(&mut self) -> Result<(), ()> {
386        // Consume the opening '/*'
387        if self.chr == '/' && self.peek_char() == '*' {
388            self.read_char();
389            self.read_char();
390        } else {
391            return Ok(());
392        }
393        // Consume the comment
394        while !(self.chr == '*' && self.peek_char() == '/') {
395            self.read_char();
396            if self.chr == '\0' {
397                return Err(());
398            };
399        }
400        // Consume the closing '*/'
401        self.read_char();
402        self.read_char();
403
404        Ok(())
405    }
406
407    /// Skips over any whitespace characters, comments, and newlines in the current input.
408    ///
409    /// # Returns
410    ///
411    /// A `Result` containing a `bool` indicating whether a newline character was
412    /// encountered. If an error occurs, it returns an `Err(())`.
413    fn skip_garbage(&mut self) -> Result<bool, ()> {
414        // We store whether we encountered a newline because the lexer does
415        // count newlines, however it only needs to know if it encountered one,
416        // not how many it encountered.
417        //
418        // Note: The parser does depend on this functionality, so don't remove it. :)
419        let mut encountered_newline = false;
420        while matches!(self.chr, ' ' | '\t' | '\n' | '\r' | '/') {
421            match self.chr {
422                // Skip whitespace
423                ' ' | '\t' => self.skip_whitespace(),
424                // Skip newlines
425                '\n' | '\r' => {
426                    encountered_newline = true;
427                    self.read_char();
428                }
429                // Skip comments
430                '/' => match self.peek_char() {
431                    '/' => self.skip_comment(),
432                    '*' => self.skip_multi_comment()?,
433                    _ => break,
434                },
435                // The while statement above ensures that there can be no other pattern, but we need
436                // to handle it in this match statement to satisfy the compiler.
437                _ => unreachable!(),
438            }
439        }
440        return Ok(encountered_newline);
441    }
442
443    /// Skips over any whitespace characters in the current input.
444    fn skip_whitespace(&mut self) {
445        while matches!(self.chr, ' ' | '\t') {
446            self.read_char();
447        }
448    }
449}
450
/// Serves as a source of truth for the definition of what an identifier can start with.
///
/// An identifier may begin with an ASCII letter or an underscore.
fn is_valid_ident_start(chr: char) -> bool {
    matches!(chr, 'a'..='z' | 'A'..='Z' | '_')
}
455
/// Serves as a source of truth for the definition of what an identifier can continue
/// with.
///
/// After the first character, an identifier may contain ASCII letters, ASCII digits,
/// or underscores. `is_ascii_alphanumeric` already includes ASCII digits, so no
/// separate digit check is needed.
fn is_valid_ident_continue(chr: char) -> bool {
    chr.is_ascii_alphanumeric() || chr == '_'
}
461
/// Serves as a source of truth for the definition of a 'digit'.
///
/// A digit is one of the ASCII characters `'0'` through `'9'`.
fn is_digit(chr: char) -> bool {
    ('0'..='9').contains(&chr)
}