// runic_kit/lexer.rs

1//! This module defines the `Lexer` struct and the `LexerRule` trait.
2//! The `Lexer` is responsible for tokenizing the source code based on the provided rules.
//! The `LexerRule` trait defines the interface for lexer rules that can be used to generate
//! tokens.
5//! It also provides some utilities for common lexer rules.
6
7// TODO: refactor
8
9use crate::{error::Error, source::Source, token::Token};
10
/// Defines the interface for lexer rules.
pub trait LexerRule<'a, T> {
    /// This method is called to get a token from the lexer.
    /// It should return `Ok(Some(token))` if a token is found,
    /// `Ok(None)` if no token is found,
    /// or `Err(error)` if an error occurs.
    ///
    /// If an error occurs, the lexer will stop processing and return the error.
    /// Otherwise, it will continue to the next rule.
    ///
    /// A rule may freely call `lexer.advance()` while matching; the lexer is
    /// responsible for rewinding on failure (see [`LexerRule::generates_token`]).
    fn get_token(&self, lexer: &mut Lexer<'a, T>) -> Result<Option<Token<T>>, Error>;
    /// This method returns `true` if the rule generates a token,
    /// and `false` if it does not.
    ///
    /// The lexer uses this to decide whether to rewind after a rule returns
    /// `Ok(None)`: a token-generating rule (`true`, the default) that failed to
    /// match is jumped back to the position it started from, so a partial
    /// match consumes no input.
    ///
    /// A rule that returns `false` here — for example, a rule that skips
    /// whitespace — is *not* rewound: any input it consumed stays consumed
    /// even though it produced no token.
    fn generates_token(&self) -> bool {
        true
    }
}
33
/// Tokenizes the source code.
pub struct Lexer<'a, T> {
    /// The source code to be tokenized.
    pub source: &'a Source<'a>,
    /// The current position in the source code, as a byte offset into
    /// `source.code`.
    pub position: usize,
    /// The character at `position`, or `None` at end of input.
    pub current_char: Option<char>,
    /// The rules used to tokenize the source code; `get_token` tries them in
    /// order.
    rules: Vec<Box<dyn LexerRule<'a, T>>>,
}
45
46impl<'a, T> Lexer<'a, T> {
47    /// Creates a new `Lexer` instance with the given source code and rules.
48    pub fn new(source: &'a Source<'a>, rules: Vec<Box<dyn LexerRule<'a, T>>>) -> Self {
49        let mut lexer = Lexer {
50            source,
51            position: 0,
52            current_char: None,
53            rules,
54        };
55
56        if lexer.position < lexer.source.code.len() {
57            lexer.current_char = Some(lexer.source.code[lexer.position..].chars().next().unwrap());
58        } else {
59            lexer.current_char = None;
60        }
61
62        lexer
63    }
64
65    /// Advances the lexer to the next character in the source code.
66    pub fn advance(&mut self) {
67        if self.position < self.source.code.len() - 1 {
68            self.position += 1;
69            self.current_char = Some(self.source.code[self.position..].chars().next().unwrap());
70        } else {
71            self.current_char = None;
72        }
73    }
74
75    /// Jumps to a specific position in the source code.
76    pub fn jump_to(&mut self, position: usize) {
77        if position < self.source.code.len() {
78            self.position = position;
79            self.current_char = Some(self.source.code[self.position..].chars().next().unwrap());
80        } else {
81            self.position = self.source.code.len() + 1;
82            self.current_char = None;
83        }
84    }
85
86    /// Attempts to get the next token from the lexer using the defined rules.
87    ///
88    /// If a token is found, it returns `Ok(Some(token))`.
89    /// If no token is found, it returns `Ok(None)`.
90    /// If an error occurs, it returns `Err(error)`.
91    pub fn get_token(&mut self) -> Result<Option<Token<T>>, Error> {
92        // TODO: refactor this to avoid using unsafe?
93
94        let self_ptr = self as *mut Self;
95
96        for rule in &self.rules {
97            let prev_position = self.position;
98            let token = unsafe { rule.get_token(&mut *self_ptr) }?;
99
100            if let Some(token) = token {
101                return Ok(Some(token));
102            } else if rule.generates_token() {
103                unsafe {
104                    (*self_ptr).jump_to(prev_position);
105                }
106            }
107        }
108
109        Ok(None)
110    }
111}
112
113/// This module provides utility functions and common lexer rules.
114pub mod utils {
115    use crate::lexer::LexerRule;
116
    /// Macros for declaring lexer rules and rule collections.
    mod macros {
        /// Creates a vector of lexer rules.
        ///
        /// # Usage
        ///
        /// ```rust
        /// use runic_kit::lexer::utils::{SkipWhitespaceRule, rules_vec};
        ///
        /// let rules: Vec<Box<dyn runic_kit::lexer::LexerRule<'_, u8>>> = rules_vec![SkipWhitespaceRule]; // vec![Box::new(SkipWhitespaceRule)]
        /// ```
        #[macro_export]
        macro_rules! rules_vec {
            ($($rule:expr),* $(,)?) => {
                vec![$(Box::new($rule) as Box<dyn $crate::lexer::LexerRule<'_, _>>),*]
            };
        }

        /// Creates a lexer rule that matches a specific string.
        ///
        /// The generated rule advances through the source as long as each
        /// character agrees with the literal; on a partial match it returns
        /// `Ok(None)` and relies on the lexer to rewind the position.
        ///
        /// # Usage
        ///
        /// ```rust
        /// use runic_kit::lexer::utils::match_string;
        ///
        /// match_string!("let", String, "let".to_string(), LetRule); // `"let"` is the string to match, `String` is the type of the token, `"let".to_string()` is the token value, and `LetRule` is the name of the rule.
        /// ```
        #[macro_export]
        macro_rules! match_string {
            ($string:expr, $token_type:ty, $token_value:expr, $rule_name:ident) => {
                struct $rule_name;
                impl<'a> $crate::lexer::LexerRule<'a, $token_type> for $rule_name {
                    fn get_token(
                        &self,
                        lexer: &mut $crate::lexer::Lexer<'a, $token_type>,
                    ) -> Result<Option<$crate::token::Token<$token_type>>, $crate::error::Error>
                    {
                        let start_pos = lexer.position;
                        let mut matched = true;

                        // Consume characters while they agree with the literal.
                        for c in $string.chars() {
                            if lexer.current_char == Some(c) {
                                lexer.advance();
                            } else {
                                matched = false;
                                break;
                            }
                        }

                        if matched {
                            Ok(Some($crate::token::Token::new(
                                $token_value,
                                $crate::span::Span::new(start_pos, lexer.position),
                            )))
                        } else {
                            // No token: the lexer rewinds to `start_pos`.
                            Ok(None)
                        }
                    }
                }
            };
        }

        /// Creates a lexer rule that matches a specific word.
        /// Words are the sequences of strings that are separated by a space.
        ///
        /// For example, if we want to match the word `"let"` (specifically `"let"` followed by either a space or `EOF`) in the string `"let x = 10;"`,
        /// we should use this macro.
        ///
        /// # Usage
        ///
        /// ```rust
        /// use runic_kit::lexer::utils::match_word;
        ///
        /// match_word!("let", String, "let".to_string(), LetRule);
        /// ```
        #[macro_export]
        macro_rules! match_word {
            ($word:expr, $token_type:ty, $token_value:expr, $rule_name:ident) => {
                struct $rule_name;
                impl<'a> $crate::lexer::LexerRule<'a, $token_type> for $rule_name {
                    fn get_token(
                        &self,
                        lexer: &mut $crate::lexer::Lexer<'a, $token_type>,
                    ) -> Result<Option<$crate::token::Token<$token_type>>, $crate::error::Error>
                    {
                        let start_pos = lexer.position;
                        let mut matched = true;

                        // Consume characters while they agree with the literal.
                        for c in $word.chars() {
                            if lexer.current_char == Some(c) {
                                lexer.advance();
                            } else {
                                matched = false;
                                break;
                            }
                        }

                        // Only a literal match *followed by* a space or end of
                        // input counts as a word.
                        if matched
                            && (lexer.current_char == Some(' ') || lexer.current_char.is_none())
                        {
                            Ok(Some($crate::token::Token::new(
                                $token_value,
                                $crate::span::Span::new(start_pos, lexer.position),
                            )))
                        } else {
                            Ok(None)
                        }
                    }
                }
            };
        }

        // `#[macro_export]` places the macros at the crate root; these re-exports
        // make them reachable through `lexer::utils` as well.
        pub use match_string;
        pub use match_word;
        pub use rules_vec;
    }
232
233    /// A lexer rule that skips whitespace characters.
234    pub struct SkipWhitespaceRule;
235    impl<'a, T> LexerRule<'a, T> for SkipWhitespaceRule {
236        fn get_token(
237            &self,
238            lexer: &mut super::Lexer<'a, T>,
239        ) -> Result<Option<crate::token::Token<T>>, crate::error::Error> {
240            while let Some(c) = lexer.current_char {
241                if c.is_whitespace() {
242                    lexer.advance();
243                } else {
244                    break;
245                }
246            }
247            Ok(None)
248        }
249
250        fn generates_token(&self) -> bool {
251            false
252        }
253    }
254
255    pub use macros::{match_string, match_word, rules_vec};
256
257    #[cfg(test)]
258    mod tests {
259        use super::*;
260        use crate::{lexer::Lexer, source::Source};
261
262        #[test]
263        fn test_skip_whitespace_rule() {
264            let source = Source::from_str("test_input.txt", "     let x = 10;");
265            let rules = rules_vec![SkipWhitespaceRule];
266
267            let mut lexer = Lexer::<String>::new(&source, rules);
268            let token = lexer.get_token().unwrap();
269
270            assert!(token.is_none());
271            assert_eq!(lexer.position, 5);
272            assert_eq!(lexer.current_char, Some('l'));
273        }
274
275        #[test]
276        fn test_rules_vec_macro() {
277            let rules: Vec<Box<dyn LexerRule<'_, String> + 'static>> =
278                rules_vec![SkipWhitespaceRule];
279            assert_eq!(rules.len(), 1);
280            assert!(rules[0].generates_token() == false);
281        }
282
283        #[test]
284        fn test_match_string_macro() {
285            match_string!("let", String, "let".to_string(), LetRule);
286            let source = Source::from_str("test_input.txt", "let x = 10;");
287            let rules = rules_vec![LetRule];
288            let mut lexer = Lexer::<String>::new(&source, rules);
289            let token = lexer.get_token().unwrap();
290
291            assert!(token.is_some());
292            let token = token.unwrap();
293            assert_eq!(token.kind, "let");
294
295            let token = lexer.get_token().unwrap();
296            assert!(token.is_none());
297            assert_eq!(lexer.position, 3);
298            assert_eq!(lexer.current_char, Some(' '));
299        }
300
301        #[test]
302        fn test_match_word_macro() {
303            match_word!("let", String, "let".to_string(), LetRule);
304            let source = Source::from_str("test_input.txt", "let x = 10;");
305            let rules = rules_vec![LetRule];
306            let mut lexer = Lexer::<String>::new(&source, rules);
307
308            let token = lexer.get_token().unwrap();
309            assert!(token.is_some());
310            let token = token.unwrap();
311            assert_eq!(token.kind, "let");
312
313            let source = Source::from_str("test_input.txt", "letx = 10;");
314            let rules = rules_vec![LetRule];
315            let mut lexer = Lexer::<String>::new(&source, rules);
316            let token = lexer.get_token().unwrap();
317            assert!(token.is_none());
318        }
319    }
320
321    // TODO: add more utils
322}
323
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{error::Error, source::Source, span::Span, token::Token};

    /// A freshly built lexer sits on the first character at offset 0.
    #[test]
    fn test_lexer_new() {
        let src = Source::from_str("test_input.txt", "let x = 10;");
        let lx = Lexer::<u8>::new(&src, utils::rules_vec![utils::SkipWhitespaceRule]);

        assert_eq!(lx.position, 0);
        assert_eq!(lx.current_char, Some('l'));
    }

    /// Advancing once moves the lexer to the second character.
    #[test]
    fn test_lexer_advance() {
        let src = Source::from_str("test_input.txt", "let x = 10;");
        let mut lx = Lexer::<u8>::new(&src, utils::rules_vec![utils::SkipWhitespaceRule]);

        lx.advance();
        assert_eq!(lx.position, 1);
        assert_eq!(lx.current_char, Some('e'));
    }

    /// Jumping lands the lexer on the character at the requested byte offset.
    #[test]
    fn test_lexer_jump_to() {
        let src = Source::from_str("test_input.txt", "let x = 10;");
        let mut lx = Lexer::<u8>::new(&src, utils::rules_vec![utils::SkipWhitespaceRule]);

        lx.jump_to(4);
        assert_eq!(lx.position, 4);
        assert_eq!(lx.current_char, Some('x'));
    }

    /// `get_token` runs the rules in order and returns the first match.
    #[test]
    fn test_lexer_get_token() {
        let src = Source::from_str("test_input.txt", "let x = 10;");

        // A toy rule: emits a fixed "let" token whenever it sees an 'l'.
        struct TestRule;
        impl<'a> LexerRule<'a, String> for TestRule {
            fn get_token(
                &self,
                lexer: &mut Lexer<'a, String>,
            ) -> Result<Option<Token<String>>, Error> {
                if lexer.current_char != Some('l') {
                    return Ok(None);
                }
                lexer.advance();
                Ok(Some(Token::new("let".to_string(), Span::new(0, 3))))
            }
        }

        let rules = utils::rules_vec![utils::SkipWhitespaceRule, TestRule];
        let mut lx = Lexer::<String>::new(&src, rules);

        let tok = lx.get_token().unwrap().expect("expected a token");
        assert_eq!(tok.kind, "let");
        assert_eq!(tok.span.start, 0);
        assert_eq!(tok.span.end, 3);
    }
}
392}