Skip to main content

oak_msil/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// MSIL token type definition.
3pub mod token_type;
4pub use token_type::MsilTokenType;
5
6use crate::language::MsilLanguage;
7use oak_core::{Lexer, LexerCache, LexerState, lexer::LexOutput, source::Source};
8
9pub(crate) type State<'a, S> = LexerState<'a, S, MsilLanguage>;
10
11/// MSIL lexer.
12#[derive(Clone, Debug)]
13pub struct MsilLexer;
14
15impl MsilLexer {
16    /// Creates a new MSIL lexer.
17    pub fn new(config: &MsilLanguage) -> Self {
18        Self
19    }
20}
21
22impl MsilLexer {
23    /// Runs the lexer.
24    pub fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
25        let safe_point = state.get_position();
26        while state.not_at_end() {
27            if self.skip_whitespace(state) {
28                continue;
29            }
30
31            if self.lex_newline(state) {
32                continue;
33            }
34
35            if self.lex_comment(state) {
36                continue;
37            }
38
39            if self.lex_string(state) {
40                continue;
41            }
42
43            if self.lex_identifier(state) {
44                continue;
45            }
46
47            if self.lex_number(state) {
48                continue;
49            }
50
51            if self.lex_delimiter(state) {
52                continue;
53            }
54
55            // If no rules matched, skip current character
56            if let Some(ch) = state.peek() {
57                let start_pos = state.get_position();
58                state.advance(ch.len_utf8());
59                state.add_token(MsilTokenType::Error, start_pos, state.get_position())
60            }
61
62            state.advance_if_dead_lock(safe_point)
63        }
64
65        state.add_eof();
66        Ok(())
67    }
68
69    /// Skips whitespace characters.
70    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
71        let start_pos = state.get_position();
72
73        while let Some(ch) = state.peek() {
74            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
75        }
76
77        if state.get_position() > start_pos {
78            state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
79            true
80        }
81        else {
82            false
83        }
84    }
85
86    /// Lexes a newline.
87    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88        let start_pos = state.get_position();
89
90        if let Some('\n') = state.peek() {
91            state.advance(1);
92            state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
93            true
94        }
95        else if let Some('\r') = state.peek() {
96            state.advance(1);
97            if let Some('\n') = state.peek() {
98                state.advance(1)
99            }
100            state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
101            true
102        }
103        else {
104            false
105        }
106    }
107
108    /// Lexes a comment.
109    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
110        let start_pos = state.get_position();
111
112        if state.starts_with("//") {
113            while let Some(ch) = state.peek() {
114                if ch == '\n' || ch == '\r' {
115                    break;
116                }
117                state.advance(ch.len_utf8())
118            }
119            state.add_token(MsilTokenType::CommentToken, start_pos, state.get_position());
120            true
121        }
122        else if state.starts_with("/*") {
123            state.advance(2);
124            while let Some(ch) = state.peek() {
125                if state.starts_with("*/") {
126                    state.advance(2);
127                    break;
128                }
129                state.advance(ch.len_utf8())
130            }
131            state.add_token(MsilTokenType::CommentToken, start_pos, state.get_position());
132            true
133        }
134        else {
135            false
136        }
137    }
138
139    /// Lexes identifiers and keywords.
140    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
141        let start_pos = state.get_position();
142
143        if let Some(ch) = state.peek() {
144            if !ch.is_ascii_alphabetic() && ch != '_' && ch != '.' {
145                return false;
146            }
147
148            // Collect identifier characters
149            while let Some(ch) = state.peek() {
150                if ch.is_ascii_alphanumeric() || ch == '_' || ch == '.' { state.advance(ch.len_utf8()) } else { break }
151            }
152
153            // Check if it's a keyword
154            let text = state.get_text_in((start_pos..state.get_position()).into());
155            let token_kind = match text {
156                std::borrow::Cow::Borrowed(".assembly") => MsilTokenType::AssemblyKeyword,
157                std::borrow::Cow::Borrowed("extern") => MsilTokenType::ExternKeyword,
158                std::borrow::Cow::Borrowed(".module") => MsilTokenType::ModuleKeyword,
159                std::borrow::Cow::Borrowed(".class") => MsilTokenType::ClassKeyword,
160                std::borrow::Cow::Borrowed(".method") => MsilTokenType::MethodKeyword,
161                std::borrow::Cow::Borrowed(".data") => MsilTokenType::IdentifierToken,
162                std::borrow::Cow::Borrowed(".ver") => MsilTokenType::IdentifierToken,
163                std::borrow::Cow::Borrowed(".publickeytoken") => MsilTokenType::IdentifierToken,
164                std::borrow::Cow::Borrowed(".custom") => MsilTokenType::IdentifierToken,
165                std::borrow::Cow::Borrowed("public") => MsilTokenType::PublicKeyword,
166                std::borrow::Cow::Borrowed("private") => MsilTokenType::PrivateKeyword,
167                std::borrow::Cow::Borrowed("static") => MsilTokenType::StaticKeyword,
168                std::borrow::Cow::Borrowed("void") => MsilTokenType::Keyword,
169                std::borrow::Cow::Borrowed("bool") => MsilTokenType::Keyword,
170                std::borrow::Cow::Borrowed("int8") => MsilTokenType::Keyword,
171                std::borrow::Cow::Borrowed("int16") => MsilTokenType::Keyword,
172                std::borrow::Cow::Borrowed("int32") => MsilTokenType::Keyword,
173                std::borrow::Cow::Borrowed("int64") => MsilTokenType::Keyword,
174                std::borrow::Cow::Borrowed("float32") => MsilTokenType::Keyword,
175                std::borrow::Cow::Borrowed("float64") => MsilTokenType::Keyword,
176                std::borrow::Cow::Borrowed("string") => MsilTokenType::Keyword,
177                std::borrow::Cow::Borrowed("object") => MsilTokenType::Keyword,
178                std::borrow::Cow::Borrowed("char") => MsilTokenType::Keyword,
179                std::borrow::Cow::Borrowed("unsigned") => MsilTokenType::Keyword,
180                std::borrow::Cow::Borrowed("extends") => MsilTokenType::Keyword,
181                std::borrow::Cow::Borrowed("implements") => MsilTokenType::Keyword,
182                std::borrow::Cow::Borrowed("auto") => MsilTokenType::Keyword,
183                std::borrow::Cow::Borrowed("ansi") => MsilTokenType::Keyword,
184                std::borrow::Cow::Borrowed("beforefieldinit") => MsilTokenType::Keyword,
185                std::borrow::Cow::Borrowed("sealed") => MsilTokenType::Keyword,
186                std::borrow::Cow::Borrowed("abstract") => MsilTokenType::Keyword,
187                std::borrow::Cow::Borrowed("serializable") => MsilTokenType::Keyword,
188                std::borrow::Cow::Borrowed("sequential") => MsilTokenType::Keyword,
189                std::borrow::Cow::Borrowed("explicit") => MsilTokenType::Keyword,
190                std::borrow::Cow::Borrowed("unicode") => MsilTokenType::Keyword,
191                std::borrow::Cow::Borrowed("autochar") => MsilTokenType::Keyword,
192                std::borrow::Cow::Borrowed("family") => MsilTokenType::Keyword,
193                std::borrow::Cow::Borrowed("assembly") => MsilTokenType::Keyword,
194                std::borrow::Cow::Borrowed("famandassem") => MsilTokenType::Keyword,
195                std::borrow::Cow::Borrowed("famorassem") => MsilTokenType::Keyword,
196                std::borrow::Cow::Borrowed("privatescope") => MsilTokenType::Keyword,
197                std::borrow::Cow::Borrowed("hidebysig") => MsilTokenType::Keyword,
198                std::borrow::Cow::Borrowed("specialname") => MsilTokenType::Keyword,
199                std::borrow::Cow::Borrowed("rtspecialname") => MsilTokenType::Keyword,
200                std::borrow::Cow::Borrowed("cil") => MsilTokenType::Keyword,
201                std::borrow::Cow::Borrowed("managed") => MsilTokenType::Keyword,
202                _ => MsilTokenType::IdentifierToken,
203            };
204
205            state.add_token(token_kind, start_pos, state.get_position());
206            true
207        }
208        else {
209            false
210        }
211    }
212
213    /// Lexes numbers (decimal and hexadecimal).
214    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
215        let start_pos = state.get_position();
216
217        if let Some(ch) = state.peek() {
218            // Check for hexadecimal numbers with 0x prefix
219            if ch == '0' {
220                if let Some(next_ch) = state.peek_next_n(1) {
221                    if next_ch == 'x' || next_ch == 'X' {
222                        state.advance(2); // Skip "0x"
223                        let mut has_digits = false;
224                        while let Some(ch) = state.peek() {
225                            if ch.is_ascii_hexdigit() {
226                                state.advance(ch.len_utf8());
227                                has_digits = true;
228                            }
229                            else {
230                                break;
231                            }
232                        }
233                        if has_digits {
234                            state.add_token(MsilTokenType::NumberToken, start_pos, state.get_position());
235                            return true;
236                        }
237                    }
238                }
239            }
240
241            // Check for decimal numbers
242            if ch.is_ascii_digit() {
243                // Handle integer part
244                while let Some(ch) = state.peek() {
245                    if ch.is_ascii_digit() || ch.is_ascii_hexdigit() {
246                        state.advance(ch.len_utf8());
247                    }
248                    else {
249                        break;
250                    }
251                }
252
253                // Handle decimal point
254                if let Some('.') = state.peek() {
255                    if let Some(next_ch) = state.peek_next_n(1) {
256                        if next_ch.is_ascii_digit() {
257                            state.advance(1); // Skip decimal point
258                            while let Some(ch) = state.peek() {
259                                if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
260                            }
261                        }
262                    }
263                }
264
265                state.add_token(MsilTokenType::NumberToken, start_pos, state.get_position());
266                true
267            }
268            // Check for hexadecimal bytes (like B7, 7A, etc.)
269            else if ch.is_ascii_hexdigit() {
270                let mut has_digits = false;
271                while let Some(ch) = state.peek() {
272                    if ch.is_ascii_hexdigit() {
273                        state.advance(ch.len_utf8());
274                        has_digits = true;
275                    }
276                    else {
277                        break;
278                    }
279                }
280                if has_digits {
281                    state.add_token(MsilTokenType::NumberToken, start_pos, state.get_position());
282                    return true;
283                }
284                false
285            }
286            else {
287                false
288            }
289        }
290        else {
291            false
292        }
293    }
294
295    /// Lexes strings.
296    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
297        let start_pos = state.get_position();
298
299        if let Some('"') = state.peek() {
300            state.advance(1);
301
302            while let Some(ch) = state.peek() {
303                if ch == '"' {
304                    state.advance(1);
305                    break;
306                }
307                else if ch == '\\' {
308                    state.advance(1);
309                    if let Some(_) = state.peek() {
310                        state.advance(1)
311                    }
312                }
313                else {
314                    state.advance(ch.len_utf8())
315                }
316            }
317
318            state.add_token(MsilTokenType::StringToken, start_pos, state.get_position());
319            true
320        }
321        else {
322            false
323        }
324    }
325
326    /// Lexes delimiters.
327    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
328        let start_pos = state.get_position();
329
330        if let Some(ch) = state.peek() {
331            let kind = match ch {
332                '{' => MsilTokenType::LeftBrace,
333                '}' => MsilTokenType::RightBrace,
334                '(' => MsilTokenType::LeftParen,
335                ')' => MsilTokenType::RightParen,
336                '[' => MsilTokenType::LeftBracket,
337                ']' => MsilTokenType::RightBracket,
338                '.' => MsilTokenType::Dot,
339                ':' => MsilTokenType::Colon,
340                ';' => MsilTokenType::Semicolon,
341                ',' => MsilTokenType::Comma,
342                '=' => MsilTokenType::Equal,
343                '/' => MsilTokenType::Slash,
344                _ => return false,
345            };
346
347            state.advance(ch.len_utf8());
348            state.add_token(kind, start_pos, state.get_position());
349            true
350        }
351        else {
352            false
353        }
354    }
355}
356
357impl Lexer<MsilLanguage> for MsilLexer {
358    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<MsilLanguage>) -> LexOutput<MsilLanguage> {
359        let mut state = State::new_with_cache(source, 0, cache);
360        let result = self.run(&mut state);
361        state.finish_with_cache(result, cache)
362    }
363}
364
365impl MsilLexer {
366    /// Tokenizes the text into a list of tokens
367    pub fn tokenize<'a>(&self, text: &'a str) -> Vec<oak_core::Token<<MsilLanguage as oak_core::Language>::TokenType>> {
368        let source = oak_core::SourceText::new(text);
369        let mut cache = oak_core::parser::session::ParseSession::<MsilLanguage>::default();
370        let mut state = State::new_with_cache(&source, 0, &mut cache);
371        let result = self.run(&mut state);
372        state.finish_with_cache(result, &mut cache).result.unwrap().0.to_vec()
373    }
374}