Skip to main content

oak_msil/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// MSIL token type definition.
3pub mod token_type;
4pub use token_type::MsilTokenType;
5
6use crate::language::MsilLanguage;
7use oak_core::{Lexer, LexerCache, LexerState, lexer::LexOutput, source::Source};
8
/// Lexer state specialised to [`MsilLanguage`]; every lexing routine in this
/// module takes a `&mut State` and reads from / emits tokens into it.
pub(crate) type State<'a, S> = LexerState<'a, S, MsilLanguage>;
10
/// MSIL lexer.
///
/// Borrows a [`MsilLanguage`] configuration for the `'config` lifetime and
/// turns source text into a stream of [`MsilTokenType`] tokens.
#[derive(Clone, Debug)]
pub struct MsilLexer<'config> {
    /// Language configuration handed to [`MsilLexer::new`].
    /// NOTE(review): none of the lexing routines visible in this file read it.
    config: &'config MsilLanguage,
}
16
17impl<'config> MsilLexer<'config> {
18    /// Creates a new MSIL lexer.
19    pub fn new(config: &'config MsilLanguage) -> Self {
20        Self { config }
21    }
22}
23
24impl MsilLexer<'_> {
25    /// Runs the lexer.
26    pub fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
27        let safe_point = state.get_position();
28        while state.not_at_end() {
29            if self.skip_whitespace(state) {
30                continue;
31            }
32
33            if self.lex_newline(state) {
34                continue;
35            }
36
37            if self.lex_comment(state) {
38                continue;
39            }
40
41            if self.lex_string(state) {
42                continue;
43            }
44
45            if self.lex_number(state) {
46                continue;
47            }
48
49            if self.lex_identifier(state) {
50                continue;
51            }
52
53            if self.lex_delimiter(state) {
54                continue;
55            }
56
57            // If no rules matched, skip current character
58            if let Some(ch) = state.peek() {
59                let start_pos = state.get_position();
60                state.advance(ch.len_utf8());
61                state.add_token(MsilTokenType::Error, start_pos, state.get_position())
62            }
63
64            state.advance_if_dead_lock(safe_point)
65        }
66
67        state.add_eof();
68        Ok(())
69    }
70
71    /// Skips whitespace characters.
72    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
73        let start_pos = state.get_position();
74
75        while let Some(ch) = state.peek() {
76            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
77        }
78
79        if state.get_position() > start_pos {
80            state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
81            true
82        }
83        else {
84            false
85        }
86    }
87
88    /// Lexes a newline.
89    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
90        let start_pos = state.get_position();
91
92        if let Some('\n') = state.peek() {
93            state.advance(1);
94            state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
95            true
96        }
97        else if let Some('\r') = state.peek() {
98            state.advance(1);
99            if let Some('\n') = state.peek() {
100                state.advance(1)
101            }
102            state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
103            true
104        }
105        else {
106            false
107        }
108    }
109
110    /// Lexes a comment.
111    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
112        let start_pos = state.get_position();
113
114        if state.starts_with("//") {
115            while let Some(ch) = state.peek() {
116                if ch == '\n' || ch == '\r' {
117                    break;
118                }
119                state.advance(ch.len_utf8())
120            }
121            state.add_token(MsilTokenType::CommentToken, start_pos, state.get_position());
122            true
123        }
124        else if state.starts_with("/*") {
125            state.advance(2);
126            while let Some(ch) = state.peek() {
127                if state.starts_with("*/") {
128                    state.advance(2);
129                    break;
130                }
131                state.advance(ch.len_utf8())
132            }
133            state.add_token(MsilTokenType::CommentToken, start_pos, state.get_position());
134            true
135        }
136        else {
137            false
138        }
139    }
140
141    /// Lexes identifiers and keywords.
142    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
143        let start_pos = state.get_position();
144
145        if let Some(ch) = state.peek() {
146            if !ch.is_ascii_alphabetic() && ch != '_' && ch != '.' {
147                return false;
148            }
149
150            // Collect identifier characters
151            while let Some(ch) = state.peek() {
152                if ch.is_ascii_alphanumeric() || ch == '_' || ch == '.' { state.advance(ch.len_utf8()) } else { break }
153            }
154
155            // Check if it's a keyword
156            let text = state.get_text_in((start_pos..state.get_position()).into());
157            let token_kind = match text {
158                std::borrow::Cow::Borrowed(".assembly") => MsilTokenType::AssemblyKeyword,
159                std::borrow::Cow::Borrowed("extern") => MsilTokenType::ExternKeyword,
160                std::borrow::Cow::Borrowed(".module") => MsilTokenType::ModuleKeyword,
161                std::borrow::Cow::Borrowed(".class") => MsilTokenType::ClassKeyword,
162                std::borrow::Cow::Borrowed(".method") => MsilTokenType::MethodKeyword,
163                std::borrow::Cow::Borrowed("public") => MsilTokenType::PublicKeyword,
164                std::borrow::Cow::Borrowed("private") => MsilTokenType::PrivateKeyword,
165                std::borrow::Cow::Borrowed("static") => MsilTokenType::StaticKeyword,
166                std::borrow::Cow::Borrowed("void") => MsilTokenType::Keyword,
167                std::borrow::Cow::Borrowed("bool") => MsilTokenType::Keyword,
168                std::borrow::Cow::Borrowed("int8") => MsilTokenType::Keyword,
169                std::borrow::Cow::Borrowed("int16") => MsilTokenType::Keyword,
170                std::borrow::Cow::Borrowed("int32") => MsilTokenType::Keyword,
171                std::borrow::Cow::Borrowed("int64") => MsilTokenType::Keyword,
172                std::borrow::Cow::Borrowed("float32") => MsilTokenType::Keyword,
173                std::borrow::Cow::Borrowed("float64") => MsilTokenType::Keyword,
174                std::borrow::Cow::Borrowed("string") => MsilTokenType::Keyword,
175                std::borrow::Cow::Borrowed("object") => MsilTokenType::Keyword,
176                std::borrow::Cow::Borrowed("char") => MsilTokenType::Keyword,
177                std::borrow::Cow::Borrowed("unsigned") => MsilTokenType::Keyword,
178                std::borrow::Cow::Borrowed("extends") => MsilTokenType::Keyword,
179                std::borrow::Cow::Borrowed("implements") => MsilTokenType::Keyword,
180                std::borrow::Cow::Borrowed("auto") => MsilTokenType::Keyword,
181                std::borrow::Cow::Borrowed("ansi") => MsilTokenType::Keyword,
182                std::borrow::Cow::Borrowed("beforefieldinit") => MsilTokenType::Keyword,
183                std::borrow::Cow::Borrowed("sealed") => MsilTokenType::Keyword,
184                std::borrow::Cow::Borrowed("abstract") => MsilTokenType::Keyword,
185                std::borrow::Cow::Borrowed("serializable") => MsilTokenType::Keyword,
186                std::borrow::Cow::Borrowed("sequential") => MsilTokenType::Keyword,
187                std::borrow::Cow::Borrowed("explicit") => MsilTokenType::Keyword,
188                std::borrow::Cow::Borrowed("unicode") => MsilTokenType::Keyword,
189                std::borrow::Cow::Borrowed("autochar") => MsilTokenType::Keyword,
190                std::borrow::Cow::Borrowed("family") => MsilTokenType::Keyword,
191                std::borrow::Cow::Borrowed("assembly") => MsilTokenType::Keyword,
192                std::borrow::Cow::Borrowed("famandassem") => MsilTokenType::Keyword,
193                std::borrow::Cow::Borrowed("famorassem") => MsilTokenType::Keyword,
194                std::borrow::Cow::Borrowed("privatescope") => MsilTokenType::Keyword,
195                std::borrow::Cow::Borrowed("hidebysig") => MsilTokenType::Keyword,
196                std::borrow::Cow::Borrowed("specialname") => MsilTokenType::Keyword,
197                std::borrow::Cow::Borrowed("rtspecialname") => MsilTokenType::Keyword,
198                std::borrow::Cow::Borrowed("cil") => MsilTokenType::Keyword,
199                std::borrow::Cow::Borrowed("managed") => MsilTokenType::Keyword,
200                _ => MsilTokenType::IdentifierToken,
201            };
202
203            state.add_token(token_kind, start_pos, state.get_position());
204            true
205        }
206        else {
207            false
208        }
209    }
210
211    /// Lexes numbers.
212    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
213        let start_pos = state.get_position();
214
215        if let Some(ch) = state.peek() {
216            if !ch.is_ascii_digit() {
217                return false;
218            }
219
220            // Handle integer part
221            while let Some(ch) = state.peek() {
222                if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
223            }
224
225            // Handle decimal point
226            if let Some('.') = state.peek() {
227                if let Some(next_ch) = state.peek_next_n(1) {
228                    if next_ch.is_ascii_digit() {
229                        state.advance(1); // Skip decimal point
230                        while let Some(ch) = state.peek() {
231                            if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
232                        }
233                    }
234                }
235            }
236
237            state.add_token(MsilTokenType::NumberToken, start_pos, state.get_position());
238            true
239        }
240        else {
241            false
242        }
243    }
244
245    /// Lexes strings.
246    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
247        let start_pos = state.get_position();
248
249        if let Some('"') = state.peek() {
250            state.advance(1);
251
252            while let Some(ch) = state.peek() {
253                if ch == '"' {
254                    state.advance(1);
255                    break;
256                }
257                else if ch == '\\' {
258                    state.advance(1);
259                    if let Some(_) = state.peek() {
260                        state.advance(1)
261                    }
262                }
263                else {
264                    state.advance(ch.len_utf8())
265                }
266            }
267
268            state.add_token(MsilTokenType::StringToken, start_pos, state.get_position());
269            true
270        }
271        else {
272            false
273        }
274    }
275
276    /// Lexes delimiters.
277    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
278        let start_pos = state.get_position();
279
280        if let Some(ch) = state.peek() {
281            let kind = match ch {
282                '{' => MsilTokenType::LeftBrace,
283                '}' => MsilTokenType::RightBrace,
284                '(' => MsilTokenType::LeftParen,
285                ')' => MsilTokenType::RightParen,
286                '[' => MsilTokenType::LeftBracket,
287                ']' => MsilTokenType::RightBracket,
288                '.' => MsilTokenType::Dot,
289                ':' => MsilTokenType::Colon,
290                ';' => MsilTokenType::Semicolon,
291                ',' => MsilTokenType::Comma,
292                '=' => MsilTokenType::Equal,
293                '/' => MsilTokenType::Slash,
294                _ => return false,
295            };
296
297            state.advance(ch.len_utf8());
298            state.add_token(kind, start_pos, state.get_position());
299            true
300        }
301        else {
302            false
303        }
304    }
305}
306
307impl Lexer<MsilLanguage> for MsilLexer<'_> {
308    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<MsilLanguage>) -> LexOutput<MsilLanguage> {
309        let mut state = State::new_with_cache(source, 0, cache);
310        let result = self.run(&mut state);
311        state.finish_with_cache(result, cache)
312    }
313}
314
315impl MsilLexer<'_> {
316    /// Tokenizes the text into a list of tokens
317    pub fn tokenize<'a>(&self, text: &'a str) -> Vec<oak_core::Token<<MsilLanguage as oak_core::Language>::TokenType>> {
318        let source = oak_core::SourceText::new(text);
319        let mut cache = oak_core::parser::session::ParseSession::<MsilLanguage>::default();
320        let mut state = State::new_with_cache(&source, 0, &mut cache);
321        let result = self.run(&mut state);
322        state.finish_with_cache(result, &mut cache).result.unwrap().0.to_vec()
323    }
324}