1#![doc = include_str!("readme.md")]
2pub mod token_type;
4pub use token_type::MsilTokenType;
5
6use crate::language::MsilLanguage;
7use oak_core::{Lexer, LexerCache, LexerState, lexer::LexOutput, source::Source};
8
9pub(crate) type State<'a, S> = LexerState<'a, S, MsilLanguage>;
10
11#[derive(Clone, Debug)]
13pub struct MsilLexer;
14
15impl MsilLexer {
16 pub fn new(config: &MsilLanguage) -> Self {
18 Self
19 }
20}
21
22impl MsilLexer {
23 pub fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
25 let safe_point = state.get_position();
26 while state.not_at_end() {
27 if self.skip_whitespace(state) {
28 continue;
29 }
30
31 if self.lex_newline(state) {
32 continue;
33 }
34
35 if self.lex_comment(state) {
36 continue;
37 }
38
39 if self.lex_string(state) {
40 continue;
41 }
42
43 if self.lex_identifier(state) {
44 continue;
45 }
46
47 if self.lex_number(state) {
48 continue;
49 }
50
51 if self.lex_delimiter(state) {
52 continue;
53 }
54
55 if let Some(ch) = state.peek() {
57 let start_pos = state.get_position();
58 state.advance(ch.len_utf8());
59 state.add_token(MsilTokenType::Error, start_pos, state.get_position())
60 }
61
62 state.advance_if_dead_lock(safe_point)
63 }
64
65 state.add_eof();
66 Ok(())
67 }
68
69 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
71 let start_pos = state.get_position();
72
73 while let Some(ch) = state.peek() {
74 if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
75 }
76
77 if state.get_position() > start_pos {
78 state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
79 true
80 }
81 else {
82 false
83 }
84 }
85
86 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88 let start_pos = state.get_position();
89
90 if let Some('\n') = state.peek() {
91 state.advance(1);
92 state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
93 true
94 }
95 else if let Some('\r') = state.peek() {
96 state.advance(1);
97 if let Some('\n') = state.peek() {
98 state.advance(1)
99 }
100 state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
101 true
102 }
103 else {
104 false
105 }
106 }
107
108 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
110 let start_pos = state.get_position();
111
112 if state.starts_with("//") {
113 while let Some(ch) = state.peek() {
114 if ch == '\n' || ch == '\r' {
115 break;
116 }
117 state.advance(ch.len_utf8())
118 }
119 state.add_token(MsilTokenType::CommentToken, start_pos, state.get_position());
120 true
121 }
122 else if state.starts_with("/*") {
123 state.advance(2);
124 while let Some(ch) = state.peek() {
125 if state.starts_with("*/") {
126 state.advance(2);
127 break;
128 }
129 state.advance(ch.len_utf8())
130 }
131 state.add_token(MsilTokenType::CommentToken, start_pos, state.get_position());
132 true
133 }
134 else {
135 false
136 }
137 }
138
139 fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
141 let start_pos = state.get_position();
142
143 if let Some(ch) = state.peek() {
144 if !ch.is_ascii_alphabetic() && ch != '_' && ch != '.' {
145 return false;
146 }
147
148 while let Some(ch) = state.peek() {
150 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '.' { state.advance(ch.len_utf8()) } else { break }
151 }
152
153 let text = state.get_text_in((start_pos..state.get_position()).into());
155 let token_kind = match text {
156 std::borrow::Cow::Borrowed(".assembly") => MsilTokenType::AssemblyKeyword,
157 std::borrow::Cow::Borrowed("extern") => MsilTokenType::ExternKeyword,
158 std::borrow::Cow::Borrowed(".module") => MsilTokenType::ModuleKeyword,
159 std::borrow::Cow::Borrowed(".class") => MsilTokenType::ClassKeyword,
160 std::borrow::Cow::Borrowed(".method") => MsilTokenType::MethodKeyword,
161 std::borrow::Cow::Borrowed(".data") => MsilTokenType::IdentifierToken,
162 std::borrow::Cow::Borrowed(".ver") => MsilTokenType::IdentifierToken,
163 std::borrow::Cow::Borrowed(".publickeytoken") => MsilTokenType::IdentifierToken,
164 std::borrow::Cow::Borrowed(".custom") => MsilTokenType::IdentifierToken,
165 std::borrow::Cow::Borrowed("public") => MsilTokenType::PublicKeyword,
166 std::borrow::Cow::Borrowed("private") => MsilTokenType::PrivateKeyword,
167 std::borrow::Cow::Borrowed("static") => MsilTokenType::StaticKeyword,
168 std::borrow::Cow::Borrowed("void") => MsilTokenType::Keyword,
169 std::borrow::Cow::Borrowed("bool") => MsilTokenType::Keyword,
170 std::borrow::Cow::Borrowed("int8") => MsilTokenType::Keyword,
171 std::borrow::Cow::Borrowed("int16") => MsilTokenType::Keyword,
172 std::borrow::Cow::Borrowed("int32") => MsilTokenType::Keyword,
173 std::borrow::Cow::Borrowed("int64") => MsilTokenType::Keyword,
174 std::borrow::Cow::Borrowed("float32") => MsilTokenType::Keyword,
175 std::borrow::Cow::Borrowed("float64") => MsilTokenType::Keyword,
176 std::borrow::Cow::Borrowed("string") => MsilTokenType::Keyword,
177 std::borrow::Cow::Borrowed("object") => MsilTokenType::Keyword,
178 std::borrow::Cow::Borrowed("char") => MsilTokenType::Keyword,
179 std::borrow::Cow::Borrowed("unsigned") => MsilTokenType::Keyword,
180 std::borrow::Cow::Borrowed("extends") => MsilTokenType::Keyword,
181 std::borrow::Cow::Borrowed("implements") => MsilTokenType::Keyword,
182 std::borrow::Cow::Borrowed("auto") => MsilTokenType::Keyword,
183 std::borrow::Cow::Borrowed("ansi") => MsilTokenType::Keyword,
184 std::borrow::Cow::Borrowed("beforefieldinit") => MsilTokenType::Keyword,
185 std::borrow::Cow::Borrowed("sealed") => MsilTokenType::Keyword,
186 std::borrow::Cow::Borrowed("abstract") => MsilTokenType::Keyword,
187 std::borrow::Cow::Borrowed("serializable") => MsilTokenType::Keyword,
188 std::borrow::Cow::Borrowed("sequential") => MsilTokenType::Keyword,
189 std::borrow::Cow::Borrowed("explicit") => MsilTokenType::Keyword,
190 std::borrow::Cow::Borrowed("unicode") => MsilTokenType::Keyword,
191 std::borrow::Cow::Borrowed("autochar") => MsilTokenType::Keyword,
192 std::borrow::Cow::Borrowed("family") => MsilTokenType::Keyword,
193 std::borrow::Cow::Borrowed("assembly") => MsilTokenType::Keyword,
194 std::borrow::Cow::Borrowed("famandassem") => MsilTokenType::Keyword,
195 std::borrow::Cow::Borrowed("famorassem") => MsilTokenType::Keyword,
196 std::borrow::Cow::Borrowed("privatescope") => MsilTokenType::Keyword,
197 std::borrow::Cow::Borrowed("hidebysig") => MsilTokenType::Keyword,
198 std::borrow::Cow::Borrowed("specialname") => MsilTokenType::Keyword,
199 std::borrow::Cow::Borrowed("rtspecialname") => MsilTokenType::Keyword,
200 std::borrow::Cow::Borrowed("cil") => MsilTokenType::Keyword,
201 std::borrow::Cow::Borrowed("managed") => MsilTokenType::Keyword,
202 _ => MsilTokenType::IdentifierToken,
203 };
204
205 state.add_token(token_kind, start_pos, state.get_position());
206 true
207 }
208 else {
209 false
210 }
211 }
212
213 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
215 let start_pos = state.get_position();
216
217 if let Some(ch) = state.peek() {
218 if ch == '0' {
220 if let Some(next_ch) = state.peek_next_n(1) {
221 if next_ch == 'x' || next_ch == 'X' {
222 state.advance(2); let mut has_digits = false;
224 while let Some(ch) = state.peek() {
225 if ch.is_ascii_hexdigit() {
226 state.advance(ch.len_utf8());
227 has_digits = true;
228 }
229 else {
230 break;
231 }
232 }
233 if has_digits {
234 state.add_token(MsilTokenType::NumberToken, start_pos, state.get_position());
235 return true;
236 }
237 }
238 }
239 }
240
241 if ch.is_ascii_digit() {
243 while let Some(ch) = state.peek() {
245 if ch.is_ascii_digit() || ch.is_ascii_hexdigit() {
246 state.advance(ch.len_utf8());
247 }
248 else {
249 break;
250 }
251 }
252
253 if let Some('.') = state.peek() {
255 if let Some(next_ch) = state.peek_next_n(1) {
256 if next_ch.is_ascii_digit() {
257 state.advance(1); while let Some(ch) = state.peek() {
259 if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
260 }
261 }
262 }
263 }
264
265 state.add_token(MsilTokenType::NumberToken, start_pos, state.get_position());
266 true
267 }
268 else if ch.is_ascii_hexdigit() {
270 let mut has_digits = false;
271 while let Some(ch) = state.peek() {
272 if ch.is_ascii_hexdigit() {
273 state.advance(ch.len_utf8());
274 has_digits = true;
275 }
276 else {
277 break;
278 }
279 }
280 if has_digits {
281 state.add_token(MsilTokenType::NumberToken, start_pos, state.get_position());
282 return true;
283 }
284 false
285 }
286 else {
287 false
288 }
289 }
290 else {
291 false
292 }
293 }
294
295 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
297 let start_pos = state.get_position();
298
299 if let Some('"') = state.peek() {
300 state.advance(1);
301
302 while let Some(ch) = state.peek() {
303 if ch == '"' {
304 state.advance(1);
305 break;
306 }
307 else if ch == '\\' {
308 state.advance(1);
309 if let Some(_) = state.peek() {
310 state.advance(1)
311 }
312 }
313 else {
314 state.advance(ch.len_utf8())
315 }
316 }
317
318 state.add_token(MsilTokenType::StringToken, start_pos, state.get_position());
319 true
320 }
321 else {
322 false
323 }
324 }
325
326 fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
328 let start_pos = state.get_position();
329
330 if let Some(ch) = state.peek() {
331 let kind = match ch {
332 '{' => MsilTokenType::LeftBrace,
333 '}' => MsilTokenType::RightBrace,
334 '(' => MsilTokenType::LeftParen,
335 ')' => MsilTokenType::RightParen,
336 '[' => MsilTokenType::LeftBracket,
337 ']' => MsilTokenType::RightBracket,
338 '.' => MsilTokenType::Dot,
339 ':' => MsilTokenType::Colon,
340 ';' => MsilTokenType::Semicolon,
341 ',' => MsilTokenType::Comma,
342 '=' => MsilTokenType::Equal,
343 '/' => MsilTokenType::Slash,
344 _ => return false,
345 };
346
347 state.advance(ch.len_utf8());
348 state.add_token(kind, start_pos, state.get_position());
349 true
350 }
351 else {
352 false
353 }
354 }
355}
356
357impl Lexer<MsilLanguage> for MsilLexer {
358 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<MsilLanguage>) -> LexOutput<MsilLanguage> {
359 let mut state = State::new_with_cache(source, 0, cache);
360 let result = self.run(&mut state);
361 state.finish_with_cache(result, cache)
362 }
363}
364
365impl MsilLexer {
366 pub fn tokenize<'a>(&self, text: &'a str) -> Vec<oak_core::Token<<MsilLanguage as oak_core::Language>::TokenType>> {
368 let source = oak_core::SourceText::new(text);
369 let mut cache = oak_core::parser::session::ParseSession::<MsilLanguage>::default();
370 let mut state = State::new_with_cache(&source, 0, &mut cache);
371 let result = self.run(&mut state);
372 state.finish_with_cache(result, &mut cache).result.unwrap().0.to_vec()
373 }
374}