1#![doc = include_str!("readme.md")]
2pub mod token_type;
4pub use token_type::MsilTokenType;
5
6use crate::language::MsilLanguage;
7use oak_core::{Lexer, LexerCache, LexerState, lexer::LexOutput, source::Source};
8
9pub(crate) type State<'a, S> = LexerState<'a, S, MsilLanguage>;
10
11#[derive(Clone, Debug)]
13pub struct MsilLexer<'config> {
14 config: &'config MsilLanguage,
15}
16
17impl<'config> MsilLexer<'config> {
18 pub fn new(config: &'config MsilLanguage) -> Self {
20 Self { config }
21 }
22}
23
24impl MsilLexer<'_> {
25 pub fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
27 let safe_point = state.get_position();
28 while state.not_at_end() {
29 if self.skip_whitespace(state) {
30 continue;
31 }
32
33 if self.lex_newline(state) {
34 continue;
35 }
36
37 if self.lex_comment(state) {
38 continue;
39 }
40
41 if self.lex_string(state) {
42 continue;
43 }
44
45 if self.lex_number(state) {
46 continue;
47 }
48
49 if self.lex_identifier(state) {
50 continue;
51 }
52
53 if self.lex_delimiter(state) {
54 continue;
55 }
56
57 if let Some(ch) = state.peek() {
59 let start_pos = state.get_position();
60 state.advance(ch.len_utf8());
61 state.add_token(MsilTokenType::Error, start_pos, state.get_position())
62 }
63
64 state.advance_if_dead_lock(safe_point)
65 }
66
67 state.add_eof();
68 Ok(())
69 }
70
71 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
73 let start_pos = state.get_position();
74
75 while let Some(ch) = state.peek() {
76 if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
77 }
78
79 if state.get_position() > start_pos {
80 state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
81 true
82 }
83 else {
84 false
85 }
86 }
87
88 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
90 let start_pos = state.get_position();
91
92 if let Some('\n') = state.peek() {
93 state.advance(1);
94 state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
95 true
96 }
97 else if let Some('\r') = state.peek() {
98 state.advance(1);
99 if let Some('\n') = state.peek() {
100 state.advance(1)
101 }
102 state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
103 true
104 }
105 else {
106 false
107 }
108 }
109
110 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
112 let start_pos = state.get_position();
113
114 if state.starts_with("//") {
115 while let Some(ch) = state.peek() {
116 if ch == '\n' || ch == '\r' {
117 break;
118 }
119 state.advance(ch.len_utf8())
120 }
121 state.add_token(MsilTokenType::CommentToken, start_pos, state.get_position());
122 true
123 }
124 else if state.starts_with("/*") {
125 state.advance(2);
126 while let Some(ch) = state.peek() {
127 if state.starts_with("*/") {
128 state.advance(2);
129 break;
130 }
131 state.advance(ch.len_utf8())
132 }
133 state.add_token(MsilTokenType::CommentToken, start_pos, state.get_position());
134 true
135 }
136 else {
137 false
138 }
139 }
140
141 fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
143 let start_pos = state.get_position();
144
145 if let Some(ch) = state.peek() {
146 if !ch.is_ascii_alphabetic() && ch != '_' && ch != '.' {
147 return false;
148 }
149
150 while let Some(ch) = state.peek() {
152 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '.' { state.advance(ch.len_utf8()) } else { break }
153 }
154
155 let text = state.get_text_in((start_pos..state.get_position()).into());
157 let token_kind = match text {
158 std::borrow::Cow::Borrowed(".assembly") => MsilTokenType::AssemblyKeyword,
159 std::borrow::Cow::Borrowed("extern") => MsilTokenType::ExternKeyword,
160 std::borrow::Cow::Borrowed(".module") => MsilTokenType::ModuleKeyword,
161 std::borrow::Cow::Borrowed(".class") => MsilTokenType::ClassKeyword,
162 std::borrow::Cow::Borrowed(".method") => MsilTokenType::MethodKeyword,
163 std::borrow::Cow::Borrowed("public") => MsilTokenType::PublicKeyword,
164 std::borrow::Cow::Borrowed("private") => MsilTokenType::PrivateKeyword,
165 std::borrow::Cow::Borrowed("static") => MsilTokenType::StaticKeyword,
166 std::borrow::Cow::Borrowed("void") => MsilTokenType::Keyword,
167 std::borrow::Cow::Borrowed("bool") => MsilTokenType::Keyword,
168 std::borrow::Cow::Borrowed("int8") => MsilTokenType::Keyword,
169 std::borrow::Cow::Borrowed("int16") => MsilTokenType::Keyword,
170 std::borrow::Cow::Borrowed("int32") => MsilTokenType::Keyword,
171 std::borrow::Cow::Borrowed("int64") => MsilTokenType::Keyword,
172 std::borrow::Cow::Borrowed("float32") => MsilTokenType::Keyword,
173 std::borrow::Cow::Borrowed("float64") => MsilTokenType::Keyword,
174 std::borrow::Cow::Borrowed("string") => MsilTokenType::Keyword,
175 std::borrow::Cow::Borrowed("object") => MsilTokenType::Keyword,
176 std::borrow::Cow::Borrowed("char") => MsilTokenType::Keyword,
177 std::borrow::Cow::Borrowed("unsigned") => MsilTokenType::Keyword,
178 std::borrow::Cow::Borrowed("extends") => MsilTokenType::Keyword,
179 std::borrow::Cow::Borrowed("implements") => MsilTokenType::Keyword,
180 std::borrow::Cow::Borrowed("auto") => MsilTokenType::Keyword,
181 std::borrow::Cow::Borrowed("ansi") => MsilTokenType::Keyword,
182 std::borrow::Cow::Borrowed("beforefieldinit") => MsilTokenType::Keyword,
183 std::borrow::Cow::Borrowed("sealed") => MsilTokenType::Keyword,
184 std::borrow::Cow::Borrowed("abstract") => MsilTokenType::Keyword,
185 std::borrow::Cow::Borrowed("serializable") => MsilTokenType::Keyword,
186 std::borrow::Cow::Borrowed("sequential") => MsilTokenType::Keyword,
187 std::borrow::Cow::Borrowed("explicit") => MsilTokenType::Keyword,
188 std::borrow::Cow::Borrowed("unicode") => MsilTokenType::Keyword,
189 std::borrow::Cow::Borrowed("autochar") => MsilTokenType::Keyword,
190 std::borrow::Cow::Borrowed("family") => MsilTokenType::Keyword,
191 std::borrow::Cow::Borrowed("assembly") => MsilTokenType::Keyword,
192 std::borrow::Cow::Borrowed("famandassem") => MsilTokenType::Keyword,
193 std::borrow::Cow::Borrowed("famorassem") => MsilTokenType::Keyword,
194 std::borrow::Cow::Borrowed("privatescope") => MsilTokenType::Keyword,
195 std::borrow::Cow::Borrowed("hidebysig") => MsilTokenType::Keyword,
196 std::borrow::Cow::Borrowed("specialname") => MsilTokenType::Keyword,
197 std::borrow::Cow::Borrowed("rtspecialname") => MsilTokenType::Keyword,
198 std::borrow::Cow::Borrowed("cil") => MsilTokenType::Keyword,
199 std::borrow::Cow::Borrowed("managed") => MsilTokenType::Keyword,
200 _ => MsilTokenType::IdentifierToken,
201 };
202
203 state.add_token(token_kind, start_pos, state.get_position());
204 true
205 }
206 else {
207 false
208 }
209 }
210
211 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
213 let start_pos = state.get_position();
214
215 if let Some(ch) = state.peek() {
216 if !ch.is_ascii_digit() {
217 return false;
218 }
219
220 while let Some(ch) = state.peek() {
222 if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
223 }
224
225 if let Some('.') = state.peek() {
227 if let Some(next_ch) = state.peek_next_n(1) {
228 if next_ch.is_ascii_digit() {
229 state.advance(1); while let Some(ch) = state.peek() {
231 if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
232 }
233 }
234 }
235 }
236
237 state.add_token(MsilTokenType::NumberToken, start_pos, state.get_position());
238 true
239 }
240 else {
241 false
242 }
243 }
244
245 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
247 let start_pos = state.get_position();
248
249 if let Some('"') = state.peek() {
250 state.advance(1);
251
252 while let Some(ch) = state.peek() {
253 if ch == '"' {
254 state.advance(1);
255 break;
256 }
257 else if ch == '\\' {
258 state.advance(1);
259 if let Some(_) = state.peek() {
260 state.advance(1)
261 }
262 }
263 else {
264 state.advance(ch.len_utf8())
265 }
266 }
267
268 state.add_token(MsilTokenType::StringToken, start_pos, state.get_position());
269 true
270 }
271 else {
272 false
273 }
274 }
275
276 fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
278 let start_pos = state.get_position();
279
280 if let Some(ch) = state.peek() {
281 let kind = match ch {
282 '{' => MsilTokenType::LeftBrace,
283 '}' => MsilTokenType::RightBrace,
284 '(' => MsilTokenType::LeftParen,
285 ')' => MsilTokenType::RightParen,
286 '[' => MsilTokenType::LeftBracket,
287 ']' => MsilTokenType::RightBracket,
288 '.' => MsilTokenType::Dot,
289 ':' => MsilTokenType::Colon,
290 ';' => MsilTokenType::Semicolon,
291 ',' => MsilTokenType::Comma,
292 '=' => MsilTokenType::Equal,
293 '/' => MsilTokenType::Slash,
294 _ => return false,
295 };
296
297 state.advance(ch.len_utf8());
298 state.add_token(kind, start_pos, state.get_position());
299 true
300 }
301 else {
302 false
303 }
304 }
305}
306
307impl Lexer<MsilLanguage> for MsilLexer<'_> {
308 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<MsilLanguage>) -> LexOutput<MsilLanguage> {
309 let mut state = State::new_with_cache(source, 0, cache);
310 let result = self.run(&mut state);
311 state.finish_with_cache(result, cache)
312 }
313}
314
315impl MsilLexer<'_> {
316 pub fn tokenize<'a>(&self, text: &'a str) -> Vec<oak_core::Token<<MsilLanguage as oak_core::Language>::TokenType>> {
318 let source = oak_core::SourceText::new(text);
319 let mut cache = oak_core::parser::session::ParseSession::<MsilLanguage>::default();
320 let mut state = State::new_with_cache(&source, 0, &mut cache);
321 let result = self.run(&mut state);
322 state.finish_with_cache(result, &mut cache).result.unwrap().0.to_vec()
323 }
324}