1#![doc = include_str!("readme.md")]
2pub mod token_type;
3pub use token_type::MsilTokenType;
4
5use crate::language::MsilLanguage;
6use oak_core::{Lexer, LexerCache, LexerState, lexer::LexOutput, source::Source};
7
8type State<'a, S> = LexerState<'a, S, MsilLanguage>;
9
10#[derive(Clone, Debug)]
11pub struct MsilLexer<'config> {
12 _config: &'config MsilLanguage,
13}
14
15impl<'config> MsilLexer<'config> {
16 pub fn new(config: &'config MsilLanguage) -> Self {
17 Self { _config: config }
18 }
19}
20
21impl MsilLexer<'_> {
22 pub fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
23 let safe_point = state.get_position();
24 while state.not_at_end() {
25 if self.skip_whitespace(state) {
26 continue;
27 }
28
29 if self.lex_newline(state) {
30 continue;
31 }
32
33 if self.lex_comment(state) {
34 continue;
35 }
36
37 if self.lex_string(state) {
38 continue;
39 }
40
41 if self.lex_number(state) {
42 continue;
43 }
44
45 if self.lex_identifier(state) {
46 continue;
47 }
48
49 if self.lex_delimiter(state) {
50 continue;
51 }
52
53 if let Some(ch) = state.peek() {
55 let start_pos = state.get_position();
56 state.advance(ch.len_utf8());
57 state.add_token(MsilTokenType::Error, start_pos, state.get_position())
58 }
59
60 state.advance_if_dead_lock(safe_point)
61 }
62
63 state.add_eof();
64 Ok(())
65 }
66
67 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
69 let start_pos = state.get_position();
70
71 while let Some(ch) = state.peek() {
72 if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
73 }
74
75 if state.get_position() > start_pos {
76 state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
77 true
78 }
79 else {
80 false
81 }
82 }
83
84 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
86 let start_pos = state.get_position();
87
88 if let Some('\n') = state.peek() {
89 state.advance(1);
90 state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
91 true
92 }
93 else if let Some('\r') = state.peek() {
94 state.advance(1);
95 if let Some('\n') = state.peek() {
96 state.advance(1)
97 }
98 state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
99 true
100 }
101 else {
102 false
103 }
104 }
105
106 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
108 let start_pos = state.get_position();
109
110 if let Some('/') = state.peek() {
111 if let Some('/') = state.peek_next_n(1) {
112 state.advance(2);
114 while let Some(ch) = state.peek() {
115 if ch == '\n' || ch == '\r' {
116 break;
117 }
118 state.advance(ch.len_utf8())
119 }
120 state.add_token(MsilTokenType::CommentToken, start_pos, state.get_position());
121 return true;
122 }
123 }
124
125 false
126 }
127
128 fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
130 let start_pos = state.get_position();
131
132 if let Some(ch) = state.peek() {
133 if !ch.is_ascii_alphabetic() && ch != '_' && ch != '.' {
134 return false;
135 }
136
137 while let Some(ch) = state.peek() {
139 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '.' { state.advance(ch.len_utf8()) } else { break }
140 }
141
142 let text = state.get_text_in((start_pos..state.get_position()).into());
144 let token_kind = match text {
145 std::borrow::Cow::Borrowed(".assembly") => MsilTokenType::AssemblyKeyword,
146 std::borrow::Cow::Borrowed("extern") => MsilTokenType::ExternKeyword,
147 std::borrow::Cow::Borrowed(".module") => MsilTokenType::ModuleKeyword,
148 std::borrow::Cow::Borrowed(".class") => MsilTokenType::ClassKeyword,
149 std::borrow::Cow::Borrowed(".method") => MsilTokenType::MethodKeyword,
150 std::borrow::Cow::Borrowed("public") => MsilTokenType::PublicKeyword,
151 std::borrow::Cow::Borrowed("private") => MsilTokenType::PrivateKeyword,
152 std::borrow::Cow::Borrowed("static") => MsilTokenType::StaticKeyword,
153 std::borrow::Cow::Borrowed("void") => MsilTokenType::Keyword,
154 std::borrow::Cow::Borrowed("bool") => MsilTokenType::Keyword,
155 std::borrow::Cow::Borrowed("int8") => MsilTokenType::Keyword,
156 std::borrow::Cow::Borrowed("int16") => MsilTokenType::Keyword,
157 std::borrow::Cow::Borrowed("int32") => MsilTokenType::Keyword,
158 std::borrow::Cow::Borrowed("int64") => MsilTokenType::Keyword,
159 std::borrow::Cow::Borrowed("float32") => MsilTokenType::Keyword,
160 std::borrow::Cow::Borrowed("float64") => MsilTokenType::Keyword,
161 std::borrow::Cow::Borrowed("string") => MsilTokenType::Keyword,
162 std::borrow::Cow::Borrowed("object") => MsilTokenType::Keyword,
163 std::borrow::Cow::Borrowed("char") => MsilTokenType::Keyword,
164 std::borrow::Cow::Borrowed("unsigned") => MsilTokenType::Keyword,
165 std::borrow::Cow::Borrowed("extends") => MsilTokenType::Keyword,
166 std::borrow::Cow::Borrowed("implements") => MsilTokenType::Keyword,
167 std::borrow::Cow::Borrowed("auto") => MsilTokenType::Keyword,
168 std::borrow::Cow::Borrowed("ansi") => MsilTokenType::Keyword,
169 std::borrow::Cow::Borrowed("beforefieldinit") => MsilTokenType::Keyword,
170 std::borrow::Cow::Borrowed("sealed") => MsilTokenType::Keyword,
171 std::borrow::Cow::Borrowed("abstract") => MsilTokenType::Keyword,
172 std::borrow::Cow::Borrowed("serializable") => MsilTokenType::Keyword,
173 std::borrow::Cow::Borrowed("sequential") => MsilTokenType::Keyword,
174 std::borrow::Cow::Borrowed("explicit") => MsilTokenType::Keyword,
175 std::borrow::Cow::Borrowed("unicode") => MsilTokenType::Keyword,
176 std::borrow::Cow::Borrowed("autochar") => MsilTokenType::Keyword,
177 std::borrow::Cow::Borrowed("family") => MsilTokenType::Keyword,
178 std::borrow::Cow::Borrowed("assembly") => MsilTokenType::Keyword,
179 std::borrow::Cow::Borrowed("famandassem") => MsilTokenType::Keyword,
180 std::borrow::Cow::Borrowed("famorassem") => MsilTokenType::Keyword,
181 std::borrow::Cow::Borrowed("privatescope") => MsilTokenType::Keyword,
182 std::borrow::Cow::Borrowed("hidebysig") => MsilTokenType::Keyword,
183 std::borrow::Cow::Borrowed("specialname") => MsilTokenType::Keyword,
184 std::borrow::Cow::Borrowed("rtspecialname") => MsilTokenType::Keyword,
185 std::borrow::Cow::Borrowed("cil") => MsilTokenType::Keyword,
186 std::borrow::Cow::Borrowed("managed") => MsilTokenType::Keyword,
187 _ => MsilTokenType::IdentifierToken,
188 };
189
190 state.add_token(token_kind, start_pos, state.get_position());
191 true
192 }
193 else {
194 false
195 }
196 }
197
198 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
200 let start_pos = state.get_position();
201
202 if let Some(ch) = state.peek() {
203 if !ch.is_ascii_digit() {
204 return false;
205 }
206
207 while let Some(ch) = state.peek() {
209 if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
210 }
211
212 if let Some('.') = state.peek() {
214 if let Some(next_ch) = state.peek_next_n(1) {
215 if next_ch.is_ascii_digit() {
216 state.advance(1); while let Some(ch) = state.peek() {
218 if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
219 }
220 }
221 }
222 }
223
224 state.add_token(MsilTokenType::NumberToken, start_pos, state.get_position());
225 true
226 }
227 else {
228 false
229 }
230 }
231
232 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
234 let start_pos = state.get_position();
235
236 if let Some('"') = state.peek() {
237 state.advance(1);
238
239 while let Some(ch) = state.peek() {
240 if ch == '"' {
241 state.advance(1);
242 break;
243 }
244 else if ch == '\\' {
245 state.advance(1);
246 if let Some(_) = state.peek() {
247 state.advance(1)
248 }
249 }
250 else {
251 state.advance(ch.len_utf8())
252 }
253 }
254
255 state.add_token(MsilTokenType::StringToken, start_pos, state.get_position());
256 true
257 }
258 else {
259 false
260 }
261 }
262
263 fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
265 let start_pos = state.get_position();
266
267 if let Some(ch) = state.peek() {
268 let kind = match ch {
269 '{' => MsilTokenType::LeftBrace,
270 '}' => MsilTokenType::RightBrace,
271 '(' => MsilTokenType::LeftParen,
272 ')' => MsilTokenType::RightParen,
273 '[' => MsilTokenType::LeftBracket,
274 ']' => MsilTokenType::RightBracket,
275 '.' => MsilTokenType::Dot,
276 ':' => MsilTokenType::Colon,
277 ';' => MsilTokenType::Semicolon,
278 ',' => MsilTokenType::Comma,
279 '=' => MsilTokenType::Equal,
280 '/' => MsilTokenType::Slash,
281 _ => return false,
282 };
283
284 state.advance(ch.len_utf8());
285 state.add_token(kind, start_pos, state.get_position());
286 true
287 }
288 else {
289 false
290 }
291 }
292}
293
294impl Lexer<MsilLanguage> for MsilLexer<'_> {
295 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<MsilLanguage>) -> LexOutput<MsilLanguage> {
296 let mut state = State::new_with_cache(source, 0, cache);
297 let result = self.run(&mut state);
298 state.finish_with_cache(result, cache)
299 }
300}
301
302impl MsilLexer<'_> {
303 pub fn tokenize<'a>(&self, text: &'a str) -> Vec<oak_core::Token<<MsilLanguage as oak_core::Language>::TokenType>> {
304 let source = oak_core::SourceText::new(text);
305 let mut cache = oak_core::parser::session::ParseSession::<MsilLanguage>::default();
306 let mut state = State::new_with_cache(&source, 0, &mut cache);
307 let result = self.run(&mut state);
308 state.finish_with_cache(result, &mut cache).result.unwrap().to_vec()
309 }
310}