//! Hand-written lexer for MSIL source text.
use crate::{kind::MsilSyntaxKind, language::MsilLanguage};
use oak_core::{Lexer, LexerCache, LexerState, lexer::LexOutput, source::Source};

type State<'a, S> = LexerState<'a, S, MsilLanguage>;

#[derive(Clone, Debug)]
pub struct MsilLexer<'config> {
    _config: &'config MsilLanguage,
}

impl<'config> MsilLexer<'config> {
    pub fn new(config: &'config MsilLanguage) -> Self {
        Self { _config: config }
    }
}
impl MsilLexer<'_> {
    /// Runs the lexer over the remaining input, pushing tokens onto `state`
    /// until the end of the source is reached.
    pub fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
        let safe_point = state.get_position();
        while state.not_at_end() {
            if self.skip_whitespace(state) {
                continue;
            }

            if self.lex_newline(state) {
                continue;
            }

            if self.lex_comment(state) {
                continue;
            }

            if self.lex_string(state) {
                continue;
            }

            if self.lex_number(state) {
                continue;
            }

            if self.lex_identifier(state) {
                continue;
            }

            if self.lex_delimiter(state) {
                continue;
            }

            // No rule matched: consume one character and record it as an error token.
            if let Some(ch) = state.peek() {
                let start_pos = state.get_position();
                state.advance(ch.len_utf8());
                state.add_token(MsilSyntaxKind::Error, start_pos, state.get_position());
            }

            state.advance_if_dead_lock(safe_point);
        }

        state.add_eof();
        Ok(())
    }

    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        while let Some(ch) = state.peek() {
            if ch == ' ' || ch == '\t' {
                state.advance(ch.len_utf8());
            }
            else {
                break;
            }
        }

        if state.get_position() > start_pos {
            state.add_token(MsilSyntaxKind::Whitespace, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('\n') = state.peek() {
            state.advance(1);
            state.add_token(MsilSyntaxKind::Whitespace, start_pos, state.get_position());
            true
        }
        else if let Some('\r') = state.peek() {
            state.advance(1);
            if let Some('\n') = state.peek() {
                state.advance(1);
            }
            state.add_token(MsilSyntaxKind::Whitespace, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('/') = state.peek() {
            if let Some('/') = state.peek_next_n(1) {
                state.advance(2);
                while let Some(ch) = state.peek() {
                    if ch == '\n' || ch == '\r' {
                        break;
                    }
                    state.advance(ch.len_utf8());
                }
                state.add_token(MsilSyntaxKind::CommentToken, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if !ch.is_ascii_alphabetic() && ch != '_' && ch != '.' {
                return false;
            }

            while let Some(ch) = state.peek() {
                if ch.is_ascii_alphanumeric() || ch == '_' || ch == '.' {
                    state.advance(ch.len_utf8());
                }
                else {
                    break;
                }
            }

            // Compare the lexed text itself so keywords are recognised whether the
            // source returns a borrowed or an owned string.
            let text = state.get_text_in((start_pos..state.get_position()).into());
            let token_kind = match text.as_ref() {
                ".assembly" => MsilSyntaxKind::AssemblyKeyword,
                "extern" => MsilSyntaxKind::ExternKeyword,
                ".module" => MsilSyntaxKind::ModuleKeyword,
                ".class" => MsilSyntaxKind::ClassKeyword,
                ".method" => MsilSyntaxKind::MethodKeyword,
                "public" => MsilSyntaxKind::PublicKeyword,
                "private" => MsilSyntaxKind::PrivateKeyword,
                "static" => MsilSyntaxKind::StaticKeyword,
                _ => MsilSyntaxKind::IdentifierToken,
            };

            state.add_token(token_kind, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if !ch.is_ascii_digit() {
                return false;
            }

            while let Some(ch) = state.peek() {
                if ch.is_ascii_digit() {
                    state.advance(ch.len_utf8());
                }
                else {
                    break;
                }
            }

            // Optional fractional part: consume `.digits` only when a digit follows the dot.
            if let Some('.') = state.peek() {
                if let Some(next_ch) = state.peek_next_n(1) {
                    if next_ch.is_ascii_digit() {
                        state.advance(1);
                        while let Some(ch) = state.peek() {
                            if ch.is_ascii_digit() {
                                state.advance(ch.len_utf8());
                            }
                            else {
                                break;
                            }
                        }
                    }
                }
            }

            state.add_token(MsilSyntaxKind::NumberToken, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('"') = state.peek() {
            state.advance(1);

            while let Some(ch) = state.peek() {
                if ch == '"' {
                    state.advance(1);
                    break;
                }
                else if ch == '\\' {
                    state.advance(1);
                    // Consume the escaped character, which may be multi-byte.
                    if let Some(escaped) = state.peek() {
                        state.advance(escaped.len_utf8());
                    }
                }
                else {
                    state.advance(ch.len_utf8());
                }
            }

            state.add_token(MsilSyntaxKind::StringToken, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            let kind = match ch {
                '{' => MsilSyntaxKind::LeftBrace,
                '}' => MsilSyntaxKind::RightBrace,
                '(' => MsilSyntaxKind::LeftParen,
                ')' => MsilSyntaxKind::RightParen,
                '[' => MsilSyntaxKind::LeftBracket,
                ']' => MsilSyntaxKind::RightBracket,
                '.' => MsilSyntaxKind::Dot,
                ':' => MsilSyntaxKind::Colon,
                ';' => MsilSyntaxKind::Semicolon,
                ',' => MsilSyntaxKind::Comma,
                _ => return false,
            };

            state.advance(ch.len_utf8());
            state.add_token(kind, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }
}

impl Lexer<MsilLanguage> for MsilLexer<'_> {
    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<MsilLanguage>) -> LexOutput<MsilLanguage> {
        let mut state = State::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        state.finish_with_cache(result, cache)
    }
}

impl MsilLexer<'_> {
    /// Convenience helper that lexes `text` in a fresh parse session and
    /// returns the token list; panics if the lexer does not produce one.
    pub fn tokenize(&self, text: &str) -> Vec<oak_core::Token<<MsilLanguage as oak_core::Language>::TokenType>> {
        let source = oak_core::SourceText::new(text);
        let mut cache = oak_core::parser::session::ParseSession::<MsilLanguage>::default();
        let mut state = State::new_with_cache(&source, 0, &mut cache);
        let result = self.run(&mut state);
        state.finish_with_cache(result, &mut cache).result.unwrap().to_vec()
    }
}
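
// A minimal usage sketch, written as a test. It assumes `MsilLanguage`
// implements `Default` (or has an equivalent zero-argument constructor);
// adjust the construction to the crate's actual API if that assumption does
// not hold. The assertion is deliberately weak: it only checks that lexing a
// small MSIL snippet succeeds and yields tokens, since token field names and
// exact boundaries depend on `oak_core` internals not shown in this file.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenize_small_snippet() {
        // Hypothetical constructor; see the note above.
        let language = MsilLanguage::default();
        let lexer = MsilLexer::new(&language);
        let tokens = lexer.tokenize(".assembly extern mscorlib { }");
        assert!(!tokens.is_empty());
    }
}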