Skip to main content

oak_msil/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3pub use token_type::MsilTokenType;
4
5use crate::language::MsilLanguage;
6use oak_core::{Lexer, LexerCache, LexerState, lexer::LexOutput, source::Source};
7
8type State<'a, S> = LexerState<'a, S, MsilLanguage>;
9
10#[derive(Clone, Debug)]
11pub struct MsilLexer<'config> {
12    _config: &'config MsilLanguage,
13}
14
15impl<'config> MsilLexer<'config> {
16    pub fn new(config: &'config MsilLanguage) -> Self {
17        Self { _config: config }
18    }
19}
20
21impl MsilLexer<'_> {
22    pub fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
23        let safe_point = state.get_position();
24        while state.not_at_end() {
25            if self.skip_whitespace(state) {
26                continue;
27            }
28
29            if self.lex_newline(state) {
30                continue;
31            }
32
33            if self.lex_comment(state) {
34                continue;
35            }
36
37            if self.lex_string(state) {
38                continue;
39            }
40
41            if self.lex_number(state) {
42                continue;
43            }
44
45            if self.lex_identifier(state) {
46                continue;
47            }
48
49            if self.lex_delimiter(state) {
50                continue;
51            }
52
53            // 如果没有匹配任何规则,跳过当前字符
54            if let Some(ch) = state.peek() {
55                let start_pos = state.get_position();
56                state.advance(ch.len_utf8());
57                state.add_token(MsilTokenType::Error, start_pos, state.get_position())
58            }
59
60            state.advance_if_dead_lock(safe_point)
61        }
62
63        state.add_eof();
64        Ok(())
65    }
66
67    /// 跳过空白字符
68    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
69        let start_pos = state.get_position();
70
71        while let Some(ch) = state.peek() {
72            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
73        }
74
75        if state.get_position() > start_pos {
76            state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
77            true
78        }
79        else {
80            false
81        }
82    }
83
84    /// 处理换行
85    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
86        let start_pos = state.get_position();
87
88        if let Some('\n') = state.peek() {
89            state.advance(1);
90            state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
91            true
92        }
93        else if let Some('\r') = state.peek() {
94            state.advance(1);
95            if let Some('\n') = state.peek() {
96                state.advance(1)
97            }
98            state.add_token(MsilTokenType::Whitespace, start_pos, state.get_position());
99            true
100        }
101        else {
102            false
103        }
104    }
105
106    /// 处理注释
107    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
108        let start_pos = state.get_position();
109
110        if let Some('/') = state.peek() {
111            if let Some('/') = state.peek_next_n(1) {
112                // 行注释
113                state.advance(2);
114                while let Some(ch) = state.peek() {
115                    if ch == '\n' || ch == '\r' {
116                        break;
117                    }
118                    state.advance(ch.len_utf8())
119                }
120                state.add_token(MsilTokenType::CommentToken, start_pos, state.get_position());
121                return true;
122            }
123        }
124
125        false
126    }
127
128    /// 处理标识符和关键字
129    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
130        let start_pos = state.get_position();
131
132        if let Some(ch) = state.peek() {
133            if !ch.is_ascii_alphabetic() && ch != '_' && ch != '.' {
134                return false;
135            }
136
137            // 收集标识符字符
138            while let Some(ch) = state.peek() {
139                if ch.is_ascii_alphanumeric() || ch == '_' || ch == '.' { state.advance(ch.len_utf8()) } else { break }
140            }
141
142            // 检查是否是关键字
143            let text = state.get_text_in((start_pos..state.get_position()).into());
144            let token_kind = match text {
145                std::borrow::Cow::Borrowed(".assembly") => MsilTokenType::AssemblyKeyword,
146                std::borrow::Cow::Borrowed("extern") => MsilTokenType::ExternKeyword,
147                std::borrow::Cow::Borrowed(".module") => MsilTokenType::ModuleKeyword,
148                std::borrow::Cow::Borrowed(".class") => MsilTokenType::ClassKeyword,
149                std::borrow::Cow::Borrowed(".method") => MsilTokenType::MethodKeyword,
150                std::borrow::Cow::Borrowed("public") => MsilTokenType::PublicKeyword,
151                std::borrow::Cow::Borrowed("private") => MsilTokenType::PrivateKeyword,
152                std::borrow::Cow::Borrowed("static") => MsilTokenType::StaticKeyword,
153                std::borrow::Cow::Borrowed("void") => MsilTokenType::Keyword,
154                std::borrow::Cow::Borrowed("bool") => MsilTokenType::Keyword,
155                std::borrow::Cow::Borrowed("int8") => MsilTokenType::Keyword,
156                std::borrow::Cow::Borrowed("int16") => MsilTokenType::Keyword,
157                std::borrow::Cow::Borrowed("int32") => MsilTokenType::Keyword,
158                std::borrow::Cow::Borrowed("int64") => MsilTokenType::Keyword,
159                std::borrow::Cow::Borrowed("float32") => MsilTokenType::Keyword,
160                std::borrow::Cow::Borrowed("float64") => MsilTokenType::Keyword,
161                std::borrow::Cow::Borrowed("string") => MsilTokenType::Keyword,
162                std::borrow::Cow::Borrowed("object") => MsilTokenType::Keyword,
163                std::borrow::Cow::Borrowed("char") => MsilTokenType::Keyword,
164                std::borrow::Cow::Borrowed("unsigned") => MsilTokenType::Keyword,
165                std::borrow::Cow::Borrowed("extends") => MsilTokenType::Keyword,
166                std::borrow::Cow::Borrowed("implements") => MsilTokenType::Keyword,
167                std::borrow::Cow::Borrowed("auto") => MsilTokenType::Keyword,
168                std::borrow::Cow::Borrowed("ansi") => MsilTokenType::Keyword,
169                std::borrow::Cow::Borrowed("beforefieldinit") => MsilTokenType::Keyword,
170                std::borrow::Cow::Borrowed("sealed") => MsilTokenType::Keyword,
171                std::borrow::Cow::Borrowed("abstract") => MsilTokenType::Keyword,
172                std::borrow::Cow::Borrowed("serializable") => MsilTokenType::Keyword,
173                std::borrow::Cow::Borrowed("sequential") => MsilTokenType::Keyword,
174                std::borrow::Cow::Borrowed("explicit") => MsilTokenType::Keyword,
175                std::borrow::Cow::Borrowed("unicode") => MsilTokenType::Keyword,
176                std::borrow::Cow::Borrowed("autochar") => MsilTokenType::Keyword,
177                std::borrow::Cow::Borrowed("family") => MsilTokenType::Keyword,
178                std::borrow::Cow::Borrowed("assembly") => MsilTokenType::Keyword,
179                std::borrow::Cow::Borrowed("famandassem") => MsilTokenType::Keyword,
180                std::borrow::Cow::Borrowed("famorassem") => MsilTokenType::Keyword,
181                std::borrow::Cow::Borrowed("privatescope") => MsilTokenType::Keyword,
182                std::borrow::Cow::Borrowed("hidebysig") => MsilTokenType::Keyword,
183                std::borrow::Cow::Borrowed("specialname") => MsilTokenType::Keyword,
184                std::borrow::Cow::Borrowed("rtspecialname") => MsilTokenType::Keyword,
185                std::borrow::Cow::Borrowed("cil") => MsilTokenType::Keyword,
186                std::borrow::Cow::Borrowed("managed") => MsilTokenType::Keyword,
187                _ => MsilTokenType::IdentifierToken,
188            };
189
190            state.add_token(token_kind, start_pos, state.get_position());
191            true
192        }
193        else {
194            false
195        }
196    }
197
198    /// 处理数字
199    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
200        let start_pos = state.get_position();
201
202        if let Some(ch) = state.peek() {
203            if !ch.is_ascii_digit() {
204                return false;
205            }
206
207            // 处理整数部分
208            while let Some(ch) = state.peek() {
209                if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
210            }
211
212            // 处理小数点
213            if let Some('.') = state.peek() {
214                if let Some(next_ch) = state.peek_next_n(1) {
215                    if next_ch.is_ascii_digit() {
216                        state.advance(1); // 跳过小数点
217                        while let Some(ch) = state.peek() {
218                            if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
219                        }
220                    }
221                }
222            }
223
224            state.add_token(MsilTokenType::NumberToken, start_pos, state.get_position());
225            true
226        }
227        else {
228            false
229        }
230    }
231
232    /// 处理字符串
233    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
234        let start_pos = state.get_position();
235
236        if let Some('"') = state.peek() {
237            state.advance(1);
238
239            while let Some(ch) = state.peek() {
240                if ch == '"' {
241                    state.advance(1);
242                    break;
243                }
244                else if ch == '\\' {
245                    state.advance(1);
246                    if let Some(_) = state.peek() {
247                        state.advance(1)
248                    }
249                }
250                else {
251                    state.advance(ch.len_utf8())
252                }
253            }
254
255            state.add_token(MsilTokenType::StringToken, start_pos, state.get_position());
256            true
257        }
258        else {
259            false
260        }
261    }
262
263    /// 处理分隔符
264    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
265        let start_pos = state.get_position();
266
267        if let Some(ch) = state.peek() {
268            let kind = match ch {
269                '{' => MsilTokenType::LeftBrace,
270                '}' => MsilTokenType::RightBrace,
271                '(' => MsilTokenType::LeftParen,
272                ')' => MsilTokenType::RightParen,
273                '[' => MsilTokenType::LeftBracket,
274                ']' => MsilTokenType::RightBracket,
275                '.' => MsilTokenType::Dot,
276                ':' => MsilTokenType::Colon,
277                ';' => MsilTokenType::Semicolon,
278                ',' => MsilTokenType::Comma,
279                '=' => MsilTokenType::Equal,
280                '/' => MsilTokenType::Slash,
281                _ => return false,
282            };
283
284            state.advance(ch.len_utf8());
285            state.add_token(kind, start_pos, state.get_position());
286            true
287        }
288        else {
289            false
290        }
291    }
292}
293
294impl Lexer<MsilLanguage> for MsilLexer<'_> {
295    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<MsilLanguage>) -> LexOutput<MsilLanguage> {
296        let mut state = State::new_with_cache(source, 0, cache);
297        let result = self.run(&mut state);
298        state.finish_with_cache(result, cache)
299    }
300}
301
302impl MsilLexer<'_> {
303    pub fn tokenize<'a>(&self, text: &'a str) -> Vec<oak_core::Token<<MsilLanguage as oak_core::Language>::TokenType>> {
304        let source = oak_core::SourceText::new(text);
305        let mut cache = oak_core::parser::session::ParseSession::<MsilLanguage>::default();
306        let mut state = State::new_with_cache(&source, 0, &mut cache);
307        let result = self.run(&mut state);
308        state.finish_with_cache(result, &mut cache).result.unwrap().to_vec()
309    }
310}