Skip to main content

oak_msil/lexer/
mod.rs

1use crate::{kind::MsilSyntaxKind, language::MsilLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, lexer::LexOutput, source::Source};
3
4type State<'a, S> = LexerState<'a, S, MsilLanguage>;
5
6#[derive(Clone, Debug)]
7pub struct MsilLexer<'config> {
8    _config: &'config MsilLanguage,
9}
10
11impl<'config> MsilLexer<'config> {
12    pub fn new(config: &'config MsilLanguage) -> Self {
13        Self { _config: config }
14    }
15}
16
17impl MsilLexer<'_> {
18    pub fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), oak_core::OakError> {
19        let safe_point = state.get_position();
20        while state.not_at_end() {
21            if self.skip_whitespace(state) {
22                continue;
23            }
24
25            if self.lex_newline(state) {
26                continue;
27            }
28
29            if self.lex_comment(state) {
30                continue;
31            }
32
33            if self.lex_string(state) {
34                continue;
35            }
36
37            if self.lex_number(state) {
38                continue;
39            }
40
41            if self.lex_identifier(state) {
42                continue;
43            }
44
45            if self.lex_delimiter(state) {
46                continue;
47            }
48
49            // 如果没有匹配任何规则,跳过当前字符
50            if let Some(ch) = state.peek() {
51                let start_pos = state.get_position();
52                state.advance(ch.len_utf8());
53                state.add_token(MsilSyntaxKind::Error, start_pos, state.get_position());
54            }
55
56            state.advance_if_dead_lock(safe_point);
57        }
58
59        state.add_eof();
60        Ok(())
61    }
62
63    /// 跳过空白字符
64    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
65        let start_pos = state.get_position();
66
67        while let Some(ch) = state.peek() {
68            if ch == ' ' || ch == '\t' {
69                state.advance(ch.len_utf8());
70            }
71            else {
72                break;
73            }
74        }
75
76        if state.get_position() > start_pos {
77            state.add_token(MsilSyntaxKind::Whitespace, start_pos, state.get_position());
78            true
79        }
80        else {
81            false
82        }
83    }
84
85    /// 处理换行
86    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
87        let start_pos = state.get_position();
88
89        if let Some('\n') = state.peek() {
90            state.advance(1);
91            state.add_token(MsilSyntaxKind::Whitespace, start_pos, state.get_position());
92            true
93        }
94        else if let Some('\r') = state.peek() {
95            state.advance(1);
96            if let Some('\n') = state.peek() {
97                state.advance(1);
98            }
99            state.add_token(MsilSyntaxKind::Whitespace, start_pos, state.get_position());
100            true
101        }
102        else {
103            false
104        }
105    }
106
107    /// 处理注释
108    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
109        let start_pos = state.get_position();
110
111        if let Some('/') = state.peek() {
112            if let Some('/') = state.peek_next_n(1) {
113                // 行注释
114                state.advance(2);
115                while let Some(ch) = state.peek() {
116                    if ch == '\n' || ch == '\r' {
117                        break;
118                    }
119                    state.advance(ch.len_utf8());
120                }
121                state.add_token(MsilSyntaxKind::CommentToken, start_pos, state.get_position());
122                return true;
123            }
124        }
125
126        false
127    }
128
129    /// 处理标识符和关键字
130    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
131        let start_pos = state.get_position();
132
133        if let Some(ch) = state.peek() {
134            if !ch.is_ascii_alphabetic() && ch != '_' && ch != '.' {
135                return false;
136            }
137
138            // 收集标识符字符
139            while let Some(ch) = state.peek() {
140                if ch.is_ascii_alphanumeric() || ch == '_' || ch == '.' {
141                    state.advance(ch.len_utf8());
142                }
143                else {
144                    break;
145                }
146            }
147
148            // 检查是否是关键字
149            let text = state.get_text_in((start_pos..state.get_position()).into());
150            let token_kind = match text {
151                std::borrow::Cow::Borrowed(".assembly") => MsilSyntaxKind::AssemblyKeyword,
152                std::borrow::Cow::Borrowed("extern") => MsilSyntaxKind::ExternKeyword,
153                std::borrow::Cow::Borrowed(".module") => MsilSyntaxKind::ModuleKeyword,
154                std::borrow::Cow::Borrowed(".class") => MsilSyntaxKind::ClassKeyword,
155                std::borrow::Cow::Borrowed(".method") => MsilSyntaxKind::MethodKeyword,
156                std::borrow::Cow::Borrowed("public") => MsilSyntaxKind::PublicKeyword,
157                std::borrow::Cow::Borrowed("private") => MsilSyntaxKind::PrivateKeyword,
158                std::borrow::Cow::Borrowed("static") => MsilSyntaxKind::StaticKeyword,
159                _ => MsilSyntaxKind::IdentifierToken,
160            };
161
162            state.add_token(token_kind, start_pos, state.get_position());
163            true
164        }
165        else {
166            false
167        }
168    }
169
170    /// 处理数字
171    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
172        let start_pos = state.get_position();
173
174        if let Some(ch) = state.peek() {
175            if !ch.is_ascii_digit() {
176                return false;
177            }
178
179            // 处理整数部分
180            while let Some(ch) = state.peek() {
181                if ch.is_ascii_digit() {
182                    state.advance(ch.len_utf8());
183                }
184                else {
185                    break;
186                }
187            }
188
189            // 处理小数点
190            if let Some('.') = state.peek() {
191                if let Some(next_ch) = state.peek_next_n(1) {
192                    if next_ch.is_ascii_digit() {
193                        state.advance(1); // 跳过小数点
194                        while let Some(ch) = state.peek() {
195                            if ch.is_ascii_digit() {
196                                state.advance(ch.len_utf8());
197                            }
198                            else {
199                                break;
200                            }
201                        }
202                    }
203                }
204            }
205
206            state.add_token(MsilSyntaxKind::NumberToken, start_pos, state.get_position());
207            true
208        }
209        else {
210            false
211        }
212    }
213
214    /// 处理字符串
215    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
216        let start_pos = state.get_position();
217
218        if let Some('"') = state.peek() {
219            state.advance(1);
220
221            while let Some(ch) = state.peek() {
222                if ch == '"' {
223                    state.advance(1);
224                    break;
225                }
226                else if ch == '\\' {
227                    state.advance(1);
228                    if let Some(_) = state.peek() {
229                        state.advance(1);
230                    }
231                }
232                else {
233                    state.advance(ch.len_utf8());
234                }
235            }
236
237            state.add_token(MsilSyntaxKind::StringToken, start_pos, state.get_position());
238            true
239        }
240        else {
241            false
242        }
243    }
244
245    /// 处理分隔符
246    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
247        let start_pos = state.get_position();
248
249        if let Some(ch) = state.peek() {
250            let kind = match ch {
251                '{' => MsilSyntaxKind::LeftBrace,
252                '}' => MsilSyntaxKind::RightBrace,
253                '(' => MsilSyntaxKind::LeftParen,
254                ')' => MsilSyntaxKind::RightParen,
255                '[' => MsilSyntaxKind::LeftBracket,
256                ']' => MsilSyntaxKind::RightBracket,
257                '.' => MsilSyntaxKind::Dot,
258                ':' => MsilSyntaxKind::Colon,
259                ';' => MsilSyntaxKind::Semicolon,
260                ',' => MsilSyntaxKind::Comma,
261                _ => return false,
262            };
263
264            state.advance(ch.len_utf8());
265            state.add_token(kind, start_pos, state.get_position());
266            true
267        }
268        else {
269            false
270        }
271    }
272}
273
274impl Lexer<MsilLanguage> for MsilLexer<'_> {
275    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<MsilLanguage>) -> LexOutput<MsilLanguage> {
276        let mut state = State::new_with_cache(source, 0, cache);
277        let result = self.run(&mut state);
278        state.finish_with_cache(result, cache)
279    }
280}
281
282impl MsilLexer<'_> {
283    pub fn tokenize<'a>(&self, text: &'a str) -> Vec<oak_core::Token<<MsilLanguage as oak_core::Language>::TokenType>> {
284        let source = oak_core::SourceText::new(text);
285        let mut cache = oak_core::parser::session::ParseSession::<MsilLanguage>::default();
286        let mut state = State::new_with_cache(&source, 0, &mut cache);
287        let result = self.run(&mut state);
288        state.finish_with_cache(result, &mut cache).result.unwrap().to_vec()
289    }
290}