oak_msil/lexer/
mod.rs

1use crate::{kind::MsilSyntaxKind, language::MsilLanguage};
2use oak_core::{
3    IncrementalCache, Lexer,
4    lexer::{LexOutput, LexerState},
5    source::Source,
6};
7
8#[derive(Clone)]
9pub struct MsilLexer<'config> {
10    config: &'config MsilLanguage,
11}
12
13impl<'config> MsilLexer<'config> {
14    pub fn new(config: &'config MsilLanguage) -> Self {
15        Self { config }
16    }
17
18    /// 跳过空白字符
19    fn skip_whitespace<S: Source>(&self, state: &mut LexerState<S, MsilLanguage>) -> bool {
20        let start_pos = state.get_position();
21
22        while let Some(ch) = state.peek() {
23            if ch == ' ' || ch == '\t' {
24                state.advance(ch.len_utf8());
25            }
26            else {
27                break;
28            }
29        }
30
31        if state.get_position() > start_pos {
32            state.add_token(MsilSyntaxKind::Whitespace, start_pos, state.get_position());
33            true
34        }
35        else {
36            false
37        }
38    }
39
40    /// 处理换行
41    fn lex_newline<S: Source>(&self, state: &mut LexerState<S, MsilLanguage>) -> bool {
42        let start_pos = state.get_position();
43
44        if let Some('\n') = state.peek() {
45            state.advance(1);
46            state.add_token(MsilSyntaxKind::Whitespace, start_pos, state.get_position());
47            true
48        }
49        else if let Some('\r') = state.peek() {
50            state.advance(1);
51            if let Some('\n') = state.peek() {
52                state.advance(1);
53            }
54            state.add_token(MsilSyntaxKind::Whitespace, start_pos, state.get_position());
55            true
56        }
57        else {
58            false
59        }
60    }
61
62    /// 处理注释
63    fn lex_comment<S: Source>(&self, state: &mut LexerState<S, MsilLanguage>) -> bool {
64        let start_pos = state.get_position();
65
66        if let Some('/') = state.peek() {
67            if let Some('/') = state.peek_next_n(1) {
68                // 行注释
69                state.advance(2);
70                while let Some(ch) = state.peek() {
71                    if ch == '\n' || ch == '\r' {
72                        break;
73                    }
74                    state.advance(ch.len_utf8());
75                }
76                state.add_token(MsilSyntaxKind::CommentToken, start_pos, state.get_position());
77                return true;
78            }
79        }
80
81        false
82    }
83
84    /// 处理标识符和关键字
85    fn lex_identifier<S: Source>(&self, state: &mut LexerState<S, MsilLanguage>) -> bool {
86        let start_pos = state.get_position();
87
88        if let Some(ch) = state.peek() {
89            if !ch.is_ascii_alphabetic() && ch != '_' && ch != '.' {
90                return false;
91            }
92
93            // 收集标识符字符
94            while let Some(ch) = state.peek() {
95                if ch.is_ascii_alphanumeric() || ch == '_' || ch == '.' {
96                    state.advance(ch.len_utf8());
97                }
98                else {
99                    break;
100                }
101            }
102
103            // 检查是否是关键字
104            let text = state.get_text_in((start_pos..state.get_position()).into());
105            let token_kind = match text {
106                ".assembly" => MsilSyntaxKind::AssemblyKeyword,
107                "extern" => MsilSyntaxKind::ExternKeyword,
108                ".module" => MsilSyntaxKind::ModuleKeyword,
109                ".class" => MsilSyntaxKind::ClassKeyword,
110                ".method" => MsilSyntaxKind::MethodKeyword,
111                "public" => MsilSyntaxKind::PublicKeyword,
112                "private" => MsilSyntaxKind::PrivateKeyword,
113                "static" => MsilSyntaxKind::StaticKeyword,
114                _ => MsilSyntaxKind::IdentifierToken,
115            };
116
117            state.add_token(token_kind, start_pos, state.get_position());
118            true
119        }
120        else {
121            false
122        }
123    }
124
125    /// 处理数字
126    fn lex_number<S: Source>(&self, state: &mut LexerState<S, MsilLanguage>) -> bool {
127        let start_pos = state.get_position();
128
129        if let Some(ch) = state.peek() {
130            if !ch.is_ascii_digit() {
131                return false;
132            }
133
134            // 处理整数部分
135            while let Some(ch) = state.peek() {
136                if ch.is_ascii_digit() {
137                    state.advance(ch.len_utf8());
138                }
139                else {
140                    break;
141                }
142            }
143
144            // 处理小数点
145            if let Some('.') = state.peek() {
146                if let Some(next_ch) = state.peek_next_n(1) {
147                    if next_ch.is_ascii_digit() {
148                        state.advance(1); // 跳过小数点
149                        while let Some(ch) = state.peek() {
150                            if ch.is_ascii_digit() {
151                                state.advance(ch.len_utf8());
152                            }
153                            else {
154                                break;
155                            }
156                        }
157                    }
158                }
159            }
160
161            state.add_token(MsilSyntaxKind::NumberToken, start_pos, state.get_position());
162            true
163        }
164        else {
165            false
166        }
167    }
168
169    /// 处理字符串
170    fn lex_string<S: Source>(&self, state: &mut LexerState<S, MsilLanguage>) -> bool {
171        let start_pos = state.get_position();
172
173        if let Some('"') = state.peek() {
174            state.advance(1); // 跳过开始引号
175
176            while let Some(ch) = state.peek() {
177                if ch == '"' {
178                    state.advance(1); // 跳过结束引号
179                    break;
180                }
181                else if ch == '\\' {
182                    state.advance(1); // 跳过转义字符
183                    if let Some(_) = state.peek() {
184                        state.advance(1); // 跳过被转义的字符
185                    }
186                }
187                else {
188                    state.advance(ch.len_utf8());
189                }
190            }
191
192            state.add_token(MsilSyntaxKind::StringToken, start_pos, state.get_position());
193            true
194        }
195        else {
196            false
197        }
198    }
199
200    /// 处理分隔符
201    fn lex_delimiter<S: Source>(&self, state: &mut LexerState<S, MsilLanguage>) -> bool {
202        let start_pos = state.get_position();
203
204        if let Some(ch) = state.peek() {
205            let token_kind = match ch {
206                '{' => MsilSyntaxKind::LeftBrace,
207                '}' => MsilSyntaxKind::RightBrace,
208                '(' => MsilSyntaxKind::LeftParen,
209                ')' => MsilSyntaxKind::RightParen,
210                '[' => MsilSyntaxKind::LeftBracket,
211                ']' => MsilSyntaxKind::RightBracket,
212                '.' => MsilSyntaxKind::Dot,
213                ':' => MsilSyntaxKind::Colon,
214                ';' => MsilSyntaxKind::Semicolon,
215                ',' => MsilSyntaxKind::Comma,
216                _ => return false,
217            };
218
219            state.advance(ch.len_utf8());
220            state.add_token(token_kind, start_pos, state.get_position());
221            true
222        }
223        else {
224            false
225        }
226    }
227}
228
229impl<'config> Lexer<MsilLanguage> for MsilLexer<'config> {
230    fn lex_incremental(
231        &self,
232        source: impl Source,
233        changed: usize,
234        cache: IncrementalCache<MsilLanguage>,
235    ) -> LexOutput<MsilLanguage> {
236        let mut state = LexerState::new_with_cache(source, changed, cache);
237
238        while state.not_at_end() {
239            // 尝试各种词法规则
240            if self.skip_whitespace(&mut state) {
241                continue;
242            }
243
244            if self.lex_newline(&mut state) {
245                continue;
246            }
247
248            if self.lex_comment(&mut state) {
249                continue;
250            }
251
252            if self.lex_string(&mut state) {
253                continue;
254            }
255
256            if self.lex_number(&mut state) {
257                continue;
258            }
259
260            if self.lex_identifier(&mut state) {
261                continue;
262            }
263
264            if self.lex_delimiter(&mut state) {
265                continue;
266            }
267
268            // 如果所有规则都不匹配,跳过当前字符并标记为错误
269            let start_pos = state.get_position();
270            if let Some(ch) = state.peek() {
271                state.advance(ch.len_utf8());
272                state.add_token(MsilSyntaxKind::Error, start_pos, state.get_position());
273            }
274        }
275
276        // 添加 EOF token
277        let eof_pos = state.get_position();
278        state.add_token(MsilSyntaxKind::Eof, eof_pos, eof_pos);
279
280        state.finish(Ok(()))
281    }
282}