Skip to main content

oak_jasmin/lexer/
mod.rs

1use crate::{kind::JasminSyntaxKind, language::JasminLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, OakError, TextEdit, lexer::LexOutput, source::Source};
3
4type State<'a, S> = LexerState<'a, S, JasminLanguage>;
5
6/// Jasmin 词法分析器
7#[derive(Clone)]
8pub struct JasminLexer<'config> {
9    _config: &'config JasminLanguage,
10}
11
12impl<'config> JasminLexer<'config> {
13    pub fn new(config: &'config JasminLanguage) -> Self {
14        Self { _config: config }
15    }
16
17    /// 判断是关键字还是标识符
18    fn keyword_or_identifier(&self, text: &str) -> JasminSyntaxKind {
19        match text {
20            ".class" => JasminSyntaxKind::ClassKw,
21            ".version" => JasminSyntaxKind::VersionKw,
22            ".method" => JasminSyntaxKind::MethodKw,
23            ".field" => JasminSyntaxKind::FieldKw,
24            ".string" => JasminSyntaxKind::StringKw,
25            ".source" => JasminSyntaxKind::SourceFileKw,
26            ".stack" => JasminSyntaxKind::StackKw,
27            ".locals" => JasminSyntaxKind::LocalsKw,
28            ".end" => JasminSyntaxKind::EndKw,
29            ".compiled" => JasminSyntaxKind::CompiledKw,
30            ".from" => JasminSyntaxKind::FromKw,
31            ".inner" => JasminSyntaxKind::InnerClassKw,
32            ".nest" => JasminSyntaxKind::NestMembersKw,
33            ".bootstrap" => JasminSyntaxKind::BootstrapMethodKw,
34
35            "public" => JasminSyntaxKind::Public,
36            "private" => JasminSyntaxKind::Private,
37            "protected" => JasminSyntaxKind::Protected,
38            "static" => JasminSyntaxKind::Static,
39            "super" => JasminSyntaxKind::Super,
40            "final" => JasminSyntaxKind::Final,
41            "abstract" => JasminSyntaxKind::Abstract,
42            "synchronized" => JasminSyntaxKind::Synchronized,
43            "native" => JasminSyntaxKind::Native,
44            "synthetic" => JasminSyntaxKind::Synthetic,
45            "deprecated" => JasminSyntaxKind::Deprecated,
46            "varargs" => JasminSyntaxKind::Varargs,
47
48            "aload_0" => JasminSyntaxKind::ALoad0,
49            "aload_1" => JasminSyntaxKind::ALoad1,
50            "aload_2" => JasminSyntaxKind::ALoad2,
51            "aload_3" => JasminSyntaxKind::ALoad3,
52            "iload_0" => JasminSyntaxKind::ILoad0,
53            "iload_1" => JasminSyntaxKind::ILoad1,
54            "iload_2" => JasminSyntaxKind::ILoad2,
55            "iload_3" => JasminSyntaxKind::ILoad3,
56            "ldc" => JasminSyntaxKind::Ldc,
57            "ldc_w" => JasminSyntaxKind::LdcW,
58            "ldc2_w" => JasminSyntaxKind::Ldc2W,
59            "invokespecial" => JasminSyntaxKind::InvokeSpecial,
60            "invokevirtual" => JasminSyntaxKind::InvokeVirtual,
61            "invokestatic" => JasminSyntaxKind::InvokeStatic,
62            "invokeinterface" => JasminSyntaxKind::InvokeInterface,
63            "invokedynamic" => JasminSyntaxKind::InvokeDynamic,
64            "getstatic" => JasminSyntaxKind::GetStatic,
65            "putstatic" => JasminSyntaxKind::PutStatic,
66            "getfield" => JasminSyntaxKind::GetField,
67            "putfield" => JasminSyntaxKind::PutField,
68            "return" => JasminSyntaxKind::Return,
69            "areturn" => JasminSyntaxKind::AReturn,
70            "ireturn" => JasminSyntaxKind::IReturn,
71            "pop" => JasminSyntaxKind::Pop,
72            "new" => JasminSyntaxKind::New,
73
74            _ => {
75                // 检查是否是类型描述符
76                if self.is_type_descriptor(text) { JasminSyntaxKind::TypeDescriptor } else { JasminSyntaxKind::IdentifierToken }
77            }
78        }
79    }
80
81    /// 检查是否是类型描述符
82    fn is_type_descriptor(&self, text: &str) -> bool {
83        if text.is_empty() {
84            return false;
85        }
86
87        // 基本类型
88        if matches!(text, "B" | "C" | "D" | "F" | "I" | "J" | "S" | "Z" | "V") {
89            return true;
90        }
91
92        // 数组类型
93        if text.starts_with('[') {
94            return true;
95        }
96
97        // 对象类型
98        if text.starts_with('L') && text.ends_with(';') {
99            return true;
100        }
101
102        false
103    }
104
105    /// 跳过空白字符
106    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
107        let start_pos = state.get_position();
108        let mut consumed = false;
109
110        while let Some(ch) = state.peek() {
111            if ch.is_whitespace() {
112                consumed = true;
113                state.advance(ch.len_utf8());
114            }
115            else {
116                break;
117            }
118        }
119
120        if consumed {
121            state.add_token(JasminSyntaxKind::Whitespace, start_pos, state.get_position());
122            true
123        }
124        else {
125            false
126        }
127    }
128
129    /// 跳过注释
130    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
131        if let Some(ch) = state.peek() {
132            if ch == ';' {
133                let start_pos = state.get_position();
134                // 跳过到行尾
135                while let Some(ch) = state.peek() {
136                    state.advance(ch.len_utf8());
137                    if ch == '\n' {
138                        break;
139                    }
140                }
141                state.add_token(JasminSyntaxKind::Comment, start_pos, state.get_position());
142                return true;
143            }
144        }
145        false
146    }
147
148    /// 处理字符串字面量
149    fn lex_string<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
150        if let Some(ch) = state.peek() {
151            if ch == '"' {
152                let start_pos = state.get_position();
153                state.advance(1); // 跳过开始的引号
154
155                while let Some(ch) = state.peek() {
156                    if ch == '"' {
157                        state.advance(1); // 跳过结束的引号
158                        break;
159                    }
160                    else if ch == '\\' {
161                        state.advance(1); // 跳过转义字符
162                        if state.peek().is_some() {
163                            state.advance(1); // 跳过被转义的字符
164                        }
165                    }
166                    else {
167                        state.advance(ch.len_utf8());
168                    }
169                }
170
171                state.add_token(JasminSyntaxKind::StringLiteral, start_pos, state.get_position());
172                return true;
173            }
174        }
175        false
176    }
177
178    /// 处理数字字面量
179    fn lex_number<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
180        let start_pos = state.get_position();
181
182        if let Some(first) = state.peek() {
183            // 只处理以数字开头的情况,简化逻辑
184            if !first.is_ascii_digit() {
185                return false;
186            }
187
188            // 消费数字
189            while let Some(ch) = state.peek() {
190                if ch.is_ascii_digit() {
191                    state.advance(ch.len_utf8());
192                }
193                else if ch == '.' {
194                    // 浮点数
195                    state.advance(1);
196                    while let Some(ch) = state.peek() {
197                        if ch.is_ascii_digit() {
198                            state.advance(ch.len_utf8());
199                        }
200                        else {
201                            break;
202                        }
203                    }
204                    break;
205                }
206                else {
207                    break;
208                }
209            }
210
211            state.add_token(JasminSyntaxKind::Number, start_pos, state.get_position());
212            return true;
213        }
214
215        false
216    }
217
218    /// 处理标识符或关键字
219    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
220        let start = state.get_position();
221        let first = match state.peek() {
222            Some(ch) => ch,
223            None => return false,
224        };
225
226        // 标识符必须以字母、下划线或点开始
227        if !first.is_ascii_alphabetic() && first != '_' && first != '.' {
228            return false;
229        }
230
231        // 消费第一个字符
232        state.advance(first.len_utf8());
233
234        // 消费后续字符
235        while let Some(ch) = state.peek() {
236            if ch.is_ascii_alphanumeric() || ch == '_' || ch == '/' || ch == '$' || ch == '<' || ch == '>' {
237                state.advance(ch.len_utf8());
238            }
239            else {
240                break;
241            }
242        }
243
244        let end = state.get_position();
245        let text = state.get_text_in(oak_core::Range { start, end });
246        let kind = self.keyword_or_identifier(&text);
247        state.add_token(kind, start, state.get_position());
248        true
249    }
250
251    /// 处理操作符和分隔符
252    fn lex_operator_or_delimiter<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
253        let start = state.get_position();
254        let ch = match state.peek() {
255            Some(ch) => ch,
256            None => return false,
257        };
258
259        let kind = match ch {
260            '{' => JasminSyntaxKind::LeftBrace,
261            '}' => JasminSyntaxKind::RightBrace,
262            '(' => JasminSyntaxKind::LeftParen,
263            ')' => JasminSyntaxKind::RightParen,
264            '[' => JasminSyntaxKind::LeftBracket,
265            ']' => JasminSyntaxKind::RightBracket,
266            ':' => JasminSyntaxKind::Colon,
267            ';' => JasminSyntaxKind::Semicolon,
268            '.' => JasminSyntaxKind::Dot,
269            ',' => JasminSyntaxKind::Comma,
270            '/' => JasminSyntaxKind::Slash,
271            _ => return false,
272        };
273
274        state.advance(ch.len_utf8());
275        state.add_token(kind, start, state.get_position());
276        true
277    }
278
279    /// 主要的词法分析循环
280    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
281        while state.not_at_end() {
282            let safe_point = state.get_position();
283
284            // 尝试各种词法规则
285            if self.skip_whitespace(state) {
286                continue;
287            }
288
289            if self.skip_comment(state) {
290                continue;
291            }
292
293            if self.lex_string(state) {
294                continue;
295            }
296
297            if self.lex_number(state) {
298                continue;
299            }
300
301            if self.lex_identifier_or_keyword(state) {
302                continue;
303            }
304
305            if self.lex_operator_or_delimiter(state) {
306                continue;
307            }
308
309            // 如果所有规则都不匹配,跳过当前字符并标记为错误
310            let start_pos = state.get_position();
311            if let Some(ch) = state.peek() {
312                state.advance(ch.len_utf8());
313                state.add_token(JasminSyntaxKind::Error, start_pos, state.get_position());
314            }
315
316            state.advance_if_dead_lock(safe_point);
317        }
318
319        Ok(())
320    }
321}
322
323impl<'config> Lexer<JasminLanguage> for JasminLexer<'config> {
324    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<JasminLanguage>) -> LexOutput<JasminLanguage> {
325        let mut state = LexerState::new(source);
326        let result = self.run(&mut state);
327        if result.is_ok() {
328            state.add_eof();
329        }
330        state.finish_with_cache(result, cache)
331    }
332}