Skip to main content

oak_jasm/lexer/
mod.rs

1use crate::{language::JasmLanguage, syntax::JasmSyntaxKind};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{CommentConfig, LexOutput, StringConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialised to [`JasmLanguage`] for an arbitrary source type `S`.
type State<'a, S> = LexerState<'a, S, JasmLanguage>;

/// Comment scanning configuration: `//` introduces a line comment.
/// NOTE(review): both block markers are empty strings, presumably to disable block comments — confirm against `CommentConfig::scan`.
static JASM_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "", block_end: "", nested_blocks: false });
/// String scanning configuration: `"` is the only quote character and `\` is the escape character.
static JASM_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
13
/// Lexer for JASM source text.
///
/// Borrows a [`JasmLanguage`] configuration for its whole lifetime. The
/// configuration is stored but not read by the lexing rules in this file,
/// hence the underscore-prefixed field name.
#[derive(Clone, Debug)]
pub struct JasmLexer<'config> {
    /// Language configuration this lexer was created with.
    _config: &'config JasmLanguage,
}
18
19impl<'config> Lexer<JasmLanguage> for JasmLexer<'config> {
20    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], _cache: &'a mut impl LexerCache<JasmLanguage>) -> LexOutput<JasmLanguage> {
21        let mut state = State::new(source);
22        let result = self.run(&mut state);
23        state.finish(result)
24    }
25}
26
27impl<'config> JasmLexer<'config> {
28    pub fn new(config: &'config JasmLanguage) -> Self {
29        Self { _config: config }
30    }
31
32    /// 主要的词法分析循环
33    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
34        while state.not_at_end() {
35            let safe_point = state.get_position();
36
37            if self.skip_whitespace(state) {
38                continue;
39            }
40
41            if self.lex_newline(state) {
42                continue;
43            }
44
45            if self.skip_comment(state) {
46                continue;
47            }
48
49            if self.lex_string_literal(state) {
50                continue;
51            }
52
53            if self.lex_number_literal(state) {
54                continue;
55            }
56
57            if self.lex_identifier_or_keyword(state) {
58                continue;
59            }
60
61            if self.lex_punctuation(state) {
62                continue;
63            }
64
65            state.advance_if_dead_lock(safe_point);
66        }
67
68        // 添加 EOF token
69        state.add_eof();
70        Ok(())
71    }
72
73    /// 跳过空白字符(不包括换行符)
74    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
75        let start = state.get_position();
76
77        while let Some(ch) = state.peek() {
78            if ch == ' ' || ch == '\t' || ch == '\r' {
79                state.advance(ch.len_utf8());
80            }
81            else {
82                break;
83            }
84        }
85
86        if state.get_position() > start {
87            state.add_token(JasmSyntaxKind::Whitespace, start, state.get_position());
88            return true;
89        }
90
91        false
92    }
93
94    /// 处理换行
95    fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
96        let start = state.get_position();
97
98        if state.current() == Some('\n') {
99            state.advance(1);
100            state.add_token(JasmSyntaxKind::Newline, start, state.get_position());
101            return true;
102        }
103        false
104    }
105
    /// Attempts to lex a comment at the current position by delegating to
    /// `JASM_COMMENT.scan`, and returns that call's result.
    ///
    /// `JasmSyntaxKind::Comment` is passed for both kind arguments; presumably
    /// these are the line-comment and block-comment kinds — confirm against
    /// `CommentConfig::scan`.
    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
        JASM_COMMENT.scan(state, JasmSyntaxKind::Comment, JasmSyntaxKind::Comment)
    }
110
    /// Attempts to lex a string literal at the current position by delegating
    /// to `JASM_STRING.scan` with the `StringLiteral` kind, and returns that
    /// call's result.
    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
        JASM_STRING.scan(state, JasmSyntaxKind::StringLiteral)
    }
115
116    /// 处理数字字面量
117    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
118        let start = state.get_position();
119        let first = match state.peek() {
120            Some(c) => c,
121            None => return false,
122        };
123
124        // 检查是否以数字或负号开始
125        if !first.is_ascii_digit() && first != '-' && first != '+' {
126            return false;
127        }
128
129        // 如果是符号,检查后面是否跟数字
130        if first == '-' || first == '+' {
131            if let Some(next) = state.peek_next_n(1) {
132                if !next.is_ascii_digit() {
133                    return false;
134                }
135            }
136            else {
137                return false;
138            }
139        }
140
141        state.advance(first.len_utf8());
142        let mut has_dot = false;
143        let mut has_exp = false;
144
145        while let Some(ch) = state.peek() {
146            if ch.is_ascii_digit() {
147                state.advance(ch.len_utf8());
148            }
149            else if ch == '.' && !has_dot && !has_exp {
150                has_dot = true;
151                state.advance(1);
152            }
153            else if (ch == 'e' || ch == 'E') && !has_exp {
154                has_exp = true;
155                state.advance(1);
156                // 处理指数符号
157                if let Some(sign) = state.peek() {
158                    if sign == '+' || sign == '-' {
159                        state.advance(1);
160                    }
161                }
162            }
163            else {
164                break;
165            }
166        }
167
168        state.add_token(JasmSyntaxKind::Number, start, state.get_position());
169        true
170    }
171
172    /// 处理标识符或关键字
173    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
174        let start = state.get_position();
175        let ch = match state.peek() {
176            Some(c) => c,
177            None => return false,
178        };
179
180        // 标识符必须以字母或下划线开始
181        if !(ch.is_ascii_alphabetic() || ch == '_') {
182            return false;
183        }
184
185        state.advance(ch.len_utf8());
186        while let Some(c) = state.peek() {
187            if c.is_ascii_alphanumeric() || c == '_' {
188                state.advance(c.len_utf8());
189            }
190            else {
191                break;
192            }
193        }
194
195        let end = state.get_position();
196        let text = state.get_text_in((start..end).into());
197
198        // 检查是否为关键字或指令
199        let kind = self.classify_identifier(&text);
200        state.add_token(kind, start, state.get_position());
201        true
202    }
203
204    /// 分类标识符为关键字、指令或普通标识符
205    fn classify_identifier(&self, text: &str) -> JasmSyntaxKind {
206        match text {
207            // 关键字
208            "class" => JasmSyntaxKind::ClassKw,
209            "version" => JasmSyntaxKind::VersionKw,
210            "method" => JasmSyntaxKind::MethodKw,
211            "field" => JasmSyntaxKind::FieldKw,
212            "string" => JasmSyntaxKind::StringKw,
213            "sourcefile" => JasmSyntaxKind::SourceFileKw,
214            "stack" => JasmSyntaxKind::StackKw,
215            "locals" => JasmSyntaxKind::LocalsKw,
216            "end" => JasmSyntaxKind::EndKw,
217            "compiled" => JasmSyntaxKind::CompiledKw,
218            "from" => JasmSyntaxKind::FromKw,
219            "innerclass" => JasmSyntaxKind::InnerClassKw,
220            "nestmembers" => JasmSyntaxKind::NestMembersKw,
221            "bootstrapmethod" => JasmSyntaxKind::BootstrapMethodKw,
222
223            // 访问修饰符
224            "public" => JasmSyntaxKind::Public,
225            "private" => JasmSyntaxKind::Private,
226            "protected" => JasmSyntaxKind::Protected,
227            "static" => JasmSyntaxKind::Static,
228            "super" => JasmSyntaxKind::Super,
229            "final" => JasmSyntaxKind::Final,
230            "abstract" => JasmSyntaxKind::Abstract,
231            "synchronized" => JasmSyntaxKind::Synchronized,
232            "native" => JasmSyntaxKind::Native,
233            "synthetic" => JasmSyntaxKind::Synthetic,
234            "deprecated" => JasmSyntaxKind::Deprecated,
235            "varargs" => JasmSyntaxKind::Varargs,
236
237            // 字节码指令
238            "aload_0" => JasmSyntaxKind::ALoad0,
239            "aload_1" => JasmSyntaxKind::ALoad1,
240            "aload_2" => JasmSyntaxKind::ALoad2,
241            "aload_3" => JasmSyntaxKind::ALoad3,
242            "iload_0" => JasmSyntaxKind::ILoad0,
243            "iload_1" => JasmSyntaxKind::ILoad1,
244            "iload_2" => JasmSyntaxKind::ILoad2,
245            "iload_3" => JasmSyntaxKind::ILoad3,
246            "ldc" => JasmSyntaxKind::Ldc,
247            "ldc_w" => JasmSyntaxKind::LdcW,
248            "ldc2_w" => JasmSyntaxKind::Ldc2W,
249            "invokespecial" => JasmSyntaxKind::InvokeSpecial,
250            "invokevirtual" => JasmSyntaxKind::InvokeVirtual,
251            "invokestatic" => JasmSyntaxKind::InvokeStatic,
252            "invokeinterface" => JasmSyntaxKind::InvokeInterface,
253            "invokedynamic" => JasmSyntaxKind::InvokeDynamic,
254            "getstatic" => JasmSyntaxKind::GetStatic,
255            "putstatic" => JasmSyntaxKind::PutStatic,
256            "getfield" => JasmSyntaxKind::GetField,
257            "putfield" => JasmSyntaxKind::PutField,
258            "return" => JasmSyntaxKind::Return,
259            "ireturn" => JasmSyntaxKind::IReturn,
260            "areturn" => JasmSyntaxKind::AReturn,
261            "lreturn" => JasmSyntaxKind::LReturn,
262            "freturn" => JasmSyntaxKind::FReturn,
263            "dreturn" => JasmSyntaxKind::DReturn,
264            "nop" => JasmSyntaxKind::Nop,
265            "dup" => JasmSyntaxKind::Dup,
266            "pop" => JasmSyntaxKind::Pop,
267            "new" => JasmSyntaxKind::New,
268
269            // 默认为标识符
270            _ => JasmSyntaxKind::IdentifierToken,
271        }
272    }
273
274    /// 处理标点符号
275    fn lex_punctuation<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
276        let start = state.get_position();
277
278        if let Some(ch) = state.current() {
279            let kind = match ch {
280                '{' => JasmSyntaxKind::LeftBrace,
281                '}' => JasmSyntaxKind::RightBrace,
282                '(' => JasmSyntaxKind::LeftParen,
283                ')' => JasmSyntaxKind::RightParen,
284                '[' => JasmSyntaxKind::LeftBracket,
285                ']' => JasmSyntaxKind::RightBracket,
286                ':' => JasmSyntaxKind::Colon,
287                ';' => JasmSyntaxKind::Semicolon,
288                '.' => JasmSyntaxKind::Dot,
289                ',' => JasmSyntaxKind::Comma,
290                '/' => JasmSyntaxKind::Slash,
291                _ => return false,
292            };
293
294            state.advance(ch.len_utf8());
295            state.add_token(kind, start, state.get_position());
296            return true;
297        }
298
299        false
300    }
301}