oak_jasm/lexer/
mod.rs

1use crate::{language::JasmLanguage, syntax::JasmSyntaxKind};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{CommentConfig, LexOutput, StringConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<'a, S> = LexerState<'a, S, JasmLanguage>;
10
11static JASM_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "", block_end: "", nested_blocks: false });
12static JASM_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
13
/// Lexer for JASM source text.
///
/// The type carries no fields, so every instance is interchangeable; it
/// exists to host the `Lexer<JasmLanguage>` implementation.
#[derive(Debug, Clone, Default)]
pub struct JasmLexer {}
16
17impl Lexer<JasmLanguage> for JasmLexer {
18    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], _cache: &'a mut impl LexerCache<JasmLanguage>) -> LexOutput<JasmLanguage> {
19        let mut state = State::new(source);
20        let result = self.run(&mut state);
21        state.finish(result)
22    }
23}
24
25impl JasmLexer {
26    pub fn new(_config: &JasmLanguage) -> Self {
27        Self {}
28    }
29
30    /// 主要的词法分析循环
31    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
32        while state.not_at_end() {
33            let safe_point = state.get_position();
34
35            if self.skip_whitespace(state) {
36                continue;
37            }
38
39            if self.lex_newline(state) {
40                continue;
41            }
42
43            if self.skip_comment(state) {
44                continue;
45            }
46
47            if self.lex_string_literal(state) {
48                continue;
49            }
50
51            if self.lex_number_literal(state) {
52                continue;
53            }
54
55            if self.lex_identifier_or_keyword(state) {
56                continue;
57            }
58
59            if self.lex_punctuation(state) {
60                continue;
61            }
62
63            state.advance_if_dead_lock(safe_point);
64        }
65
66        // 添加 EOF token
67        state.add_eof();
68        Ok(())
69    }
70
71    /// 跳过空白字符(不包括换行符)
72    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
73        let start = state.get_position();
74
75        while let Some(ch) = state.peek() {
76            if ch == ' ' || ch == '\t' || ch == '\r' {
77                state.advance(ch.len_utf8());
78            }
79            else {
80                break;
81            }
82        }
83
84        if state.get_position() > start {
85            state.add_token(JasmSyntaxKind::Whitespace, start, state.get_position());
86            return true;
87        }
88
89        false
90    }
91
92    /// 处理换行
93    fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
94        let start = state.get_position();
95
96        if state.current() == Some('\n') {
97            state.advance(1);
98            state.add_token(JasmSyntaxKind::Newline, start, state.get_position());
99            return true;
100        }
101        false
102    }
103
104    /// 跳过注释
105    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
106        JASM_COMMENT.scan(state, JasmSyntaxKind::Comment, JasmSyntaxKind::Comment)
107    }
108
109    /// 处理字符串字面量
110    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
111        JASM_STRING.scan(state, JasmSyntaxKind::StringLiteral)
112    }
113
114    /// 处理数字字面量
115    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
116        let start = state.get_position();
117        let first = match state.peek() {
118            Some(c) => c,
119            None => return false,
120        };
121
122        // 检查是否以数字或负号开始
123        if !first.is_ascii_digit() && first != '-' && first != '+' {
124            return false;
125        }
126
127        // 如果是符号,检查后面是否跟数字
128        if first == '-' || first == '+' {
129            if let Some(next) = state.peek_next_n(1) {
130                if !next.is_ascii_digit() {
131                    return false;
132                }
133            }
134            else {
135                return false;
136            }
137        }
138
139        state.advance(first.len_utf8());
140        let mut has_dot = false;
141        let mut has_exp = false;
142
143        while let Some(ch) = state.peek() {
144            if ch.is_ascii_digit() {
145                state.advance(ch.len_utf8());
146            }
147            else if ch == '.' && !has_dot && !has_exp {
148                has_dot = true;
149                state.advance(1);
150            }
151            else if (ch == 'e' || ch == 'E') && !has_exp {
152                has_exp = true;
153                state.advance(1);
154                // 处理指数符号
155                if let Some(sign) = state.peek() {
156                    if sign == '+' || sign == '-' {
157                        state.advance(1);
158                    }
159                }
160            }
161            else {
162                break;
163            }
164        }
165
166        state.add_token(JasmSyntaxKind::Number, start, state.get_position());
167        true
168    }
169
170    /// 处理标识符或关键字
171    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
172        let start = state.get_position();
173        let ch = match state.peek() {
174            Some(c) => c,
175            None => return false,
176        };
177
178        // 标识符必须以字母或下划线开始
179        if !(ch.is_ascii_alphabetic() || ch == '_') {
180            return false;
181        }
182
183        state.advance(ch.len_utf8());
184        while let Some(c) = state.peek() {
185            if c.is_ascii_alphanumeric() || c == '_' {
186                state.advance(c.len_utf8());
187            }
188            else {
189                break;
190            }
191        }
192
193        let end = state.get_position();
194        let text = state.get_text_in((start..end).into());
195
196        // 检查是否为关键字或指令
197        let kind = self.classify_identifier(&text);
198        state.add_token(kind, start, state.get_position());
199        true
200    }
201
202    /// 分类标识符为关键字、指令或普通标识符
203    fn classify_identifier(&self, text: &str) -> JasmSyntaxKind {
204        match text {
205            // 关键字
206            "class" => JasmSyntaxKind::ClassKw,
207            "version" => JasmSyntaxKind::VersionKw,
208            "method" => JasmSyntaxKind::MethodKw,
209            "field" => JasmSyntaxKind::FieldKw,
210            "string" => JasmSyntaxKind::StringKw,
211            "sourcefile" => JasmSyntaxKind::SourceFileKw,
212            "stack" => JasmSyntaxKind::StackKw,
213            "locals" => JasmSyntaxKind::LocalsKw,
214            "end" => JasmSyntaxKind::EndKw,
215            "compiled" => JasmSyntaxKind::CompiledKw,
216            "from" => JasmSyntaxKind::FromKw,
217            "innerclass" => JasmSyntaxKind::InnerClassKw,
218            "nestmembers" => JasmSyntaxKind::NestMembersKw,
219            "bootstrapmethod" => JasmSyntaxKind::BootstrapMethodKw,
220
221            // 访问修饰符
222            "public" => JasmSyntaxKind::Public,
223            "private" => JasmSyntaxKind::Private,
224            "protected" => JasmSyntaxKind::Protected,
225            "static" => JasmSyntaxKind::Static,
226            "super" => JasmSyntaxKind::Super,
227            "final" => JasmSyntaxKind::Final,
228            "abstract" => JasmSyntaxKind::Abstract,
229            "synchronized" => JasmSyntaxKind::Synchronized,
230            "native" => JasmSyntaxKind::Native,
231            "synthetic" => JasmSyntaxKind::Synthetic,
232            "deprecated" => JasmSyntaxKind::Deprecated,
233            "varargs" => JasmSyntaxKind::Varargs,
234
235            // 字节码指令
236            "aload_0" => JasmSyntaxKind::ALoad0,
237            "aload_1" => JasmSyntaxKind::ALoad1,
238            "aload_2" => JasmSyntaxKind::ALoad2,
239            "aload_3" => JasmSyntaxKind::ALoad3,
240            "iload_0" => JasmSyntaxKind::ILoad0,
241            "iload_1" => JasmSyntaxKind::ILoad1,
242            "iload_2" => JasmSyntaxKind::ILoad2,
243            "iload_3" => JasmSyntaxKind::ILoad3,
244            "ldc" => JasmSyntaxKind::Ldc,
245            "ldc_w" => JasmSyntaxKind::LdcW,
246            "ldc2_w" => JasmSyntaxKind::Ldc2W,
247            "invokespecial" => JasmSyntaxKind::InvokeSpecial,
248            "invokevirtual" => JasmSyntaxKind::InvokeVirtual,
249            "invokestatic" => JasmSyntaxKind::InvokeStatic,
250            "invokeinterface" => JasmSyntaxKind::InvokeInterface,
251            "invokedynamic" => JasmSyntaxKind::InvokeDynamic,
252            "getstatic" => JasmSyntaxKind::GetStatic,
253            "putstatic" => JasmSyntaxKind::PutStatic,
254            "getfield" => JasmSyntaxKind::GetField,
255            "putfield" => JasmSyntaxKind::PutField,
256            "return" => JasmSyntaxKind::Return,
257            "ireturn" => JasmSyntaxKind::IReturn,
258            "areturn" => JasmSyntaxKind::AReturn,
259            "lreturn" => JasmSyntaxKind::LReturn,
260            "freturn" => JasmSyntaxKind::FReturn,
261            "dreturn" => JasmSyntaxKind::DReturn,
262            "nop" => JasmSyntaxKind::Nop,
263            "dup" => JasmSyntaxKind::Dup,
264            "pop" => JasmSyntaxKind::Pop,
265            "new" => JasmSyntaxKind::New,
266
267            // 默认为标识符
268            _ => JasmSyntaxKind::IdentifierToken,
269        }
270    }
271
272    /// 处理标点符号
273    fn lex_punctuation<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
274        let start = state.get_position();
275
276        if let Some(ch) = state.current() {
277            let kind = match ch {
278                '{' => JasmSyntaxKind::LeftBrace,
279                '}' => JasmSyntaxKind::RightBrace,
280                '(' => JasmSyntaxKind::LeftParen,
281                ')' => JasmSyntaxKind::RightParen,
282                '[' => JasmSyntaxKind::LeftBracket,
283                ']' => JasmSyntaxKind::RightBracket,
284                ':' => JasmSyntaxKind::Colon,
285                ';' => JasmSyntaxKind::Semicolon,
286                '.' => JasmSyntaxKind::Dot,
287                ',' => JasmSyntaxKind::Comma,
288                '/' => JasmSyntaxKind::Slash,
289                _ => return false,
290            };
291
292            state.advance(ch.len_utf8());
293            state.add_token(kind, start, state.get_position());
294            return true;
295        }
296
297        false
298    }
299}