oak_jasm/lexer/
mod.rs

1use crate::{language::JasmLanguage, syntax::JasmSyntaxKind};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
9type State<S> = LexerState<S, JasmLanguage>;
10
11static JASM_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static JASM_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["//"] });
13static JASM_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
15#[derive(Clone)]
16pub struct JasmLexer<'config> {
17    config: &'config JasmLanguage,
18}
19
20impl<'config> Lexer<JasmLanguage> for JasmLexer<'config> {
21    fn lex_incremental(
22        &self,
23        source: impl Source,
24        changed: usize,
25        cache: IncrementalCache<JasmLanguage>,
26    ) -> LexOutput<JasmLanguage> {
27        let mut state = LexerState::new_with_cache(source, changed, cache);
28        let result = self.run(&mut state);
29        state.finish(result)
30    }
31}
32
33impl<'config> JasmLexer<'config> {
34    pub fn new(config: &'config JasmLanguage) -> Self {
35        Self { config }
36    }
37
38    /// 主要的词法分析循环
39    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
40        while state.not_at_end() {
41            let safe_point = state.get_position();
42
43            if self.skip_whitespace(state) {
44                continue;
45            }
46
47            if self.lex_newline(state) {
48                continue;
49            }
50
51            if self.skip_comment(state) {
52                continue;
53            }
54
55            if self.lex_string_literal(state) {
56                continue;
57            }
58
59            if self.lex_number_literal(state) {
60                continue;
61            }
62
63            if self.lex_identifier_or_keyword(state) {
64                continue;
65            }
66
67            if self.lex_punctuation(state) {
68                continue;
69            }
70
71            state.safe_check(safe_point);
72        }
73
74        // 添加 EOF token
75        let eof_pos = state.get_position();
76        state.add_token(JasmSyntaxKind::Eof, eof_pos, eof_pos);
77        Ok(())
78    }
79
80    /// 跳过空白字符(不包括换行符)
81    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
82        let start = state.get_position();
83
84        while let Some(ch) = state.peek() {
85            if ch == ' ' || ch == '\t' || ch == '\r' {
86                state.advance(ch.len_utf8());
87            }
88            else {
89                break;
90            }
91        }
92
93        if state.get_position() > start {
94            state.add_token(JasmSyntaxKind::Whitespace, start, state.get_position());
95            return true;
96        }
97        false
98    }
99
100    /// 处理换行
101    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
102        let start = state.get_position();
103
104        if state.current() == Some('\n') {
105            state.advance(1);
106            state.add_token(JasmSyntaxKind::Newline, start, state.get_position());
107            return true;
108        }
109        false
110    }
111
112    /// 跳过注释
113    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
114        let start = state.get_position();
115        let rest = state.rest();
116
117        if rest.starts_with("//") {
118            // 跳过注释标记
119            state.advance(2);
120
121            // 读取到行尾
122            while let Some(ch) = state.peek() {
123                if ch != '\n' {
124                    state.advance(ch.len_utf8());
125                }
126                else {
127                    break;
128                }
129            }
130
131            state.add_token(JasmSyntaxKind::Comment, start, state.get_position());
132            return true;
133        }
134        false
135    }
136
137    /// 处理字符串字面量
138    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
139        let start = state.get_position();
140
141        if state.current() == Some('"') {
142            // 跳过开始引号
143            state.advance(1);
144
145            while let Some(ch) = state.peek() {
146                if ch != '"' {
147                    if ch == '\\' {
148                        state.advance(1); // 转义字符
149                        if let Some(_) = state.peek() {
150                            state.advance(1); // 被转义的字符
151                        }
152                    }
153                    else {
154                        state.advance(ch.len_utf8());
155                    }
156                }
157                else {
158                    // 找到结束引号
159                    state.advance(1);
160                    break;
161                }
162            }
163
164            state.add_token(JasmSyntaxKind::StringLiteral, start, state.get_position());
165            return true;
166        }
167        false
168    }
169
170    /// 处理数字字面量
171    fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
172        let start = state.get_position();
173        let first = match state.current() {
174            Some(c) => c,
175            None => return false,
176        };
177
178        // 检查是否以数字或负号开始
179        if !first.is_ascii_digit() && first != '-' && first != '+' {
180            return false;
181        }
182
183        // 如果是符号,检查后面是否跟数字
184        if first == '-' || first == '+' {
185            if let Some(next) = state.peek_next_n(1) {
186                if !next.is_ascii_digit() {
187                    return false;
188                }
189            }
190            else {
191                return false;
192            }
193        }
194
195        state.advance(1);
196        let mut has_dot = false;
197        let mut has_exp = false;
198
199        while let Some(ch) = state.peek() {
200            if ch.is_ascii_digit() {
201                state.advance(1);
202            }
203            else if ch == '.' && !has_dot && !has_exp {
204                has_dot = true;
205                state.advance(1);
206            }
207            else if (ch == 'e' || ch == 'E') && !has_exp {
208                has_exp = true;
209                state.advance(1);
210                // 处理指数符号
211                if let Some(sign) = state.peek() {
212                    if sign == '+' || sign == '-' {
213                        state.advance(1);
214                    }
215                }
216            }
217            else {
218                break;
219            }
220        }
221
222        // 检查是否为有效数字
223        let end = state.get_position();
224        let text = state.get_text_in((start..end).into());
225
226        // 简单验证:不能只是符号或只是点
227        if text == "-" || text == "+" || text == "." {
228            // 回退
229            state.set_position(start);
230            return false;
231        }
232
233        state.add_token(JasmSyntaxKind::Number, start, state.get_position());
234        true
235    }
236
237    /// 处理标识符或关键字
238    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
239        let start = state.get_position();
240        let ch = match state.current() {
241            Some(c) => c,
242            None => return false,
243        };
244
245        // 标识符必须以字母或下划线开始
246        if !(ch.is_ascii_alphabetic() || ch == '_') {
247            return false;
248        }
249
250        state.advance(1);
251        while let Some(c) = state.current() {
252            if c.is_ascii_alphanumeric() || c == '_' {
253                state.advance(1);
254            }
255            else {
256                break;
257            }
258        }
259
260        let end = state.get_position();
261        let text = state.get_text_in((start..end).into());
262
263        // 检查是否为关键字或指令
264        let kind = self.classify_identifier(text);
265        state.add_token(kind, start, state.get_position());
266        true
267    }
268
269    /// 分类标识符为关键字、指令或普通标识符
270    fn classify_identifier(&self, text: &str) -> JasmSyntaxKind {
271        match text {
272            // 关键字
273            "class" => JasmSyntaxKind::ClassKw,
274            "version" => JasmSyntaxKind::VersionKw,
275            "method" => JasmSyntaxKind::MethodKw,
276            "field" => JasmSyntaxKind::FieldKw,
277            "string" => JasmSyntaxKind::StringKw,
278            "sourcefile" => JasmSyntaxKind::SourceFileKw,
279            "stack" => JasmSyntaxKind::StackKw,
280            "locals" => JasmSyntaxKind::LocalsKw,
281            "end" => JasmSyntaxKind::EndKw,
282            "compiled" => JasmSyntaxKind::CompiledKw,
283            "from" => JasmSyntaxKind::FromKw,
284            "innerclass" => JasmSyntaxKind::InnerClassKw,
285            "nestmembers" => JasmSyntaxKind::NestMembersKw,
286            "bootstrapmethod" => JasmSyntaxKind::BootstrapMethodKw,
287
288            // 访问修饰符
289            "public" => JasmSyntaxKind::Public,
290            "private" => JasmSyntaxKind::Private,
291            "protected" => JasmSyntaxKind::Protected,
292            "static" => JasmSyntaxKind::Static,
293            "super" => JasmSyntaxKind::Super,
294            "final" => JasmSyntaxKind::Final,
295            "abstract" => JasmSyntaxKind::Abstract,
296            "synchronized" => JasmSyntaxKind::Synchronized,
297            "native" => JasmSyntaxKind::Native,
298            "synthetic" => JasmSyntaxKind::Synthetic,
299            "deprecated" => JasmSyntaxKind::Deprecated,
300            "varargs" => JasmSyntaxKind::Varargs,
301
302            // 字节码指令
303            "aload_0" => JasmSyntaxKind::ALoad0,
304            "aload_1" => JasmSyntaxKind::ALoad1,
305            "aload_2" => JasmSyntaxKind::ALoad2,
306            "aload_3" => JasmSyntaxKind::ALoad3,
307            "iload_0" => JasmSyntaxKind::ILoad0,
308            "iload_1" => JasmSyntaxKind::ILoad1,
309            "iload_2" => JasmSyntaxKind::ILoad2,
310            "iload_3" => JasmSyntaxKind::ILoad3,
311            "ldc" => JasmSyntaxKind::Ldc,
312            "ldc_w" => JasmSyntaxKind::LdcW,
313            "ldc2_w" => JasmSyntaxKind::Ldc2W,
314            "invokespecial" => JasmSyntaxKind::InvokeSpecial,
315            "invokevirtual" => JasmSyntaxKind::InvokeVirtual,
316            "invokestatic" => JasmSyntaxKind::InvokeStatic,
317            "invokeinterface" => JasmSyntaxKind::InvokeInterface,
318            "invokedynamic" => JasmSyntaxKind::InvokeDynamic,
319            "getstatic" => JasmSyntaxKind::GetStatic,
320            "putstatic" => JasmSyntaxKind::PutStatic,
321            "getfield" => JasmSyntaxKind::GetField,
322            "putfield" => JasmSyntaxKind::PutField,
323            "return" => JasmSyntaxKind::Return,
324            "ireturn" => JasmSyntaxKind::IReturn,
325            "areturn" => JasmSyntaxKind::AReturn,
326            "lreturn" => JasmSyntaxKind::LReturn,
327            "freturn" => JasmSyntaxKind::FReturn,
328            "dreturn" => JasmSyntaxKind::DReturn,
329            "nop" => JasmSyntaxKind::Nop,
330            "dup" => JasmSyntaxKind::Dup,
331            "pop" => JasmSyntaxKind::Pop,
332            "new" => JasmSyntaxKind::New,
333
334            // 默认为标识符
335            _ => JasmSyntaxKind::IdentifierToken,
336        }
337    }
338
339    /// 处理标点符号
340    fn lex_punctuation<S: Source>(&self, state: &mut State<S>) -> bool {
341        let start = state.get_position();
342
343        if let Some(ch) = state.current() {
344            let kind = match ch {
345                '{' => JasmSyntaxKind::LeftBrace,
346                '}' => JasmSyntaxKind::RightBrace,
347                '(' => JasmSyntaxKind::LeftParen,
348                ')' => JasmSyntaxKind::RightParen,
349                '[' => JasmSyntaxKind::LeftBracket,
350                ']' => JasmSyntaxKind::RightBracket,
351                ':' => JasmSyntaxKind::Colon,
352                ';' => JasmSyntaxKind::Semicolon,
353                '.' => JasmSyntaxKind::Dot,
354                ',' => JasmSyntaxKind::Comma,
355                '/' => JasmSyntaxKind::Slash,
356                _ => return false,
357            };
358
359            state.advance(ch.len_utf8());
360            state.add_token(kind, start, state.get_position());
361            return true;
362        }
363
364        false
365    }
366}