Skip to main content

oak_jasm/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError, Source,
4    lexer::{CommentConfig, LexOutput, StringConfig},
5};
6pub mod token_type;
7
8use crate::{language::JasmLanguage, lexer::token_type::JasmTokenType};
9use std::sync::LazyLock;
10
11type State<'a, S> = LexerState<'a, S, JasmLanguage>;
12
13static JASM_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "", block_end: "", nested_blocks: false });
14static JASM_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
15
16#[derive(Clone, Debug)]
17pub struct JasmLexer<'config> {
18    _config: &'config JasmLanguage,
19}
20
21impl<'config> Lexer<JasmLanguage> for JasmLexer<'config> {
22    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], _cache: &'a mut impl LexerCache<JasmLanguage>) -> LexOutput<JasmLanguage> {
23        let mut state = State::new(source);
24        let result = self.run(&mut state);
25        state.finish(result)
26    }
27}
28
29impl<'config> JasmLexer<'config> {
30    pub fn new(config: &'config JasmLanguage) -> Self {
31        Self { _config: config }
32    }
33
34    /// 主要的词法分析循环
35    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
36        while state.not_at_end() {
37            let safe_point = state.get_position();
38
39            if self.skip_whitespace(state) {
40                continue;
41            }
42
43            if self.lex_newline(state) {
44                continue;
45            }
46
47            if self.skip_comment(state) {
48                continue;
49            }
50
51            if self.lex_string_literal(state) {
52                continue;
53            }
54
55            if self.lex_number_literal(state) {
56                continue;
57            }
58
59            if self.lex_identifier_or_keyword(state) {
60                continue;
61            }
62
63            if self.lex_punctuation(state) {
64                continue;
65            }
66
67            state.advance_if_dead_lock(safe_point);
68        }
69
70        // 添加 EOF token
71        state.add_eof();
72        Ok(())
73    }
74
75    /// 跳过空白字符(不包括换行符)
76    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
77        let start = state.get_position();
78
79        while let Some(ch) = state.peek() {
80            if ch == ' ' || ch == '\t' || ch == '\r' {
81                state.advance(ch.len_utf8());
82            }
83            else {
84                break;
85            }
86        }
87
88        if state.get_position() > start {
89            state.add_token(JasmTokenType::Whitespace, start, state.get_position());
90            return true;
91        }
92
93        false
94    }
95
96    /// 处理换行
97    fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
98        let start = state.get_position();
99
100        if state.current() == Some('\n') {
101            state.advance(1);
102            state.add_token(JasmTokenType::Newline, start, state.get_position());
103            return true;
104        }
105        false
106    }
107
108    /// 跳过注释
109    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
110        JASM_COMMENT.scan(state, JasmTokenType::Comment, JasmTokenType::Comment)
111    }
112
113    /// 处理字符串字面量
114    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
115        JASM_STRING.scan(state, JasmTokenType::StringLiteral)
116    }
117
118    /// 处理数字字面量
119    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
120        let start = state.get_position();
121        let first = match state.peek() {
122            Some(c) => c,
123            None => return false,
124        };
125
126        // 检查是否以数字或负号开始
127        if !first.is_ascii_digit() && first != '-' && first != '+' {
128            return false;
129        }
130
131        // 如果是符号,检查后面是否跟数字
132        if first == '-' || first == '+' {
133            if let Some(next) = state.peek_next_n(1) {
134                if !next.is_ascii_digit() {
135                    return false;
136                }
137            }
138            else {
139                return false;
140            }
141        }
142
143        state.advance(first.len_utf8());
144        let mut has_dot = false;
145        let mut has_exp = false;
146
147        while let Some(ch) = state.peek() {
148            if ch.is_ascii_digit() {
149                state.advance(ch.len_utf8());
150            }
151            else if ch == '.' && !has_dot && !has_exp {
152                has_dot = true;
153                state.advance(1);
154            }
155            else if (ch == 'e' || ch == 'E') && !has_exp {
156                has_exp = true;
157                state.advance(1);
158                // 处理指数符号
159                if let Some(sign) = state.peek() {
160                    if sign == '+' || sign == '-' {
161                        state.advance(1);
162                    }
163                }
164            }
165            else {
166                break;
167            }
168        }
169
170        state.add_token(JasmTokenType::Number, start, state.get_position());
171        true
172    }
173
174    /// 处理标识符或关键字
175    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
176        let start = state.get_position();
177        let ch = match state.peek() {
178            Some(c) => c,
179            None => return false,
180        };
181
182        // 标识符必须以字母或下划线开始
183        if !(ch.is_ascii_alphabetic() || ch == '_') {
184            return false;
185        }
186
187        state.advance(ch.len_utf8());
188        while let Some(c) = state.peek() {
189            if c.is_ascii_alphanumeric() || c == '_' {
190                state.advance(c.len_utf8());
191            }
192            else {
193                break;
194            }
195        }
196
197        let end = state.get_position();
198        let text = state.get_text_in((start..end).into());
199
200        // 检查是否为关键字或指令
201        let kind = self.classify_identifier(&text);
202        state.add_token(kind, start, state.get_position());
203        true
204    }
205
206    /// 分类标识符为关键字、指令或普通标识符
207    fn classify_identifier(&self, text: &str) -> JasmTokenType {
208        match text {
209            // 关键字
210            "class" => JasmTokenType::ClassKw,
211            "version" => JasmTokenType::VersionKw,
212            "method" => JasmTokenType::MethodKw,
213            "field" => JasmTokenType::FieldKw,
214            "string" => JasmTokenType::StringKw,
215            "sourcefile" => JasmTokenType::SourceFileKw,
216            "stack" => JasmTokenType::StackKw,
217            "locals" => JasmTokenType::LocalsKw,
218            "end" => JasmTokenType::EndKw,
219            "compiled" => JasmTokenType::CompiledKw,
220            "from" => JasmTokenType::FromKw,
221            "innerclass" => JasmTokenType::InnerClassKw,
222            "nestmembers" => JasmTokenType::NestMembersKw,
223            "bootstrapmethod" => JasmTokenType::BootstrapMethodKw,
224
225            // 访问修饰符
226            "public" => JasmTokenType::Public,
227            "private" => JasmTokenType::Private,
228            "protected" => JasmTokenType::Protected,
229            "static" => JasmTokenType::Static,
230            "super" => JasmTokenType::Super,
231            "final" => JasmTokenType::Final,
232            "abstract" => JasmTokenType::Abstract,
233            "synchronized" => JasmTokenType::Synchronized,
234            "native" => JasmTokenType::Native,
235            "synthetic" => JasmTokenType::Synthetic,
236            "deprecated" => JasmTokenType::Deprecated,
237            "varargs" => JasmTokenType::Varargs,
238
239            // 字节码指令
240            "aload_0" => JasmTokenType::ALoad0,
241            "aload_1" => JasmTokenType::ALoad1,
242            "aload_2" => JasmTokenType::ALoad2,
243            "aload_3" => JasmTokenType::ALoad3,
244            "iload_0" => JasmTokenType::ILoad0,
245            "iload_1" => JasmTokenType::ILoad1,
246            "iload_2" => JasmTokenType::ILoad2,
247            "iload_3" => JasmTokenType::ILoad3,
248            "ldc" => JasmTokenType::Ldc,
249            "ldc_w" => JasmTokenType::LdcW,
250            "ldc2_w" => JasmTokenType::Ldc2W,
251            "invokespecial" => JasmTokenType::InvokeSpecial,
252            "invokevirtual" => JasmTokenType::InvokeVirtual,
253            "invokestatic" => JasmTokenType::InvokeStatic,
254            "invokeinterface" => JasmTokenType::InvokeInterface,
255            "invokedynamic" => JasmTokenType::InvokeDynamic,
256            "getstatic" => JasmTokenType::GetStatic,
257            "putstatic" => JasmTokenType::PutStatic,
258            "getfield" => JasmTokenType::GetField,
259            "putfield" => JasmTokenType::PutField,
260            "return" => JasmTokenType::Return,
261            "ireturn" => JasmTokenType::IReturn,
262            "areturn" => JasmTokenType::AReturn,
263            "lreturn" => JasmTokenType::LReturn,
264            "freturn" => JasmTokenType::FReturn,
265            "dreturn" => JasmTokenType::DReturn,
266            "nop" => JasmTokenType::Nop,
267            "dup" => JasmTokenType::Dup,
268            "pop" => JasmTokenType::Pop,
269            "new" => JasmTokenType::New,
270
271            // 默认为标识符
272            _ => JasmTokenType::IdentifierToken,
273        }
274    }
275
276    /// 处理标点符号
277    fn lex_punctuation<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
278        let start = state.get_position();
279
280        if let Some(ch) = state.current() {
281            let kind = match ch {
282                '{' => JasmTokenType::LeftBrace,
283                '}' => JasmTokenType::RightBrace,
284                '(' => JasmTokenType::LeftParen,
285                ')' => JasmTokenType::RightParen,
286                '[' => JasmTokenType::LeftBracket,
287                ']' => JasmTokenType::RightBracket,
288                ':' => JasmTokenType::Colon,
289                ';' => JasmTokenType::Semicolon,
290                '.' => JasmTokenType::Dot,
291                ',' => JasmTokenType::Comma,
292                '/' => JasmTokenType::Slash,
293                _ => return false,
294            };
295
296            state.advance(ch.len_utf8());
297            state.add_token(kind, start, state.get_position());
298            return true;
299        }
300
301        false
302    }
303}