// Source file: oak_jasm/lexer/mod.rs

1#![doc = include_str!("readme.md")]
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError, Source,
4    lexer::{CommentConfig, LexOutput, StringConfig},
5};
6/// Token types for the JASM language.
7pub mod token_type;
8
9use crate::{language::JasmLanguage, lexer::token_type::JasmTokenType};
10use std::sync::LazyLock;
11
12pub(crate) type State<'a, S> = LexerState<'a, S, JasmLanguage>;
13
14static JASM_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "", block_end: "", nested_blocks: false });
15static JASM_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
16
17/// Lexer for the JASM language.
18#[derive(Clone, Debug)]
19pub struct JasmLexer<'config> {
20    config: &'config JasmLanguage,
21}
22
23impl<'config> Lexer<JasmLanguage> for JasmLexer<'config> {
24    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], _cache: &'a mut impl LexerCache<JasmLanguage>) -> LexOutput<JasmLanguage> {
25        let mut state = State::new(source);
26        let result = self.run(&mut state);
27        state.finish(result)
28    }
29}
30
31impl<'config> JasmLexer<'config> {
32    /// Creates a new `JasmLexer`.
33    pub fn new(config: &'config JasmLanguage) -> Self {
34        Self { config }
35    }
36
37    /// Main lexing loop.
38    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
39        while state.not_at_end() {
40            let safe_point = state.get_position();
41
42            if self.skip_whitespace(state) {
43                continue;
44            }
45
46            if self.lex_newline(state) {
47                continue;
48            }
49
50            if self.skip_comment(state) {
51                continue;
52            }
53
54            if self.lex_string_literal(state) {
55                continue;
56            }
57
58            if self.lex_number_literal(state) {
59                continue;
60            }
61
62            if self.lex_identifier_or_keyword(state) {
63                continue;
64            }
65
66            if self.lex_punctuation(state) {
67                continue;
68            }
69
70            state.advance_if_dead_lock(safe_point);
71        }
72
73        // Add EOF token
74        state.add_eof();
75        Ok(())
76    }
77
78    /// Skips whitespace characters (excluding newlines).
79    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
80        let start = state.get_position();
81
82        while let Some(ch) = state.peek() {
83            if ch == ' ' || ch == '\t' || ch == '\r' {
84                state.advance(ch.len_utf8());
85            }
86            else {
87                break;
88            }
89        }
90
91        if state.get_position() > start {
92            state.add_token(JasmTokenType::Whitespace, start, state.get_position());
93            return true;
94        }
95
96        false
97    }
98
99    /// Handles newlines.
100    fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
101        let start = state.get_position();
102
103        if state.current() == Some('\n') {
104            state.advance(1);
105            state.add_token(JasmTokenType::Newline, start, state.get_position());
106            return true;
107        }
108        false
109    }
110
111    /// Skips comments.
112    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
113        if !self.config.comments {
114            return false;
115        }
116        JASM_COMMENT.scan(state, JasmTokenType::Comment, JasmTokenType::Comment)
117    }
118
119    /// Handles string literals.
120    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
121        JASM_STRING.scan(state, JasmTokenType::String)
122    }
123
124    /// Handles number literals.
125    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
126        let start = state.get_position();
127        let first = match state.peek() {
128            Some(c) => c,
129            None => return false,
130        };
131
132        // Check if starts with a digit or sign
133        if !first.is_ascii_digit() && first != '-' && first != '+' {
134            return false;
135        }
136
137        // If sign, check if followed by a digit
138        if first == '-' || first == '+' {
139            if let Some(next) = state.peek_next_n(1) {
140                if !next.is_ascii_digit() {
141                    return false;
142                }
143            }
144            else {
145                return false;
146            }
147        }
148
149        state.advance(first.len_utf8());
150        let mut has_dot = false;
151        let mut has_exp = false;
152
153        while let Some(ch) = state.peek() {
154            if ch.is_ascii_digit() {
155                state.advance(ch.len_utf8());
156            }
157            else if ch == '.' && !has_dot && !has_exp {
158                has_dot = true;
159                state.advance(1);
160            }
161            else if (ch == 'e' || ch == 'E') && !has_exp {
162                has_exp = true;
163                state.advance(1);
164                // Handle exponent sign
165                if let Some(sign) = state.peek() {
166                    if sign == '+' || sign == '-' {
167                        state.advance(1);
168                    }
169                }
170            }
171            else {
172                break;
173            }
174        }
175
176        state.add_token(JasmTokenType::Number, start, state.get_position());
177        true
178    }
179
180    /// Handles identifiers or keywords.
181    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
182        let start = state.get_position();
183        let ch = match state.peek() {
184            Some(c) => c,
185            None => return false,
186        };
187
188        // Identifier must start with a letter or underscore
189        if !(ch.is_ascii_alphabetic() || ch == '_') {
190            return false;
191        }
192
193        state.advance(ch.len_utf8());
194        while let Some(c) = state.peek() {
195            if c.is_ascii_alphanumeric() || c == '_' {
196                state.advance(c.len_utf8());
197            }
198            else {
199                break;
200            }
201        }
202
203        let end = state.get_position();
204        let text = state.get_text_in((start..end).into());
205
206        // Check if keyword or instruction
207        let kind = self.classify_identifier(&text);
208        state.add_token(kind, start, state.get_position());
209        true
210    }
211
212    /// Classifies an identifier as a keyword, instruction, or identifier.
213    fn classify_identifier(&self, text: &str) -> JasmTokenType {
214        match text {
215            // Keywords
216            "class" => JasmTokenType::ClassKw,
217            "version" => JasmTokenType::VersionKw,
218            "method" => JasmTokenType::MethodKw,
219            "field" => JasmTokenType::FieldKw,
220            "string" => JasmTokenType::StringKw,
221            "source" => JasmTokenType::SourceKw,
222            "sourcefile" => JasmTokenType::SourceFileKw,
223            "stack" => JasmTokenType::StackKw,
224            "locals" => JasmTokenType::LocalsKw,
225            "end" => JasmTokenType::EndKw,
226            "compiled" => JasmTokenType::CompiledKw,
227            "from" => JasmTokenType::FromKw,
228            "innerclass" => JasmTokenType::InnerClassKw,
229            "nestmembers" => JasmTokenType::NestMembersKw,
230            "bootstrapmethod" => JasmTokenType::BootstrapMethodKw,
231            "interface" => JasmTokenType::InterfaceKw,
232            "extends" => JasmTokenType::ExtendsKw,
233            "implements" => JasmTokenType::ImplementsKw,
234            "catch" => JasmTokenType::CatchKw,
235            "attribute" => JasmTokenType::AttributeKw,
236            "stackmap" => JasmTokenType::StackMapKw,
237
238            // Access modifiers
239            "public" => JasmTokenType::Public,
240            "private" => JasmTokenType::Private,
241            "protected" => JasmTokenType::Protected,
242            "static" => JasmTokenType::Static,
243            "super" => JasmTokenType::Super,
244            "final" => JasmTokenType::Final,
245            "abstract" => JasmTokenType::Abstract,
246            "synchronized" => JasmTokenType::Synchronized,
247            "native" => JasmTokenType::Native,
248            "synthetic" => JasmTokenType::Synthetic,
249            "deprecated" => JasmTokenType::Deprecated,
250            "varargs" => JasmTokenType::Varargs,
251
252            // Base bytecode instructions
253            "aload_0" => JasmTokenType::ALoad0,
254            "aload_1" => JasmTokenType::ALoad1,
255            "aload_2" => JasmTokenType::ALoad2,
256            "aload_3" => JasmTokenType::ALoad3,
257            "iload_0" => JasmTokenType::ILoad0,
258            "iload_1" => JasmTokenType::ILoad1,
259            "iload_2" => JasmTokenType::ILoad2,
260            "iload_3" => JasmTokenType::ILoad3,
261            "ldc" => JasmTokenType::Ldc,
262            "ldc_w" => JasmTokenType::LdcW,
263            "ldc2_w" => JasmTokenType::Ldc2W,
264            "invokespecial" => JasmTokenType::InvokeSpecial,
265            "invokevirtual" => JasmTokenType::InvokeVirtual,
266            "invokestatic" => JasmTokenType::InvokeStatic,
267            "getstatic" => JasmTokenType::GetStatic,
268            "putstatic" => JasmTokenType::PutStatic,
269            "getfield" => JasmTokenType::GetField,
270            "putfield" => JasmTokenType::PutField,
271            "return" => JasmTokenType::Return,
272            "ireturn" => JasmTokenType::IReturn,
273            "areturn" => JasmTokenType::AReturn,
274            "lreturn" => JasmTokenType::LReturn,
275            "freturn" => JasmTokenType::FReturn,
276            "dreturn" => JasmTokenType::DReturn,
277            "nop" => JasmTokenType::Nop,
278            "dup" => JasmTokenType::Dup,
279            "pop" => JasmTokenType::Pop,
280            "new" => JasmTokenType::New,
281
282            // Extended bytecode instructions (only if extended mode is enabled)
283            _ if self.config.extended => match text {
284                "invokeinterface" => JasmTokenType::InvokeInterface,
285                "invokedynamic" => JasmTokenType::InvokeDynamic,
286                "checkcast" => JasmTokenType::CheckCast,
287                "instanceof" => JasmTokenType::InstanceOf,
288                "newarray" => JasmTokenType::NewArray,
289                "anewarray" => JasmTokenType::ANewArray,
290                "arraylength" => JasmTokenType::ArrayLength,
291                "athrow" => JasmTokenType::AThrow,
292                "monitorenter" => JasmTokenType::MonitorEnter,
293                "monitorexit" => JasmTokenType::MonitorExit,
294                "multianewarray" => JasmTokenType::MultiANewArray,
295                "ifnull" => JasmTokenType::IfNull,
296                "ifnonnull" => JasmTokenType::IfNonNull,
297                "goto" => JasmTokenType::Goto,
298                "goto_w" => JasmTokenType::GotoW,
299                "jsr" => JasmTokenType::Jsr,
300                "jsr_w" => JasmTokenType::JsrW,
301                "ret" => JasmTokenType::Ret,
302                "tableswitch" => JasmTokenType::TableSwitch,
303                "lookupswitch" => JasmTokenType::LookupSwitch,
304                "bipush" => JasmTokenType::BiPush,
305                "sipush" => JasmTokenType::SiPush,
306                "iinc" => JasmTokenType::IInc,
307                "wide" => JasmTokenType::Wide,
308                "breakpoint" => JasmTokenType::BreakPoint,
309                "impdep1" => JasmTokenType::ImpDep1,
310                "impdep2" => JasmTokenType::ImpDep2,
311                _ => JasmTokenType::Identifier,
312            },
313
314            // Default to identifier
315            _ => JasmTokenType::Identifier,
316        }
317    }
318
319    /// Handles punctuation marks.
320    fn lex_punctuation<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
321        let start = state.get_position();
322
323        if let Some(ch) = state.current() {
324            let kind = match ch {
325                '{' => JasmTokenType::LeftBrace,
326                '}' => JasmTokenType::RightBrace,
327                '(' => JasmTokenType::LeftParen,
328                ')' => JasmTokenType::RightParen,
329                '[' => JasmTokenType::LeftBracket,
330                ']' => JasmTokenType::RightBracket,
331                ':' => JasmTokenType::Colon,
332                ';' => JasmTokenType::Semicolon,
333                '.' => JasmTokenType::Dot,
334                ',' => JasmTokenType::Comma,
335                '/' => JasmTokenType::Slash,
336                '@' => JasmTokenType::At,
337                _ => return false,
338            };
339
340            state.advance(ch.len_utf8());
341            state.add_token(kind, start, state.get_position());
342            return true;
343        }
344
345        false
346    }
347}