// oak_jasm/lexer/mod.rs

1//! Lexer implementation for the JASM language.
2
3#![doc = include_str!("readme.md")]
4use oak_core::{
5    Lexer, LexerCache, LexerState, OakError, Source,
6    lexer::{CommentConfig, LexOutput, StringConfig},
7};
8/// Token types for the JASM language.
9pub mod token_type;
10
11use crate::{language::JasmLanguage, lexer::token_type::JasmTokenType};
12use std::sync::LazyLock;
13
14pub(crate) type State<'a, S> = LexerState<'a, S, JasmLanguage>;
15
16static JASM_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "", block_end: "", nested_blocks: false });
17static JASM_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
18
19/// Lexer for the JASM language.
20#[derive(Clone, Debug)]
21pub struct JasmLexer<'config> {
22    config: &'config JasmLanguage,
23}
24
25impl<'config> Lexer<JasmLanguage> for JasmLexer<'config> {
26    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], _cache: &'a mut impl LexerCache<JasmLanguage>) -> LexOutput<JasmLanguage> {
27        let mut state = State::new(source);
28        let result = self.run(&mut state);
29        state.finish(result)
30    }
31}
32
33impl<'config> JasmLexer<'config> {
34    /// Creates a new `JasmLexer`.
35    pub fn new(config: &'config JasmLanguage) -> Self {
36        Self { config }
37    }
38
39    /// Main lexing loop.
40    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
41        while state.not_at_end() {
42            let safe_point = state.get_position();
43
44            if self.skip_whitespace(state) {
45                continue;
46            }
47
48            if self.lex_newline(state) {
49                continue;
50            }
51
52            if self.skip_comment(state) {
53                continue;
54            }
55
56            if self.lex_string_literal(state) {
57                continue;
58            }
59
60            if self.lex_number_literal(state) {
61                continue;
62            }
63
64            if self.lex_identifier_or_keyword(state) {
65                continue;
66            }
67
68            if self.lex_punctuation(state) {
69                continue;
70            }
71
72            state.advance_if_dead_lock(safe_point);
73        }
74
75        // Add EOF token
76        state.add_eof();
77        Ok(())
78    }
79
80    /// Skips whitespace characters (excluding newlines).
81    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
82        let start = state.get_position();
83
84        while let Some(ch) = state.peek() {
85            if ch == ' ' || ch == '\t' || ch == '\r' {
86                state.advance(ch.len_utf8());
87            }
88            else {
89                break;
90            }
91        }
92
93        if state.get_position() > start {
94            state.add_token(JasmTokenType::Whitespace, start, state.get_position());
95            return true;
96        }
97
98        false
99    }
100
101    /// Handles newlines.
102    fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
103        let start = state.get_position();
104
105        if state.current() == Some('\n') {
106            state.advance(1);
107            state.add_token(JasmTokenType::Newline, start, state.get_position());
108            return true;
109        }
110        false
111    }
112
113    /// Skips comments.
114    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
115        JASM_COMMENT.scan(state, JasmTokenType::Comment, JasmTokenType::Comment)
116    }
117
118    /// Handles string literals.
119    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
120        JASM_STRING.scan(state, JasmTokenType::String)
121    }
122
123    /// Handles number literals.
124    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
125        let start = state.get_position();
126        let first = match state.peek() {
127            Some(c) => c,
128            None => return false,
129        };
130
131        // Check if starts with a digit or sign
132        if !first.is_ascii_digit() && first != '-' && first != '+' {
133            return false;
134        }
135
136        // If sign, check if followed by a digit
137        if first == '-' || first == '+' {
138            if let Some(next) = state.peek_next_n(1) {
139                if !next.is_ascii_digit() {
140                    return false;
141                }
142            }
143            else {
144                return false;
145            }
146        }
147
148        state.advance(first.len_utf8());
149        let mut has_dot = false;
150        let mut has_exp = false;
151
152        while let Some(ch) = state.peek() {
153            if ch.is_ascii_digit() {
154                state.advance(ch.len_utf8());
155            }
156            else if ch == '.' && !has_dot && !has_exp {
157                has_dot = true;
158                state.advance(1);
159            }
160            else if (ch == 'e' || ch == 'E') && !has_exp {
161                has_exp = true;
162                state.advance(1);
163                // Handle exponent sign
164                if let Some(sign) = state.peek() {
165                    if sign == '+' || sign == '-' {
166                        state.advance(1);
167                    }
168                }
169            }
170            else {
171                break;
172            }
173        }
174
175        state.add_token(JasmTokenType::Number, start, state.get_position());
176        true
177    }
178
179    /// Handles identifiers or keywords.
180    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
181        let start = state.get_position();
182        let ch = match state.peek() {
183            Some(c) => c,
184            None => return false,
185        };
186
187        // Identifier must start with a letter or underscore
188        if !(ch.is_ascii_alphabetic() || ch == '_') {
189            return false;
190        }
191
192        state.advance(ch.len_utf8());
193        while let Some(c) = state.peek() {
194            if c.is_ascii_alphanumeric() || c == '_' {
195                state.advance(c.len_utf8());
196            }
197            else {
198                break;
199            }
200        }
201
202        let end = state.get_position();
203        let text = state.get_text_in((start..end).into());
204
205        // Check if keyword or instruction
206        let kind = self.classify_identifier(&text);
207        state.add_token(kind, start, state.get_position());
208        true
209    }
210
211    /// Classifies an identifier as a keyword, instruction, or identifier.
212    fn classify_identifier(&self, text: &str) -> JasmTokenType {
213        match text {
214            // Keywords
215            "class" => JasmTokenType::ClassKw,
216            "version" => JasmTokenType::VersionKw,
217            "method" => JasmTokenType::MethodKw,
218            "field" => JasmTokenType::FieldKw,
219            "string" => JasmTokenType::StringKw,
220            "sourcefile" => JasmTokenType::SourceFileKw,
221            "stack" => JasmTokenType::StackKw,
222            "locals" => JasmTokenType::LocalsKw,
223            "end" => JasmTokenType::EndKw,
224            "compiled" => JasmTokenType::CompiledKw,
225            "from" => JasmTokenType::FromKw,
226            "innerclass" => JasmTokenType::InnerClassKw,
227            "nestmembers" => JasmTokenType::NestMembersKw,
228            "bootstrapmethod" => JasmTokenType::BootstrapMethodKw,
229
230            // Access modifiers
231            "public" => JasmTokenType::Public,
232            "private" => JasmTokenType::Private,
233            "protected" => JasmTokenType::Protected,
234            "static" => JasmTokenType::Static,
235            "super" => JasmTokenType::Super,
236            "final" => JasmTokenType::Final,
237            "abstract" => JasmTokenType::Abstract,
238            "synchronized" => JasmTokenType::Synchronized,
239            "native" => JasmTokenType::Native,
240            "synthetic" => JasmTokenType::Synthetic,
241            "deprecated" => JasmTokenType::Deprecated,
242            "varargs" => JasmTokenType::Varargs,
243
244            // Bytecode instructions
245            "aload_0" => JasmTokenType::ALoad0,
246            "aload_1" => JasmTokenType::ALoad1,
247            "aload_2" => JasmTokenType::ALoad2,
248            "aload_3" => JasmTokenType::ALoad3,
249            "iload_0" => JasmTokenType::ILoad0,
250            "iload_1" => JasmTokenType::ILoad1,
251            "iload_2" => JasmTokenType::ILoad2,
252            "iload_3" => JasmTokenType::ILoad3,
253            "ldc" => JasmTokenType::Ldc,
254            "ldc_w" => JasmTokenType::LdcW,
255            "ldc2_w" => JasmTokenType::Ldc2W,
256            "invokespecial" => JasmTokenType::InvokeSpecial,
257            "invokevirtual" => JasmTokenType::InvokeVirtual,
258            "invokestatic" => JasmTokenType::InvokeStatic,
259            "invokeinterface" => JasmTokenType::InvokeInterface,
260            "invokedynamic" => JasmTokenType::InvokeDynamic,
261            "getstatic" => JasmTokenType::GetStatic,
262            "putstatic" => JasmTokenType::PutStatic,
263            "getfield" => JasmTokenType::GetField,
264            "putfield" => JasmTokenType::PutField,
265            "return" => JasmTokenType::Return,
266            "ireturn" => JasmTokenType::IReturn,
267            "areturn" => JasmTokenType::AReturn,
268            "lreturn" => JasmTokenType::LReturn,
269            "freturn" => JasmTokenType::FReturn,
270            "dreturn" => JasmTokenType::DReturn,
271            "nop" => JasmTokenType::Nop,
272            "dup" => JasmTokenType::Dup,
273            "pop" => JasmTokenType::Pop,
274            "new" => JasmTokenType::New,
275
276            // Default to identifier
277            _ => JasmTokenType::Identifier,
278        }
279    }
280
281    /// Handles punctuation marks.
282    fn lex_punctuation<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
283        let start = state.get_position();
284
285        if let Some(ch) = state.current() {
286            let kind = match ch {
287                '{' => JasmTokenType::LeftBrace,
288                '}' => JasmTokenType::RightBrace,
289                '(' => JasmTokenType::LeftParen,
290                ')' => JasmTokenType::RightParen,
291                '[' => JasmTokenType::LeftBracket,
292                ']' => JasmTokenType::RightBracket,
293                ':' => JasmTokenType::Colon,
294                ';' => JasmTokenType::Semicolon,
295                '.' => JasmTokenType::Dot,
296                ',' => JasmTokenType::Comma,
297                '/' => JasmTokenType::Slash,
298                _ => return false,
299            };
300
301            state.advance(ch.len_utf8());
302            state.add_token(kind, start, state.get_position());
303            return true;
304        }
305
306        false
307    }
308}