Skip to main content

oak_java/lexer/
mod.rs

1//! Java lexer implementation.
2
3/// Java token types.
4pub mod token_type;
5
6use crate::{language::JavaLanguage, lexer::token_type::JavaTokenType};
7use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
8
9pub(crate) type State<'a, S> = LexerState<'a, S, JavaLanguage>;
10
11/// Java lexer.
12#[derive(Clone, Debug)]
13pub struct JavaLexer<'config> {
14    _config: &'config JavaLanguage,
15}
16
17impl<'config> Lexer<JavaLanguage> for JavaLexer<'config> {
18    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<JavaLanguage>) -> LexOutput<JavaLanguage> {
19        let mut state = State::new(source);
20        let result = self.run(&mut state);
21        if result.is_ok() {
22            state.add_eof();
23        }
24        state.finish_with_cache(result, cache)
25    }
26}
27
28impl<'config> JavaLexer<'config> {
29    /// Create a new Java lexer.
30    pub fn new(config: &'config JavaLanguage) -> Self {
31        Self { _config: config }
32    }
33
34    /// Main lexing loop
35    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
36        while state.not_at_end() {
37            let safe_point = state.get_position();
38
39            if self.skip_whitespace(state) {
40                continue;
41            }
42
43            if self.lex_newline(state) {
44                continue;
45            }
46
47            if self.skip_comment(state) {
48                continue;
49            }
50
51            if self.lex_string_literal(state) {
52                continue;
53            }
54
55            if self.lex_char_literal(state) {
56                continue;
57            }
58
59            if self.lex_number_literal(state) {
60                continue;
61            }
62
63            if self.lex_identifier_or_keyword(state) {
64                continue;
65            }
66
67            if self.lex_operator_or_delimiter(state) {
68                continue;
69            }
70
71            // If no rule matches, advance one character and mark as error
72            let start_pos = state.get_position();
73            if let Some(ch) = state.peek() {
74                state.advance(ch.len_utf8());
75                state.add_token(JavaTokenType::Error, start_pos, state.get_position());
76            }
77
78            state.advance_if_dead_lock(safe_point);
79        }
80
81        Ok(())
82    }
83
84    /// Skip whitespace characters (excluding newlines)
85    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
86        let start = state.get_position();
87
88        while let Some(ch) = state.peek() {
89            if ch == ' ' || ch == '\t' || ch == '\r' {
90                state.advance(ch.len_utf8());
91            }
92            else {
93                break;
94            }
95        }
96
97        if state.get_position() > start {
98            state.add_token(JavaTokenType::Whitespace, start, state.get_position());
99            return true;
100        }
101        false
102    }
103
104    /// Handle newline
105    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
106        let start = state.get_position();
107
108        if let Some('\n') = state.peek() {
109            state.advance(1);
110            state.add_token(JavaTokenType::Whitespace, start, state.get_position());
111            true
112        }
113        else {
114            false
115        }
116    }
117
118    /// Skip comments
119    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
120        let start = state.get_position();
121
122        // Single-line comment //
123        if state.peek() == Some('/') && state.peek_next_n(1) == Some('/') {
124            state.advance(2);
125            while let Some(ch) = state.peek() {
126                if ch == '\n' {
127                    break;
128                }
129                state.advance(ch.len_utf8());
130            }
131            state.add_token(JavaTokenType::LineComment, start, state.get_position());
132            return true;
133        }
134
135        // Multi-line comment /* */
136        if state.peek() == Some('/') && state.peek_next_n(1) == Some('*') {
137            let start = state.get_position();
138            state.advance(2);
139            while let Some(ch) = state.peek() {
140                if ch == '*' && state.peek_next_n(1) == Some('/') {
141                    state.advance(2);
142                    break;
143                }
144                state.advance(ch.len_utf8());
145            }
146            state.add_token(JavaTokenType::BlockComment, start, state.get_position());
147            return true;
148        }
149
150        false
151    }
152
153    /// Handle string literal
154    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
155        let start = state.get_position();
156
157        if let Some('"') = state.peek() {
158            state.advance(1);
159
160            while let Some(ch) = state.peek() {
161                if ch == '"' {
162                    state.advance(1);
163                    break;
164                }
165                else if ch == '\\' {
166                    state.advance(1);
167                    if let Some(escaped) = state.peek() {
168                        state.advance(escaped.len_utf8());
169                    }
170                }
171                else if ch == '\n' {
172                    // Unclosed string
173                    break;
174                }
175                else {
176                    state.advance(ch.len_utf8());
177                }
178            }
179
180            state.add_token(JavaTokenType::StringLiteral, start, state.get_position());
181            return true;
182        }
183
184        false
185    }
186
187    /// Handle character literal
188    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
189        let start = state.get_position();
190
191        if let Some('\'') = state.peek() {
192            state.advance(1);
193
194            if let Some(ch) = state.peek() {
195                if ch == '\\' {
196                    state.advance(1);
197                    if let Some(escaped) = state.peek() {
198                        state.advance(escaped.len_utf8());
199                    }
200                }
201                else if ch != '\'' && ch != '\n' {
202                    state.advance(ch.len_utf8());
203                }
204            }
205
206            if let Some('\'') = state.peek() {
207                state.advance(1);
208            }
209
210            state.add_token(JavaTokenType::CharacterLiteral, start, state.get_position());
211            return true;
212        }
213
214        false
215    }
216
217    /// Handle number literal
218    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
219        let start = state.get_position();
220
221        if let Some(ch) = state.peek() {
222            if ch.is_ascii_digit() {
223                // Handle integer part
224                while let Some(ch) = state.peek() {
225                    if ch.is_ascii_digit() {
226                        state.advance(ch.len_utf8());
227                    }
228                    else {
229                        break;
230                    }
231                }
232
233                // Handle fractional part
234                if state.peek() == Some('.') && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
235                    state.advance(1); // '.'
236                    while let Some(ch) = state.peek() {
237                        if ch.is_ascii_digit() {
238                            state.advance(ch.len_utf8());
239                        }
240                        else {
241                            break;
242                        }
243                    }
244                }
245
246                // Handle exponent part
247                if let Some(ch) = state.peek() {
248                    if ch == 'e' || ch == 'E' {
249                        state.advance(1);
250                        if let Some(sign) = state.peek() {
251                            if sign == '+' || sign == '-' {
252                                state.advance(1);
253                            }
254                        }
255                        while let Some(ch) = state.peek() {
256                            if ch.is_ascii_digit() {
257                                state.advance(ch.len_utf8());
258                            }
259                            else {
260                                break;
261                            }
262                        }
263                    }
264                }
265
266                // Handle suffix
267                if let Some(suffix) = state.peek() {
268                    if suffix == 'f' || suffix == 'F' || suffix == 'd' || suffix == 'D' || suffix == 'l' || suffix == 'L' {
269                        state.advance(1);
270                    }
271                }
272
273                let text = state.get_text_in((start..state.get_position()).into());
274                let kind = if text.contains('.') || text.contains('e') || text.contains('E') || text.ends_with('f') || text.ends_with('F') || text.ends_with('d') || text.ends_with('D') {
275                    JavaTokenType::FloatingPointLiteral
276                }
277                else {
278                    JavaTokenType::IntegerLiteral
279                };
280
281                eprintln!("DEBUG: Lexer classified '{}' as {:?} at {}..{}", text, kind, start, state.get_position());
282                state.add_token(kind, start, state.get_position());
283                return true;
284            }
285        }
286        false
287    }
288
289    /// Handle identifier or keyword
290    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
291        let start = state.get_position();
292
293        if let Some(ch) = state.peek() {
294            if ch.is_ascii_alphabetic() || ch == '_' || ch == '$' {
295                state.advance(ch.len_utf8());
296
297                while let Some(ch) = state.peek() {
298                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' {
299                        state.advance(ch.len_utf8());
300                    }
301                    else {
302                        break;
303                    }
304                }
305
306                let text = state.get_text_in((start..state.get_position()).into());
307                let token_kind = self.classify_identifier(text.as_ref());
308
309                eprintln!("DEBUG: Lexer classified '{}' as {:?} at {}..{}", text, token_kind, start, state.get_position());
310                state.add_token(token_kind, start, state.get_position());
311                true
312            }
313            else {
314                false
315            }
316        }
317        else {
318            false
319        }
320    }
321
322    /// Classify identifier as keyword or plain identifier
323    fn classify_identifier(&self, text: &str) -> JavaTokenType {
324        match text {
325            "abstract" => JavaTokenType::Abstract,
326            "assert" => JavaTokenType::Assert,
327            "boolean" => JavaTokenType::Boolean,
328            "break" => JavaTokenType::Break,
329            "byte" => JavaTokenType::Byte,
330            "case" => JavaTokenType::Case,
331            "catch" => JavaTokenType::Catch,
332            "char" => JavaTokenType::Char,
333            "class" => JavaTokenType::Class,
334            "const" => JavaTokenType::Const,
335            "continue" => JavaTokenType::Continue,
336            "default" => JavaTokenType::Default,
337            "do" => JavaTokenType::Do,
338            "double" => JavaTokenType::Double,
339            "else" => JavaTokenType::Else,
340            "enum" => JavaTokenType::Enum,
341            "extends" => JavaTokenType::Extends,
342            "final" => JavaTokenType::Final,
343            "finally" => JavaTokenType::Finally,
344            "float" => JavaTokenType::Float,
345            "for" => JavaTokenType::For,
346            "goto" => JavaTokenType::Goto,
347            "if" => JavaTokenType::If,
348            "implements" => JavaTokenType::Implements,
349            "import" => JavaTokenType::Import,
350            "instanceof" => JavaTokenType::Instanceof,
351            "int" => JavaTokenType::Int,
352            "interface" => JavaTokenType::Interface,
353            "long" => JavaTokenType::Long,
354            "native" => JavaTokenType::Native,
355            "new" => JavaTokenType::New,
356            "package" => JavaTokenType::Package,
357            "private" => JavaTokenType::Private,
358            "protected" => JavaTokenType::Protected,
359            "public" => JavaTokenType::Public,
360            "record" => JavaTokenType::Record,
361            "return" => JavaTokenType::Return,
362            "short" => JavaTokenType::Short,
363            "static" => JavaTokenType::Static,
364            "strictfp" => JavaTokenType::Strictfp,
365            "struct" => JavaTokenType::Struct,
366            "super" => JavaTokenType::Super,
367            "switch" => JavaTokenType::Switch,
368            "synchronized" => JavaTokenType::Synchronized,
369            "this" => JavaTokenType::This,
370            "throw" => JavaTokenType::Throw,
371            "throws" => JavaTokenType::Throws,
372            "transient" => JavaTokenType::Transient,
373            "try" => JavaTokenType::Try,
374            "void" => JavaTokenType::Void,
375            "volatile" => JavaTokenType::Volatile,
376            "while" => JavaTokenType::While,
377            "true" | "false" => JavaTokenType::BooleanLiteral,
378            "null" => JavaTokenType::NullLiteral,
379            _ => JavaTokenType::Identifier,
380        }
381    }
382
383    /// Handle operator and delimiter
384    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
385        let start = state.get_position();
386
387        if let Some(ch) = state.peek() {
388            let token_kind = match ch {
389                '+' => {
390                    state.advance(1);
391                    if state.peek() == Some('+') {
392                        state.advance(1);
393                        JavaTokenType::PlusPlus
394                    }
395                    else if state.peek() == Some('=') {
396                        state.advance(1);
397                        JavaTokenType::PlusEquals
398                    }
399                    else {
400                        JavaTokenType::Plus
401                    }
402                }
403                '-' => {
404                    state.advance(1);
405                    if state.peek() == Some('-') {
406                        state.advance(1);
407                        JavaTokenType::MinusMinus
408                    }
409                    else if state.peek() == Some('=') {
410                        state.advance(1);
411                        JavaTokenType::MinusEquals
412                    }
413                    else {
414                        JavaTokenType::Minus
415                    }
416                }
417                '*' => {
418                    state.advance(1);
419                    if state.peek() == Some('=') {
420                        state.advance(1);
421                        JavaTokenType::AsteriskEquals
422                    }
423                    else {
424                        JavaTokenType::Asterisk
425                    }
426                }
427                '/' => {
428                    state.advance(1);
429                    if state.peek() == Some('=') {
430                        state.advance(1);
431                        JavaTokenType::SlashEquals
432                    }
433                    else {
434                        JavaTokenType::Slash
435                    }
436                }
437                '%' => {
438                    state.advance(1);
439                    if state.peek() == Some('=') {
440                        state.advance(1);
441                        JavaTokenType::PercentEquals
442                    }
443                    else {
444                        JavaTokenType::Percent
445                    }
446                }
447                '=' => {
448                    state.advance(1);
449                    if state.peek() == Some('=') {
450                        state.advance(1);
451                        JavaTokenType::Equals
452                    }
453                    else {
454                        JavaTokenType::Assign
455                    }
456                }
457                '!' => {
458                    state.advance(1);
459                    if state.peek() == Some('=') {
460                        state.advance(1);
461                        JavaTokenType::BangEquals
462                    }
463                    else {
464                        JavaTokenType::Bang
465                    }
466                }
467                '<' => {
468                    state.advance(1);
469                    if state.peek() == Some('=') {
470                        state.advance(1);
471                        JavaTokenType::LessThanEquals
472                    }
473                    else if state.peek() == Some('<') {
474                        state.advance(1);
475                        if state.peek() == Some('=') {
476                            state.advance(1);
477                            JavaTokenType::LeftShiftEquals
478                        }
479                        else {
480                            JavaTokenType::LeftShift
481                        }
482                    }
483                    else {
484                        JavaTokenType::LessThan
485                    }
486                }
487                '>' => {
488                    state.advance(1);
489                    if state.peek() == Some('=') {
490                        state.advance(1);
491                        JavaTokenType::GreaterThanEquals
492                    }
493                    else if state.peek() == Some('>') {
494                        state.advance(1);
495                        if state.peek() == Some('>') {
496                            state.advance(1);
497                            if state.peek() == Some('=') {
498                                state.advance(1);
499                                JavaTokenType::UnsignedRightShiftEquals
500                            }
501                            else {
502                                JavaTokenType::UnsignedRightShift
503                            }
504                        }
505                        else if state.peek() == Some('=') {
506                            state.advance(1);
507                            JavaTokenType::RightShiftEquals
508                        }
509                        else {
510                            JavaTokenType::RightShift
511                        }
512                    }
513                    else {
514                        JavaTokenType::GreaterThan
515                    }
516                }
517                '&' => {
518                    state.advance(1);
519                    if state.peek() == Some('&') {
520                        state.advance(1);
521                        JavaTokenType::AmpersandAmpersand
522                    }
523                    else if state.peek() == Some('=') {
524                        state.advance(1);
525                        JavaTokenType::AmpersandEquals
526                    }
527                    else {
528                        JavaTokenType::Ampersand
529                    }
530                }
531                '|' => {
532                    state.advance(1);
533                    if state.peek() == Some('|') {
534                        state.advance(1);
535                        JavaTokenType::PipePipe
536                    }
537                    else if state.peek() == Some('=') {
538                        state.advance(1);
539                        JavaTokenType::PipeEquals
540                    }
541                    else {
542                        JavaTokenType::Pipe
543                    }
544                }
545                '^' => {
546                    state.advance(1);
547                    if state.peek() == Some('=') {
548                        state.advance(1);
549                        JavaTokenType::CaretEquals
550                    }
551                    else {
552                        JavaTokenType::Caret
553                    }
554                }
555                '~' => {
556                    state.advance(1);
557                    JavaTokenType::Tilde
558                }
559                '?' => {
560                    state.advance(1);
561                    JavaTokenType::Question
562                }
563                ':' => {
564                    state.advance(1);
565                    JavaTokenType::Colon
566                }
567                ';' => {
568                    state.advance(1);
569                    JavaTokenType::Semicolon
570                }
571                ',' => {
572                    state.advance(1);
573                    JavaTokenType::Comma
574                }
575                '.' => {
576                    state.advance(1);
577                    if state.peek() == Some('.') && state.peek_next_n(1) == Some('.') {
578                        state.advance(2);
579                        JavaTokenType::Ellipsis
580                    }
581                    else {
582                        JavaTokenType::Dot
583                    }
584                }
585                '(' => {
586                    state.advance(1);
587                    JavaTokenType::LeftParen
588                }
589                ')' => {
590                    state.advance(1);
591                    JavaTokenType::RightParen
592                }
593                '{' => {
594                    state.advance(1);
595                    JavaTokenType::LeftBrace
596                }
597                '}' => {
598                    state.advance(1);
599                    JavaTokenType::RightBrace
600                }
601                '[' => {
602                    state.advance(1);
603                    JavaTokenType::LeftBracket
604                }
605                ']' => {
606                    state.advance(1);
607                    JavaTokenType::RightBracket
608                }
609                '@' => {
610                    state.advance(1);
611                    JavaTokenType::At
612                }
613                _ => return false,
614            };
615
616            state.add_token(token_kind, start, state.get_position());
617            true
618        }
619        else {
620            false
621        }
622    }
623}