Skip to main content

oak_java/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::JavaLanguage, lexer::token_type::JavaTokenType};
5use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
6
7type State<'a, S> = LexerState<'a, S, JavaLanguage>;
8
9#[derive(Clone, Debug)]
10pub struct JavaLexer<'config> {
11    _config: &'config JavaLanguage,
12}
13
14impl<'config> Lexer<JavaLanguage> for JavaLexer<'config> {
15    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<JavaLanguage>) -> LexOutput<JavaLanguage> {
16        let mut state = State::new(source);
17        let result = self.run(&mut state);
18        if result.is_ok() {
19            state.add_eof();
20        }
21        state.finish_with_cache(result, cache)
22    }
23}
24
25impl<'config> JavaLexer<'config> {
26    pub fn new(config: &'config JavaLanguage) -> Self {
27        Self { _config: config }
28    }
29
30    /// 主要的词法分析循环
31    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
32        while state.not_at_end() {
33            let safe_point = state.get_position();
34
35            if self.skip_whitespace(state) {
36                continue;
37            }
38
39            if self.lex_newline(state) {
40                continue;
41            }
42
43            if self.skip_comment(state) {
44                continue;
45            }
46
47            if self.lex_string_literal(state) {
48                continue;
49            }
50
51            if self.lex_char_literal(state) {
52                continue;
53            }
54
55            if self.lex_number_literal(state) {
56                continue;
57            }
58
59            if self.lex_identifier_or_keyword(state) {
60                continue;
61            }
62
63            if self.lex_operator_or_delimiter(state) {
64                continue;
65            }
66
67            // 如果没有匹配到任何规则,前进一个字符并标记为错误
68            let start_pos = state.get_position();
69            if let Some(ch) = state.peek() {
70                state.advance(ch.len_utf8());
71                state.add_token(JavaTokenType::Error, start_pos, state.get_position());
72            }
73
74            state.advance_if_dead_lock(safe_point);
75        }
76
77        Ok(())
78    }
79
80    /// 跳过空白字符(不包括换行符)
81    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
82        let start = state.get_position();
83
84        while let Some(ch) = state.peek() {
85            if ch == ' ' || ch == '\t' || ch == '\r' {
86                state.advance(ch.len_utf8());
87            }
88            else {
89                break;
90            }
91        }
92
93        if state.get_position() > start {
94            state.add_token(JavaTokenType::Whitespace, start, state.get_position());
95            return true;
96        }
97        false
98    }
99
100    /// 处理换行
101    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
102        let start = state.get_position();
103
104        if let Some('\n') = state.peek() {
105            state.advance(1);
106            state.add_token(JavaTokenType::Whitespace, start, state.get_position());
107            true
108        }
109        else {
110            false
111        }
112    }
113
114    /// 跳过注释
115    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
116        let start = state.get_position();
117
118        // 单行注释 //
119        if state.peek() == Some('/') && state.peek_next_n(1) == Some('/') {
120            state.advance(2);
121            while let Some(ch) = state.peek() {
122                if ch == '\n' {
123                    break;
124                }
125                state.advance(ch.len_utf8());
126            }
127            state.add_token(JavaTokenType::LineComment, start, state.get_position());
128            return true;
129        }
130
131        // 多行注释 /* */
132        if state.peek() == Some('/') && state.peek_next_n(1) == Some('*') {
133            let start = state.get_position();
134            state.advance(2);
135            while let Some(ch) = state.peek() {
136                if ch == '*' && state.peek_next_n(1) == Some('/') {
137                    state.advance(2);
138                    break;
139                }
140                state.advance(ch.len_utf8());
141            }
142            state.add_token(JavaTokenType::BlockComment, start, state.get_position());
143            return true;
144        }
145
146        false
147    }
148
149    /// 处理字符串字面量
150    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
151        let start = state.get_position();
152
153        if let Some('"') = state.peek() {
154            state.advance(1);
155
156            while let Some(ch) = state.peek() {
157                if ch == '"' {
158                    state.advance(1);
159                    break;
160                }
161                else if ch == '\\' {
162                    state.advance(1);
163                    if let Some(escaped) = state.peek() {
164                        state.advance(escaped.len_utf8());
165                    }
166                }
167                else if ch == '\n' {
168                    // 未闭合的字符串
169                    break;
170                }
171                else {
172                    state.advance(ch.len_utf8());
173                }
174            }
175
176            state.add_token(JavaTokenType::StringLiteral, start, state.get_position());
177            return true;
178        }
179
180        false
181    }
182
183    /// 处理字符字面�?
184    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
185        let start = state.get_position();
186
187        if let Some('\'') = state.peek() {
188            state.advance(1);
189
190            if let Some(ch) = state.peek() {
191                if ch == '\\' {
192                    state.advance(1);
193                    if let Some(escaped) = state.peek() {
194                        state.advance(escaped.len_utf8());
195                    }
196                }
197                else if ch != '\'' && ch != '\n' {
198                    state.advance(ch.len_utf8());
199                }
200            }
201
202            if let Some('\'') = state.peek() {
203                state.advance(1);
204            }
205
206            state.add_token(JavaTokenType::CharacterLiteral, start, state.get_position());
207            return true;
208        }
209
210        false
211    }
212
213    /// 处理数字字面�?
214    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
215        let start = state.get_position();
216
217        if let Some(ch) = state.peek() {
218            if ch.is_ascii_digit() {
219                // 处理整数部分
220                while let Some(ch) = state.peek() {
221                    if ch.is_ascii_digit() {
222                        state.advance(ch.len_utf8());
223                    }
224                    else {
225                        break;
226                    }
227                }
228
229                // 处理小数部分
230                if state.peek() == Some('.') && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
231                    state.advance(1); // '.'
232                    while let Some(ch) = state.peek() {
233                        if ch.is_ascii_digit() {
234                            state.advance(ch.len_utf8());
235                        }
236                        else {
237                            break;
238                        }
239                    }
240                }
241
242                // 处理指数部分
243                if let Some(ch) = state.peek() {
244                    if ch == 'e' || ch == 'E' {
245                        state.advance(1);
246                        if let Some(sign) = state.peek() {
247                            if sign == '+' || sign == '-' {
248                                state.advance(1);
249                            }
250                        }
251                        while let Some(ch) = state.peek() {
252                            if ch.is_ascii_digit() {
253                                state.advance(ch.len_utf8());
254                            }
255                            else {
256                                break;
257                            }
258                        }
259                    }
260                }
261
262                // 处理后缀
263                if let Some(suffix) = state.peek() {
264                    if suffix == 'f' || suffix == 'F' || suffix == 'd' || suffix == 'D' || suffix == 'l' || suffix == 'L' {
265                        state.advance(1);
266                    }
267                }
268
269                let text = state.get_text_in((start..state.get_position()).into());
270                let kind = if text.contains('.') || text.contains('e') || text.contains('E') || text.ends_with('f') || text.ends_with('F') || text.ends_with('d') || text.ends_with('D') {
271                    JavaTokenType::FloatingPointLiteral
272                }
273                else {
274                    JavaTokenType::IntegerLiteral
275                };
276
277                eprintln!("DEBUG: Lexer classified '{}' as {:?} at {}..{}", text, kind, start, state.get_position());
278                state.add_token(kind, start, state.get_position());
279                return true;
280            }
281        }
282        false
283    }
284
285    /// 处理标识符或关键�?
286    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
287        let start = state.get_position();
288
289        if let Some(ch) = state.peek() {
290            if ch.is_ascii_alphabetic() || ch == '_' || ch == '$' {
291                state.advance(ch.len_utf8());
292
293                while let Some(ch) = state.peek() {
294                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' {
295                        state.advance(ch.len_utf8());
296                    }
297                    else {
298                        break;
299                    }
300                }
301
302                let text = state.get_text_in((start..state.get_position()).into());
303                let token_kind = self.classify_identifier(text.as_ref());
304
305                eprintln!("DEBUG: Lexer classified '{}' as {:?} at {}..{}", text, token_kind, start, state.get_position());
306                state.add_token(token_kind, start, state.get_position());
307                true
308            }
309            else {
310                false
311            }
312        }
313        else {
314            false
315        }
316    }
317
318    /// 分类标识符为关键字或普通标识符
319    fn classify_identifier(&self, text: &str) -> JavaTokenType {
320        match text {
321            "abstract" => JavaTokenType::Abstract,
322            "assert" => JavaTokenType::Assert,
323            "boolean" => JavaTokenType::Boolean,
324            "break" => JavaTokenType::Break,
325            "byte" => JavaTokenType::Byte,
326            "case" => JavaTokenType::Case,
327            "catch" => JavaTokenType::Catch,
328            "char" => JavaTokenType::Char,
329            "class" => JavaTokenType::Class,
330            "const" => JavaTokenType::Const,
331            "continue" => JavaTokenType::Continue,
332            "default" => JavaTokenType::Default,
333            "do" => JavaTokenType::Do,
334            "double" => JavaTokenType::Double,
335            "else" => JavaTokenType::Else,
336            "enum" => JavaTokenType::Enum,
337            "extends" => JavaTokenType::Extends,
338            "final" => JavaTokenType::Final,
339            "finally" => JavaTokenType::Finally,
340            "float" => JavaTokenType::Float,
341            "for" => JavaTokenType::For,
342            "goto" => JavaTokenType::Goto,
343            "if" => JavaTokenType::If,
344            "implements" => JavaTokenType::Implements,
345            "import" => JavaTokenType::Import,
346            "instanceof" => JavaTokenType::Instanceof,
347            "int" => JavaTokenType::Int,
348            "interface" => JavaTokenType::Interface,
349            "long" => JavaTokenType::Long,
350            "native" => JavaTokenType::Native,
351            "new" => JavaTokenType::New,
352            "package" => JavaTokenType::Package,
353            "private" => JavaTokenType::Private,
354            "protected" => JavaTokenType::Protected,
355            "public" => JavaTokenType::Public,
356            "record" => JavaTokenType::Record,
357            "return" => JavaTokenType::Return,
358            "short" => JavaTokenType::Short,
359            "static" => JavaTokenType::Static,
360            "strictfp" => JavaTokenType::Strictfp,
361            "struct" => JavaTokenType::Struct,
362            "super" => JavaTokenType::Super,
363            "switch" => JavaTokenType::Switch,
364            "synchronized" => JavaTokenType::Synchronized,
365            "this" => JavaTokenType::This,
366            "throw" => JavaTokenType::Throw,
367            "throws" => JavaTokenType::Throws,
368            "transient" => JavaTokenType::Transient,
369            "try" => JavaTokenType::Try,
370            "void" => JavaTokenType::Void,
371            "volatile" => JavaTokenType::Volatile,
372            "while" => JavaTokenType::While,
373            "true" | "false" => JavaTokenType::BooleanLiteral,
374            "null" => JavaTokenType::NullLiteral,
375            _ => JavaTokenType::Identifier,
376        }
377    }
378
379    /// 处理操作符和分隔�?
380    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
381        let start = state.get_position();
382
383        if let Some(ch) = state.peek() {
384            let token_kind = match ch {
385                '+' => {
386                    state.advance(1);
387                    if state.peek() == Some('+') {
388                        state.advance(1);
389                        JavaTokenType::PlusPlus
390                    }
391                    else if state.peek() == Some('=') {
392                        state.advance(1);
393                        JavaTokenType::PlusEquals
394                    }
395                    else {
396                        JavaTokenType::Plus
397                    }
398                }
399                '-' => {
400                    state.advance(1);
401                    if state.peek() == Some('-') {
402                        state.advance(1);
403                        JavaTokenType::MinusMinus
404                    }
405                    else if state.peek() == Some('=') {
406                        state.advance(1);
407                        JavaTokenType::MinusEquals
408                    }
409                    else {
410                        JavaTokenType::Minus
411                    }
412                }
413                '*' => {
414                    state.advance(1);
415                    if state.peek() == Some('=') {
416                        state.advance(1);
417                        JavaTokenType::AsteriskEquals
418                    }
419                    else {
420                        JavaTokenType::Asterisk
421                    }
422                }
423                '/' => {
424                    state.advance(1);
425                    if state.peek() == Some('=') {
426                        state.advance(1);
427                        JavaTokenType::SlashEquals
428                    }
429                    else {
430                        JavaTokenType::Slash
431                    }
432                }
433                '%' => {
434                    state.advance(1);
435                    if state.peek() == Some('=') {
436                        state.advance(1);
437                        JavaTokenType::PercentEquals
438                    }
439                    else {
440                        JavaTokenType::Percent
441                    }
442                }
443                '=' => {
444                    state.advance(1);
445                    if state.peek() == Some('=') {
446                        state.advance(1);
447                        JavaTokenType::Equals
448                    }
449                    else {
450                        JavaTokenType::Assign
451                    }
452                }
453                '!' => {
454                    state.advance(1);
455                    if state.peek() == Some('=') {
456                        state.advance(1);
457                        JavaTokenType::BangEquals
458                    }
459                    else {
460                        JavaTokenType::Bang
461                    }
462                }
463                '<' => {
464                    state.advance(1);
465                    if state.peek() == Some('=') {
466                        state.advance(1);
467                        JavaTokenType::LessThanEquals
468                    }
469                    else if state.peek() == Some('<') {
470                        state.advance(1);
471                        if state.peek() == Some('=') {
472                            state.advance(1);
473                            JavaTokenType::LeftShiftEquals
474                        }
475                        else {
476                            JavaTokenType::LeftShift
477                        }
478                    }
479                    else {
480                        JavaTokenType::LessThan
481                    }
482                }
483                '>' => {
484                    state.advance(1);
485                    if state.peek() == Some('=') {
486                        state.advance(1);
487                        JavaTokenType::GreaterThanEquals
488                    }
489                    else if state.peek() == Some('>') {
490                        state.advance(1);
491                        if state.peek() == Some('>') {
492                            state.advance(1);
493                            if state.peek() == Some('=') {
494                                state.advance(1);
495                                JavaTokenType::UnsignedRightShiftEquals
496                            }
497                            else {
498                                JavaTokenType::UnsignedRightShift
499                            }
500                        }
501                        else if state.peek() == Some('=') {
502                            state.advance(1);
503                            JavaTokenType::RightShiftEquals
504                        }
505                        else {
506                            JavaTokenType::RightShift
507                        }
508                    }
509                    else {
510                        JavaTokenType::GreaterThan
511                    }
512                }
513                '&' => {
514                    state.advance(1);
515                    if state.peek() == Some('&') {
516                        state.advance(1);
517                        JavaTokenType::AmpersandAmpersand
518                    }
519                    else if state.peek() == Some('=') {
520                        state.advance(1);
521                        JavaTokenType::AmpersandEquals
522                    }
523                    else {
524                        JavaTokenType::Ampersand
525                    }
526                }
527                '|' => {
528                    state.advance(1);
529                    if state.peek() == Some('|') {
530                        state.advance(1);
531                        JavaTokenType::PipePipe
532                    }
533                    else if state.peek() == Some('=') {
534                        state.advance(1);
535                        JavaTokenType::PipeEquals
536                    }
537                    else {
538                        JavaTokenType::Pipe
539                    }
540                }
541                '^' => {
542                    state.advance(1);
543                    if state.peek() == Some('=') {
544                        state.advance(1);
545                        JavaTokenType::CaretEquals
546                    }
547                    else {
548                        JavaTokenType::Caret
549                    }
550                }
551                '~' => {
552                    state.advance(1);
553                    JavaTokenType::Tilde
554                }
555                '?' => {
556                    state.advance(1);
557                    JavaTokenType::Question
558                }
559                ':' => {
560                    state.advance(1);
561                    JavaTokenType::Colon
562                }
563                ';' => {
564                    state.advance(1);
565                    JavaTokenType::Semicolon
566                }
567                ',' => {
568                    state.advance(1);
569                    JavaTokenType::Comma
570                }
571                '.' => {
572                    state.advance(1);
573                    if state.peek() == Some('.') && state.peek_next_n(1) == Some('.') {
574                        state.advance(2);
575                        JavaTokenType::Ellipsis
576                    }
577                    else {
578                        JavaTokenType::Dot
579                    }
580                }
581                '(' => {
582                    state.advance(1);
583                    JavaTokenType::LeftParen
584                }
585                ')' => {
586                    state.advance(1);
587                    JavaTokenType::RightParen
588                }
589                '{' => {
590                    state.advance(1);
591                    JavaTokenType::LeftBrace
592                }
593                '}' => {
594                    state.advance(1);
595                    JavaTokenType::RightBrace
596                }
597                '[' => {
598                    state.advance(1);
599                    JavaTokenType::LeftBracket
600                }
601                ']' => {
602                    state.advance(1);
603                    JavaTokenType::RightBracket
604                }
605                '@' => {
606                    state.advance(1);
607                    JavaTokenType::At
608                }
609                _ => return false,
610            };
611
612            state.add_token(token_kind, start, state.get_position());
613            true
614        }
615        else {
616            false
617        }
618    }
619}