Skip to main content

oak_java/lexer/
mod.rs

1use crate::{kind::JavaSyntaxKind, language::JavaLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
3
4type State<'a, S> = LexerState<'a, S, JavaLanguage>;
5
6#[derive(Clone, Debug)]
7pub struct JavaLexer<'config> {
8    _config: &'config JavaLanguage,
9}
10
11impl<'config> Lexer<JavaLanguage> for JavaLexer<'config> {
12    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<JavaLanguage>) -> LexOutput<JavaLanguage> {
13        let mut state = State::new(source);
14        let result = self.run(&mut state);
15        if result.is_ok() {
16            state.add_eof();
17        }
18        state.finish_with_cache(result, cache)
19    }
20}
21
22impl<'config> JavaLexer<'config> {
23    pub fn new(config: &'config JavaLanguage) -> Self {
24        Self { _config: config }
25    }
26
27    /// 主要的词法分析循环
28    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
29        while state.not_at_end() {
30            let safe_point = state.get_position();
31
32            if self.skip_whitespace(state) {
33                continue;
34            }
35
36            if self.lex_newline(state) {
37                continue;
38            }
39
40            if self.skip_comment(state) {
41                continue;
42            }
43
44            if self.lex_string_literal(state) {
45                continue;
46            }
47
48            if self.lex_char_literal(state) {
49                continue;
50            }
51
52            if self.lex_number_literal(state) {
53                continue;
54            }
55
56            if self.lex_identifier_or_keyword(state) {
57                continue;
58            }
59
60            if self.lex_operator_or_delimiter(state) {
61                continue;
62            }
63
64            // 如果没有匹配到任何规则,前进一个字符并标记为错误
65            let start_pos = state.get_position();
66            if let Some(ch) = state.peek() {
67                state.advance(ch.len_utf8());
68                state.add_token(JavaSyntaxKind::Error, start_pos, state.get_position());
69            }
70
71            state.advance_if_dead_lock(safe_point);
72        }
73
74        Ok(())
75    }
76
77    /// 跳过空白字符(不包括换行符)
78    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
79        let start = state.get_position();
80
81        while let Some(ch) = state.peek() {
82            if ch == ' ' || ch == '\t' || ch == '\r' {
83                state.advance(ch.len_utf8());
84            }
85            else {
86                break;
87            }
88        }
89
90        if state.get_position() > start {
91            state.add_token(JavaSyntaxKind::Whitespace, start, state.get_position());
92            return true;
93        }
94        false
95    }
96
97    /// 处理换行
98    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
99        let start = state.get_position();
100
101        if let Some('\n') = state.peek() {
102            state.advance(1);
103            state.add_token(JavaSyntaxKind::Whitespace, start, state.get_position());
104            true
105        }
106        else {
107            false
108        }
109    }
110
111    /// 跳过注释
112    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
113        let start = state.get_position();
114
115        // 单行注释 //
116        if state.peek() == Some('/') && state.peek_next_n(1) == Some('/') {
117            state.advance(2);
118            while let Some(ch) = state.peek() {
119                if ch == '\n' {
120                    break;
121                }
122                state.advance(ch.len_utf8());
123            }
124            state.add_token(JavaSyntaxKind::LineComment, start, state.get_position());
125            return true;
126        }
127
128        // 多行注释 /* */
129        if state.peek() == Some('/') && state.peek_next_n(1) == Some('*') {
130            state.advance(2);
131            while let Some(ch) = state.peek() {
132                if ch == '*' && state.peek_next_n(1) == Some('/') {
133                    state.advance(2);
134                    break;
135                }
136                state.advance(ch.len_utf8());
137            }
138            state.add_token(JavaSyntaxKind::BlockComment, start, state.get_position());
139            return true;
140        }
141
142        false
143    }
144
145    /// 处理字符串字面量
146    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
147        let start = state.get_position();
148
149        if let Some('"') = state.peek() {
150            state.advance(1);
151
152            while let Some(ch) = state.peek() {
153                if ch == '"' {
154                    state.advance(1);
155                    break;
156                }
157                else if ch == '\\' {
158                    state.advance(1);
159                    if let Some(escaped) = state.peek() {
160                        state.advance(escaped.len_utf8());
161                    }
162                }
163                else if ch == '\n' {
164                    // 未闭合的字符�?
165                    break;
166                }
167                else {
168                    state.advance(ch.len_utf8());
169                }
170            }
171
172            state.add_token(JavaSyntaxKind::StringLiteral, start, state.get_position());
173            return true;
174        }
175
176        false
177    }
178
179    /// 处理字符字面�?
180    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
181        let start = state.get_position();
182
183        if let Some('\'') = state.peek() {
184            state.advance(1);
185
186            if let Some(ch) = state.peek() {
187                if ch == '\\' {
188                    state.advance(1);
189                    if let Some(escaped) = state.peek() {
190                        state.advance(escaped.len_utf8());
191                    }
192                }
193                else if ch != '\'' && ch != '\n' {
194                    state.advance(ch.len_utf8());
195                }
196            }
197
198            if let Some('\'') = state.peek() {
199                state.advance(1);
200            }
201
202            state.add_token(JavaSyntaxKind::CharacterLiteral, start, state.get_position());
203            return true;
204        }
205
206        false
207    }
208
209    /// 处理数字字面�?
210    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
211        let start = state.get_position();
212
213        if let Some(ch) = state.peek() {
214            if ch.is_ascii_digit() {
215                // 处理整数部分
216                while let Some(ch) = state.peek() {
217                    if ch.is_ascii_digit() {
218                        state.advance(ch.len_utf8());
219                    }
220                    else {
221                        break;
222                    }
223                }
224
225                // 处理小数部分
226                if state.peek() == Some('.') && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
227                    state.advance(1); // '.'
228                    while let Some(ch) = state.peek() {
229                        if ch.is_ascii_digit() {
230                            state.advance(ch.len_utf8());
231                        }
232                        else {
233                            break;
234                        }
235                    }
236                }
237
238                // 处理指数部分
239                if let Some(ch) = state.peek() {
240                    if ch == 'e' || ch == 'E' {
241                        state.advance(1);
242                        if let Some(sign) = state.peek() {
243                            if sign == '+' || sign == '-' {
244                                state.advance(1);
245                            }
246                        }
247                        while let Some(ch) = state.peek() {
248                            if ch.is_ascii_digit() {
249                                state.advance(ch.len_utf8());
250                            }
251                            else {
252                                break;
253                            }
254                        }
255                    }
256                }
257
258                // 处理后缀
259                if let Some(suffix) = state.peek() {
260                    if suffix == 'f' || suffix == 'F' || suffix == 'd' || suffix == 'D' || suffix == 'l' || suffix == 'L' {
261                        state.advance(1);
262                    }
263                }
264
265                state.add_token(JavaSyntaxKind::IntegerLiteral, start, state.get_position());
266                return true;
267            }
268        }
269
270        false
271    }
272
273    /// 处理标识符或关键�?
274    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
275        let start = state.get_position();
276
277        if let Some(ch) = state.peek() {
278            if ch.is_ascii_alphabetic() || ch == '_' || ch == '$' {
279                state.advance(ch.len_utf8());
280
281                while let Some(ch) = state.peek() {
282                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' {
283                        state.advance(ch.len_utf8());
284                    }
285                    else {
286                        break;
287                    }
288                }
289
290                let text = state.get_text_in((start..state.get_position()).into());
291                let token_kind = self.classify_identifier(text.as_ref());
292
293                state.add_token(token_kind, start, state.get_position());
294                true
295            }
296            else {
297                false
298            }
299        }
300        else {
301            false
302        }
303    }
304
305    /// 分类标识符为关键字或普通标识符
306    fn classify_identifier(&self, text: &str) -> JavaSyntaxKind {
307        let kind = match text {
308            "abstract" => JavaSyntaxKind::Abstract,
309            "assert" => JavaSyntaxKind::Assert,
310            "boolean" => JavaSyntaxKind::Boolean,
311            "break" => JavaSyntaxKind::Break,
312            "byte" => JavaSyntaxKind::Byte,
313            "case" => JavaSyntaxKind::Case,
314            "catch" => JavaSyntaxKind::Catch,
315            "char" => JavaSyntaxKind::Char,
316            "class" => JavaSyntaxKind::Class,
317            "const" => JavaSyntaxKind::Const,
318            "continue" => JavaSyntaxKind::Continue,
319            "default" => JavaSyntaxKind::Default,
320            "do" => JavaSyntaxKind::Do,
321            "double" => JavaSyntaxKind::Double,
322            "else" => JavaSyntaxKind::Else,
323            "enum" => JavaSyntaxKind::Enum,
324            "extends" => JavaSyntaxKind::Extends,
325            "final" => JavaSyntaxKind::Final,
326            "finally" => JavaSyntaxKind::Finally,
327            "float" => JavaSyntaxKind::Float,
328            "for" => JavaSyntaxKind::For,
329            "goto" => JavaSyntaxKind::Goto,
330            "if" => JavaSyntaxKind::If,
331            "implements" => JavaSyntaxKind::Implements,
332            "import" => JavaSyntaxKind::Import,
333            "instanceof" => JavaSyntaxKind::Instanceof,
334            "int" => JavaSyntaxKind::Int,
335            "interface" => JavaSyntaxKind::Interface,
336            "long" => JavaSyntaxKind::Long,
337            "native" => JavaSyntaxKind::Native,
338            "new" => JavaSyntaxKind::New,
339            "package" => JavaSyntaxKind::Package,
340            "private" => JavaSyntaxKind::Private,
341            "protected" => JavaSyntaxKind::Protected,
342            "public" => JavaSyntaxKind::Public,
343            "return" => JavaSyntaxKind::Return,
344            "short" => JavaSyntaxKind::Short,
345            "static" => JavaSyntaxKind::Static,
346            "strictfp" => JavaSyntaxKind::Strictfp,
347            "super" => JavaSyntaxKind::Super,
348            "switch" => JavaSyntaxKind::Switch,
349            "synchronized" => JavaSyntaxKind::Synchronized,
350            "this" => JavaSyntaxKind::This,
351            "throw" => JavaSyntaxKind::Throw,
352            "throws" => JavaSyntaxKind::Throws,
353            "transient" => JavaSyntaxKind::Transient,
354            "try" => JavaSyntaxKind::Try,
355            "void" => JavaSyntaxKind::Void,
356            "volatile" => JavaSyntaxKind::Volatile,
357            "while" => JavaSyntaxKind::While,
358            "true" | "false" => JavaSyntaxKind::BooleanLiteral,
359            "null" => JavaSyntaxKind::NullLiteral,
360            _ => JavaSyntaxKind::Identifier,
361        };
362        eprintln!("DEBUG: Lexer classified '{}' as {:?}", text, kind);
363        kind
364    }
365
366    /// 处理操作符和分隔�?
367    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
368        let start = state.get_position();
369
370        if let Some(ch) = state.peek() {
371            let token_kind = match ch {
372                '+' => {
373                    state.advance(1);
374                    if state.peek() == Some('+') {
375                        state.advance(1);
376                        JavaSyntaxKind::PlusPlus
377                    }
378                    else if state.peek() == Some('=') {
379                        state.advance(1);
380                        JavaSyntaxKind::PlusEquals
381                    }
382                    else {
383                        JavaSyntaxKind::Plus
384                    }
385                }
386                '-' => {
387                    state.advance(1);
388                    if state.peek() == Some('-') {
389                        state.advance(1);
390                        JavaSyntaxKind::MinusMinus
391                    }
392                    else if state.peek() == Some('=') {
393                        state.advance(1);
394                        JavaSyntaxKind::MinusEquals
395                    }
396                    else {
397                        JavaSyntaxKind::Minus
398                    }
399                }
400                '*' => {
401                    state.advance(1);
402                    if state.peek() == Some('=') {
403                        state.advance(1);
404                        JavaSyntaxKind::AsteriskEquals
405                    }
406                    else {
407                        JavaSyntaxKind::Asterisk
408                    }
409                }
410                '/' => {
411                    state.advance(1);
412                    if state.peek() == Some('=') {
413                        state.advance(1);
414                        JavaSyntaxKind::SlashEquals
415                    }
416                    else {
417                        JavaSyntaxKind::Slash
418                    }
419                }
420                '%' => {
421                    state.advance(1);
422                    if state.peek() == Some('=') {
423                        state.advance(1);
424                        JavaSyntaxKind::PercentEquals
425                    }
426                    else {
427                        JavaSyntaxKind::Percent
428                    }
429                }
430                '=' => {
431                    state.advance(1);
432                    if state.peek() == Some('=') {
433                        state.advance(1);
434                        JavaSyntaxKind::Equals
435                    }
436                    else {
437                        JavaSyntaxKind::Assign
438                    }
439                }
440                '!' => {
441                    state.advance(1);
442                    if state.peek() == Some('=') {
443                        state.advance(1);
444                        JavaSyntaxKind::BangEquals
445                    }
446                    else {
447                        JavaSyntaxKind::Bang
448                    }
449                }
450                '<' => {
451                    state.advance(1);
452                    if state.peek() == Some('=') {
453                        state.advance(1);
454                        JavaSyntaxKind::LessThanEquals
455                    }
456                    else if state.peek() == Some('<') {
457                        state.advance(1);
458                        if state.peek() == Some('=') {
459                            state.advance(1);
460                            JavaSyntaxKind::LeftShiftEquals
461                        }
462                        else {
463                            JavaSyntaxKind::LeftShift
464                        }
465                    }
466                    else {
467                        JavaSyntaxKind::LessThan
468                    }
469                }
470                '>' => {
471                    state.advance(1);
472                    if state.peek() == Some('=') {
473                        state.advance(1);
474                        JavaSyntaxKind::GreaterThanEquals
475                    }
476                    else if state.peek() == Some('>') {
477                        state.advance(1);
478                        if state.peek() == Some('>') {
479                            state.advance(1);
480                            if state.peek() == Some('=') {
481                                state.advance(1);
482                                JavaSyntaxKind::UnsignedRightShiftEquals
483                            }
484                            else {
485                                JavaSyntaxKind::UnsignedRightShift
486                            }
487                        }
488                        else if state.peek() == Some('=') {
489                            state.advance(1);
490                            JavaSyntaxKind::RightShiftEquals
491                        }
492                        else {
493                            JavaSyntaxKind::RightShift
494                        }
495                    }
496                    else {
497                        JavaSyntaxKind::GreaterThan
498                    }
499                }
500                '&' => {
501                    state.advance(1);
502                    if state.peek() == Some('&') {
503                        state.advance(1);
504                        JavaSyntaxKind::AmpersandAmpersand
505                    }
506                    else if state.peek() == Some('=') {
507                        state.advance(1);
508                        JavaSyntaxKind::AmpersandEquals
509                    }
510                    else {
511                        JavaSyntaxKind::Ampersand
512                    }
513                }
514                '|' => {
515                    state.advance(1);
516                    if state.peek() == Some('|') {
517                        state.advance(1);
518                        JavaSyntaxKind::PipePipe
519                    }
520                    else if state.peek() == Some('=') {
521                        state.advance(1);
522                        JavaSyntaxKind::PipeEquals
523                    }
524                    else {
525                        JavaSyntaxKind::Pipe
526                    }
527                }
528                '^' => {
529                    state.advance(1);
530                    if state.peek() == Some('=') {
531                        state.advance(1);
532                        JavaSyntaxKind::CaretEquals
533                    }
534                    else {
535                        JavaSyntaxKind::Caret
536                    }
537                }
538                '~' => {
539                    state.advance(1);
540                    JavaSyntaxKind::Tilde
541                }
542                '?' => {
543                    state.advance(1);
544                    JavaSyntaxKind::Question
545                }
546                ':' => {
547                    state.advance(1);
548                    JavaSyntaxKind::Colon
549                }
550                ';' => {
551                    state.advance(1);
552                    JavaSyntaxKind::Semicolon
553                }
554                ',' => {
555                    state.advance(1);
556                    JavaSyntaxKind::Comma
557                }
558                '.' => {
559                    state.advance(1);
560                    if state.peek() == Some('.') && state.peek_next_n(1) == Some('.') {
561                        state.advance(2);
562                        JavaSyntaxKind::Ellipsis
563                    }
564                    else {
565                        JavaSyntaxKind::Dot
566                    }
567                }
568                '(' => {
569                    state.advance(1);
570                    JavaSyntaxKind::LeftParen
571                }
572                ')' => {
573                    state.advance(1);
574                    JavaSyntaxKind::RightParen
575                }
576                '{' => {
577                    state.advance(1);
578                    JavaSyntaxKind::LeftBrace
579                }
580                '}' => {
581                    state.advance(1);
582                    JavaSyntaxKind::RightBrace
583                }
584                '[' => {
585                    state.advance(1);
586                    JavaSyntaxKind::LeftBracket
587                }
588                ']' => {
589                    state.advance(1);
590                    JavaSyntaxKind::RightBracket
591                }
592                '@' => {
593                    state.advance(1);
594                    JavaSyntaxKind::At
595                }
596                _ => return false,
597            };
598
599            state.add_token(token_kind, start, state.get_position());
600            true
601        }
602        else {
603            false
604        }
605    }
606}