oak_java/lexer/
mod.rs

1use crate::{kind::JavaSyntaxKind, language::JavaLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
3
/// Shorthand for the core lexer state specialised to [`JavaLanguage`].
type State<'a, S> = LexerState<'a, S, JavaLanguage>;
5
/// Lexer that turns Java source text into tokens.
///
/// It only carries a borrowed [`JavaLanguage`] configuration, so cloning it is cheap.
#[derive(Clone)]
pub struct JavaLexer<'config> {
    /// Language configuration supplied at construction. The leading underscore reflects that
    /// none of the lexing routines in this module read it.
    _config: &'config JavaLanguage,
}
10
11impl<'config> Lexer<JavaLanguage> for JavaLexer<'config> {
12    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<JavaLanguage>) -> LexOutput<JavaLanguage> {
13        let mut state = State::new(source);
14        let result = self.run(&mut state);
15        if result.is_ok() {
16            state.add_eof();
17        }
18        state.finish_with_cache(result, cache)
19    }
20}
21
22impl<'config> JavaLexer<'config> {
23    pub fn new(config: &'config JavaLanguage) -> Self {
24        Self { _config: config }
25    }
26
27    /// 主要的词法分析循环
28    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
29        while state.not_at_end() {
30            let safe_point = state.get_position();
31
32            if self.skip_whitespace(state) {
33                continue;
34            }
35
36            if self.lex_newline(state) {
37                continue;
38            }
39
40            if self.skip_comment(state) {
41                continue;
42            }
43
44            if self.lex_string_literal(state) {
45                continue;
46            }
47
48            if self.lex_char_literal(state) {
49                continue;
50            }
51
52            if self.lex_number_literal(state) {
53                continue;
54            }
55
56            if self.lex_identifier_or_keyword(state) {
57                continue;
58            }
59
60            if self.lex_operator_or_delimiter(state) {
61                continue;
62            }
63
64            // 如果没有匹配到任何规则,前进一个字符并标记为错误
65            let start_pos = state.get_position();
66            if let Some(ch) = state.peek() {
67                state.advance(ch.len_utf8());
68                state.add_token(JavaSyntaxKind::Error, start_pos, state.get_position());
69            }
70
71            state.advance_if_dead_lock(safe_point);
72        }
73
74        Ok(())
75    }
76
77    /// 跳过空白字符(不包括换行符)
78    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
79        let start = state.get_position();
80
81        while let Some(ch) = state.peek() {
82            if ch == ' ' || ch == '\t' || ch == '\r' {
83                state.advance(ch.len_utf8());
84            }
85            else {
86                break;
87            }
88        }
89
90        if state.get_position() > start {
91            state.add_token(JavaSyntaxKind::Whitespace, start, state.get_position());
92            return true;
93        }
94        false
95    }
96
97    /// 处理换行
98    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
99        let start = state.get_position();
100
101        if let Some('\n') = state.peek() {
102            state.advance(1);
103            state.add_token(JavaSyntaxKind::Whitespace, start, state.get_position());
104            true
105        }
106        else {
107            false
108        }
109    }
110
111    /// 跳过注释
112    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
113        let start = state.get_position();
114
115        // 单行注释 //
116        if state.peek() == Some('/') && state.peek_next_n(1) == Some('/') {
117            state.advance(2);
118            while let Some(ch) = state.peek() {
119                if ch == '\n' {
120                    break;
121                }
122                state.advance(ch.len_utf8());
123            }
124            state.add_token(JavaSyntaxKind::LineComment, start, state.get_position());
125            return true;
126        }
127
128        // 多行注释 /* */
129        if state.peek() == Some('/') && state.peek_next_n(1) == Some('*') {
130            state.advance(2);
131            while let Some(ch) = state.peek() {
132                if ch == '*' && state.peek_next_n(1) == Some('/') {
133                    state.advance(2);
134                    break;
135                }
136                state.advance(ch.len_utf8());
137            }
138            state.add_token(JavaSyntaxKind::BlockComment, start, state.get_position());
139            return true;
140        }
141
142        false
143    }
144
145    /// 处理字符串字面量
146    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
147        let start = state.get_position();
148
149        if let Some('"') = state.peek() {
150            state.advance(1);
151
152            while let Some(ch) = state.peek() {
153                if ch == '"' {
154                    state.advance(1);
155                    break;
156                }
157                else if ch == '\\' {
158                    state.advance(1);
159                    if let Some(escaped) = state.peek() {
160                        state.advance(escaped.len_utf8());
161                    }
162                }
163                else if ch == '\n' {
164                    // 未闭合的字符�?
165                    break;
166                }
167                else {
168                    state.advance(ch.len_utf8());
169                }
170            }
171
172            state.add_token(JavaSyntaxKind::StringLiteral, start, state.get_position());
173            return true;
174        }
175
176        false
177    }
178
179    /// 处理字符字面�?
180    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
181        let start = state.get_position();
182
183        if let Some('\'') = state.peek() {
184            state.advance(1);
185
186            if let Some(ch) = state.peek() {
187                if ch == '\\' {
188                    state.advance(1);
189                    if let Some(escaped) = state.peek() {
190                        state.advance(escaped.len_utf8());
191                    }
192                }
193                else if ch != '\'' && ch != '\n' {
194                    state.advance(ch.len_utf8());
195                }
196            }
197
198            if let Some('\'') = state.peek() {
199                state.advance(1);
200            }
201
202            state.add_token(JavaSyntaxKind::CharacterLiteral, start, state.get_position());
203            return true;
204        }
205
206        false
207    }
208
209    /// 处理数字字面�?
210    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
211        let start = state.get_position();
212
213        if let Some(ch) = state.peek() {
214            if ch.is_ascii_digit() {
215                // 处理整数部分
216                while let Some(ch) = state.peek() {
217                    if ch.is_ascii_digit() {
218                        state.advance(ch.len_utf8());
219                    }
220                    else {
221                        break;
222                    }
223                }
224
225                // 处理小数部分
226                if state.peek() == Some('.') && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
227                    state.advance(1); // '.'
228                    while let Some(ch) = state.peek() {
229                        if ch.is_ascii_digit() {
230                            state.advance(ch.len_utf8());
231                        }
232                        else {
233                            break;
234                        }
235                    }
236                }
237
238                // 处理指数部分
239                if let Some(ch) = state.peek() {
240                    if ch == 'e' || ch == 'E' {
241                        state.advance(1);
242                        if let Some(sign) = state.peek() {
243                            if sign == '+' || sign == '-' {
244                                state.advance(1);
245                            }
246                        }
247                        while let Some(ch) = state.peek() {
248                            if ch.is_ascii_digit() {
249                                state.advance(ch.len_utf8());
250                            }
251                            else {
252                                break;
253                            }
254                        }
255                    }
256                }
257
258                // 处理后缀
259                if let Some(suffix) = state.peek() {
260                    if suffix == 'f' || suffix == 'F' || suffix == 'd' || suffix == 'D' || suffix == 'l' || suffix == 'L' {
261                        state.advance(1);
262                    }
263                }
264
265                state.add_token(JavaSyntaxKind::IntegerLiteral, start, state.get_position());
266                return true;
267            }
268        }
269
270        false
271    }
272
273    /// 处理标识符或关键�?
274    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
275        let start = state.get_position();
276
277        if let Some(ch) = state.peek() {
278            if ch.is_ascii_alphabetic() || ch == '_' || ch == '$' {
279                state.advance(ch.len_utf8());
280
281                while let Some(ch) = state.peek() {
282                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' {
283                        state.advance(ch.len_utf8());
284                    }
285                    else {
286                        break;
287                    }
288                }
289
290                let text = state.get_text_in((start..state.get_position()).into());
291                let token_kind = self.classify_identifier(text.as_ref());
292
293                state.add_token(token_kind, start, state.get_position());
294                true
295            }
296            else {
297                false
298            }
299        }
300        else {
301            false
302        }
303    }
304
305    /// 分类标识符为关键字或普通标识符
306    fn classify_identifier(&self, text: &str) -> JavaSyntaxKind {
307        match text {
308            "abstract" => JavaSyntaxKind::Abstract,
309            "assert" => JavaSyntaxKind::Assert,
310            "boolean" => JavaSyntaxKind::Boolean,
311            "break" => JavaSyntaxKind::Break,
312            "byte" => JavaSyntaxKind::Byte,
313            "case" => JavaSyntaxKind::Case,
314            "catch" => JavaSyntaxKind::Catch,
315            "char" => JavaSyntaxKind::Char,
316            "class" => JavaSyntaxKind::Class,
317            "const" => JavaSyntaxKind::Const,
318            "continue" => JavaSyntaxKind::Continue,
319            "default" => JavaSyntaxKind::Default,
320            "do" => JavaSyntaxKind::Do,
321            "double" => JavaSyntaxKind::Double,
322            "else" => JavaSyntaxKind::Else,
323            "enum" => JavaSyntaxKind::Enum,
324            "extends" => JavaSyntaxKind::Extends,
325            "final" => JavaSyntaxKind::Final,
326            "finally" => JavaSyntaxKind::Finally,
327            "float" => JavaSyntaxKind::Float,
328            "for" => JavaSyntaxKind::For,
329            "goto" => JavaSyntaxKind::Goto,
330            "if" => JavaSyntaxKind::If,
331            "implements" => JavaSyntaxKind::Implements,
332            "import" => JavaSyntaxKind::Import,
333            "instanceof" => JavaSyntaxKind::Instanceof,
334            "int" => JavaSyntaxKind::Int,
335            "interface" => JavaSyntaxKind::Interface,
336            "long" => JavaSyntaxKind::Long,
337            "native" => JavaSyntaxKind::Native,
338            "new" => JavaSyntaxKind::New,
339            "package" => JavaSyntaxKind::Package,
340            "private" => JavaSyntaxKind::Private,
341            "protected" => JavaSyntaxKind::Protected,
342            "public" => JavaSyntaxKind::Public,
343            "return" => JavaSyntaxKind::Return,
344            "short" => JavaSyntaxKind::Short,
345            "static" => JavaSyntaxKind::Static,
346            "strictfp" => JavaSyntaxKind::Strictfp,
347            "super" => JavaSyntaxKind::Super,
348            "switch" => JavaSyntaxKind::Switch,
349            "synchronized" => JavaSyntaxKind::Synchronized,
350            "this" => JavaSyntaxKind::This,
351            "throw" => JavaSyntaxKind::Throw,
352            "throws" => JavaSyntaxKind::Throws,
353            "transient" => JavaSyntaxKind::Transient,
354            "try" => JavaSyntaxKind::Try,
355            "void" => JavaSyntaxKind::Void,
356            "volatile" => JavaSyntaxKind::Volatile,
357            "while" => JavaSyntaxKind::While,
358            "true" | "false" => JavaSyntaxKind::BooleanLiteral,
359            "null" => JavaSyntaxKind::NullLiteral,
360            _ => JavaSyntaxKind::Identifier,
361        }
362    }
363
364    /// 处理操作符和分隔�?
365    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
366        let start = state.get_position();
367
368        if let Some(ch) = state.peek() {
369            let token_kind = match ch {
370                '+' => {
371                    state.advance(1);
372                    if state.peek() == Some('+') {
373                        state.advance(1);
374                        JavaSyntaxKind::PlusPlus
375                    }
376                    else if state.peek() == Some('=') {
377                        state.advance(1);
378                        JavaSyntaxKind::PlusEquals
379                    }
380                    else {
381                        JavaSyntaxKind::Plus
382                    }
383                }
384                '-' => {
385                    state.advance(1);
386                    if state.peek() == Some('-') {
387                        state.advance(1);
388                        JavaSyntaxKind::MinusMinus
389                    }
390                    else if state.peek() == Some('=') {
391                        state.advance(1);
392                        JavaSyntaxKind::MinusEquals
393                    }
394                    else {
395                        JavaSyntaxKind::Minus
396                    }
397                }
398                '*' => {
399                    state.advance(1);
400                    if state.peek() == Some('=') {
401                        state.advance(1);
402                        JavaSyntaxKind::AsteriskEquals
403                    }
404                    else {
405                        JavaSyntaxKind::Asterisk
406                    }
407                }
408                '/' => {
409                    state.advance(1);
410                    if state.peek() == Some('=') {
411                        state.advance(1);
412                        JavaSyntaxKind::SlashEquals
413                    }
414                    else {
415                        JavaSyntaxKind::Slash
416                    }
417                }
418                '%' => {
419                    state.advance(1);
420                    if state.peek() == Some('=') {
421                        state.advance(1);
422                        JavaSyntaxKind::PercentEquals
423                    }
424                    else {
425                        JavaSyntaxKind::Percent
426                    }
427                }
428                '=' => {
429                    state.advance(1);
430                    if state.peek() == Some('=') {
431                        state.advance(1);
432                        JavaSyntaxKind::Equals
433                    }
434                    else {
435                        JavaSyntaxKind::Assign
436                    }
437                }
438                '!' => {
439                    state.advance(1);
440                    if state.peek() == Some('=') {
441                        state.advance(1);
442                        JavaSyntaxKind::BangEquals
443                    }
444                    else {
445                        JavaSyntaxKind::Bang
446                    }
447                }
448                '<' => {
449                    state.advance(1);
450                    if state.peek() == Some('=') {
451                        state.advance(1);
452                        JavaSyntaxKind::LessThanEquals
453                    }
454                    else if state.peek() == Some('<') {
455                        state.advance(1);
456                        if state.peek() == Some('=') {
457                            state.advance(1);
458                            JavaSyntaxKind::LeftShiftEquals
459                        }
460                        else {
461                            JavaSyntaxKind::LeftShift
462                        }
463                    }
464                    else {
465                        JavaSyntaxKind::LessThan
466                    }
467                }
468                '>' => {
469                    state.advance(1);
470                    if state.peek() == Some('=') {
471                        state.advance(1);
472                        JavaSyntaxKind::GreaterThanEquals
473                    }
474                    else if state.peek() == Some('>') {
475                        state.advance(1);
476                        if state.peek() == Some('>') {
477                            state.advance(1);
478                            if state.peek() == Some('=') {
479                                state.advance(1);
480                                JavaSyntaxKind::UnsignedRightShiftEquals
481                            }
482                            else {
483                                JavaSyntaxKind::UnsignedRightShift
484                            }
485                        }
486                        else if state.peek() == Some('=') {
487                            state.advance(1);
488                            JavaSyntaxKind::RightShiftEquals
489                        }
490                        else {
491                            JavaSyntaxKind::RightShift
492                        }
493                    }
494                    else {
495                        JavaSyntaxKind::GreaterThan
496                    }
497                }
498                '&' => {
499                    state.advance(1);
500                    if state.peek() == Some('&') {
501                        state.advance(1);
502                        JavaSyntaxKind::AmpersandAmpersand
503                    }
504                    else if state.peek() == Some('=') {
505                        state.advance(1);
506                        JavaSyntaxKind::AmpersandEquals
507                    }
508                    else {
509                        JavaSyntaxKind::Ampersand
510                    }
511                }
512                '|' => {
513                    state.advance(1);
514                    if state.peek() == Some('|') {
515                        state.advance(1);
516                        JavaSyntaxKind::PipePipe
517                    }
518                    else if state.peek() == Some('=') {
519                        state.advance(1);
520                        JavaSyntaxKind::PipeEquals
521                    }
522                    else {
523                        JavaSyntaxKind::Pipe
524                    }
525                }
526                '^' => {
527                    state.advance(1);
528                    if state.peek() == Some('=') {
529                        state.advance(1);
530                        JavaSyntaxKind::CaretEquals
531                    }
532                    else {
533                        JavaSyntaxKind::Caret
534                    }
535                }
536                '~' => {
537                    state.advance(1);
538                    JavaSyntaxKind::Tilde
539                }
540                '?' => {
541                    state.advance(1);
542                    JavaSyntaxKind::Question
543                }
544                ':' => {
545                    state.advance(1);
546                    JavaSyntaxKind::Colon
547                }
548                ';' => {
549                    state.advance(1);
550                    JavaSyntaxKind::Semicolon
551                }
552                ',' => {
553                    state.advance(1);
554                    JavaSyntaxKind::Comma
555                }
556                '.' => {
557                    state.advance(1);
558                    if state.peek() == Some('.') && state.peek_next_n(1) == Some('.') {
559                        state.advance(2);
560                        JavaSyntaxKind::Ellipsis
561                    }
562                    else {
563                        JavaSyntaxKind::Dot
564                    }
565                }
566                '(' => {
567                    state.advance(1);
568                    JavaSyntaxKind::LeftParen
569                }
570                ')' => {
571                    state.advance(1);
572                    JavaSyntaxKind::RightParen
573                }
574                '{' => {
575                    state.advance(1);
576                    JavaSyntaxKind::LeftBrace
577                }
578                '}' => {
579                    state.advance(1);
580                    JavaSyntaxKind::RightBrace
581                }
582                '[' => {
583                    state.advance(1);
584                    JavaSyntaxKind::LeftBracket
585                }
586                ']' => {
587                    state.advance(1);
588                    JavaSyntaxKind::RightBracket
589                }
590                '@' => {
591                    state.advance(1);
592                    JavaSyntaxKind::At
593                }
594                _ => return false,
595            };
596
597            state.add_token(token_kind, start, state.get_position());
598            true
599        }
600        else {
601            false
602        }
603    }
604}