oak_java/lexer/
mod.rs

1use crate::{kind::JavaSyntaxKind, language::JavaLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialised to [`JavaLanguage`]; `S` is the source type being scanned.
type State<S> = LexerState<S, JavaLanguage>;

// Lazily initialised lexer configuration values.
// NOTE(review): none of these three statics is referenced by the code visible in this
// file; confirm whether they are used elsewhere or are leftovers.

/// Whitespace configuration with `unicode_whitespace` switched on.
static JAVA_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Line-comment configuration using `//` as the only marker.
static JAVA_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["//"] });
/// String configuration: double-quote delimiter, backslash as the escape character.
static JAVA_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
/// Lexer for Java source text.
///
/// The lexer borrows a [`JavaLanguage`] configuration for the lifetime `'config`.
#[derive(Clone)]
pub struct JavaLexer<'config> {
    /// Borrowed language configuration.
    // NOTE(review): not read by any method visible in this file — confirm it is needed.
    config: &'config JavaLanguage,
}
19
20impl<'config> Lexer<JavaLanguage> for JavaLexer<'config> {
21    fn lex_incremental(
22        &self,
23        source: impl Source,
24        changed: usize,
25        cache: IncrementalCache<JavaLanguage>,
26    ) -> LexOutput<JavaLanguage> {
27        let mut state = LexerState::new_with_cache(source, changed, cache);
28        let result = self.run(&mut state);
29        state.finish(result)
30    }
31}
32
33impl<'config> JavaLexer<'config> {
34    pub fn new(config: &'config JavaLanguage) -> Self {
35        Self { config }
36    }
37
38    /// 主要的词法分析循环
39    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
40        while state.not_at_end() {
41            let safe_point = state.get_position();
42
43            if self.skip_whitespace(state) {
44                continue;
45            }
46
47            if self.lex_newline(state) {
48                continue;
49            }
50
51            if self.skip_comment(state) {
52                continue;
53            }
54
55            if self.lex_string_literal(state) {
56                continue;
57            }
58
59            if self.lex_char_literal(state) {
60                continue;
61            }
62
63            if self.lex_number_literal(state) {
64                continue;
65            }
66
67            if self.lex_identifier_or_keyword(state) {
68                continue;
69            }
70
71            if self.lex_operator_or_delimiter(state) {
72                continue;
73            }
74
75            state.safe_check(safe_point);
76        }
77
78        // 添加 EOF token
79        let eof_pos = state.get_position();
80        state.add_token(JavaSyntaxKind::Eof, eof_pos, eof_pos);
81        Ok(())
82    }
83
84    /// 跳过空白字符(不包括换行符)
85    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
86        let start = state.get_position();
87
88        while let Some(ch) = state.peek() {
89            if ch == ' ' || ch == '\t' || ch == '\r' {
90                state.advance(ch.len_utf8());
91            }
92            else {
93                break;
94            }
95        }
96
97        if state.get_position() > start {
98            state.add_token(JavaSyntaxKind::Whitespace, start, state.get_position());
99            return true;
100        }
101        false
102    }
103
104    /// 处理换行
105    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
106        let start = state.get_position();
107
108        if let Some('\n') = state.peek() {
109            state.advance(1);
110            state.add_token(JavaSyntaxKind::Whitespace, start, state.get_position());
111            true
112        }
113        else {
114            false
115        }
116    }
117
118    /// 跳过注释
119    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
120        let start = state.get_position();
121
122        // 单行注释 //
123        if state.peek() == Some('/') && state.peek_next_n(1) == Some('/') {
124            state.advance(2);
125            while let Some(ch) = state.peek() {
126                if ch == '\n' {
127                    break;
128                }
129                state.advance(ch.len_utf8());
130            }
131            state.add_token(JavaSyntaxKind::LineComment, start, state.get_position());
132            return true;
133        }
134
135        // 多行注释 /* */
136        if state.peek() == Some('/') && state.peek_next_n(1) == Some('*') {
137            state.advance(2);
138            while let Some(ch) = state.peek() {
139                if ch == '*' && state.peek_next_n(1) == Some('/') {
140                    state.advance(2);
141                    break;
142                }
143                state.advance(ch.len_utf8());
144            }
145            state.add_token(JavaSyntaxKind::BlockComment, start, state.get_position());
146            return true;
147        }
148
149        false
150    }
151
152    /// 处理字符串字面量
153    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
154        let start = state.get_position();
155
156        if let Some('"') = state.peek() {
157            state.advance(1);
158
159            while let Some(ch) = state.peek() {
160                if ch == '"' {
161                    state.advance(1);
162                    break;
163                }
164                else if ch == '\\' {
165                    state.advance(1);
166                    if let Some(escaped) = state.peek() {
167                        state.advance(escaped.len_utf8());
168                    }
169                }
170                else if ch == '\n' {
171                    // 未闭合的字符�?
172                    break;
173                }
174                else {
175                    state.advance(ch.len_utf8());
176                }
177            }
178
179            state.add_token(JavaSyntaxKind::StringLiteral, start, state.get_position());
180            return true;
181        }
182
183        false
184    }
185
186    /// 处理字符字面�?
187    fn lex_char_literal<S: Source>(&self, state: &mut State<S>) -> bool {
188        let start = state.get_position();
189
190        if let Some('\'') = state.peek() {
191            state.advance(1);
192
193            if let Some(ch) = state.peek() {
194                if ch == '\\' {
195                    state.advance(1);
196                    if let Some(escaped) = state.peek() {
197                        state.advance(escaped.len_utf8());
198                    }
199                }
200                else if ch != '\'' && ch != '\n' {
201                    state.advance(ch.len_utf8());
202                }
203            }
204
205            if let Some('\'') = state.peek() {
206                state.advance(1);
207            }
208
209            state.add_token(JavaSyntaxKind::CharacterLiteral, start, state.get_position());
210            return true;
211        }
212
213        false
214    }
215
216    /// 处理数字字面�?
217    fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
218        let start = state.get_position();
219
220        if let Some(ch) = state.peek() {
221            if ch.is_ascii_digit() {
222                // 处理整数部分
223                while let Some(ch) = state.peek() {
224                    if ch.is_ascii_digit() {
225                        state.advance(ch.len_utf8());
226                    }
227                    else {
228                        break;
229                    }
230                }
231
232                // 处理小数部分
233                if state.peek() == Some('.') && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
234                    state.advance(1); // '.'
235                    while let Some(ch) = state.peek() {
236                        if ch.is_ascii_digit() {
237                            state.advance(ch.len_utf8());
238                        }
239                        else {
240                            break;
241                        }
242                    }
243                }
244
245                // 处理指数部分
246                if let Some(ch) = state.peek() {
247                    if ch == 'e' || ch == 'E' {
248                        state.advance(1);
249                        if let Some(sign) = state.peek() {
250                            if sign == '+' || sign == '-' {
251                                state.advance(1);
252                            }
253                        }
254                        while let Some(ch) = state.peek() {
255                            if ch.is_ascii_digit() {
256                                state.advance(ch.len_utf8());
257                            }
258                            else {
259                                break;
260                            }
261                        }
262                    }
263                }
264
265                // 处理后缀
266                if let Some(suffix) = state.peek() {
267                    if suffix == 'f' || suffix == 'F' || suffix == 'd' || suffix == 'D' || suffix == 'l' || suffix == 'L' {
268                        state.advance(1);
269                    }
270                }
271
272                state.add_token(JavaSyntaxKind::IntegerLiteral, start, state.get_position());
273                return true;
274            }
275        }
276
277        false
278    }
279
280    /// 处理标识符或关键�?
281    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
282        let start = state.get_position();
283
284        if let Some(ch) = state.peek() {
285            if ch.is_ascii_alphabetic() || ch == '_' || ch == '$' {
286                state.advance(ch.len_utf8());
287
288                while let Some(ch) = state.peek() {
289                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' {
290                        state.advance(ch.len_utf8());
291                    }
292                    else {
293                        break;
294                    }
295                }
296
297                let text = state.get_text_in((start..state.get_position()).into());
298                let token_kind = self.classify_identifier(&text);
299
300                state.add_token(token_kind, start, state.get_position());
301                true
302            }
303            else {
304                false
305            }
306        }
307        else {
308            false
309        }
310    }
311
312    /// 分类标识符为关键字或普通标识符
313    fn classify_identifier(&self, text: &str) -> JavaSyntaxKind {
314        match text {
315            "abstract" => JavaSyntaxKind::Abstract,
316            "assert" => JavaSyntaxKind::Assert,
317            "boolean" => JavaSyntaxKind::Boolean,
318            "break" => JavaSyntaxKind::Break,
319            "byte" => JavaSyntaxKind::Byte,
320            "case" => JavaSyntaxKind::Case,
321            "catch" => JavaSyntaxKind::Catch,
322            "char" => JavaSyntaxKind::Char,
323            "class" => JavaSyntaxKind::Class,
324            "const" => JavaSyntaxKind::Const,
325            "continue" => JavaSyntaxKind::Continue,
326            "default" => JavaSyntaxKind::Default,
327            "do" => JavaSyntaxKind::Do,
328            "double" => JavaSyntaxKind::Double,
329            "else" => JavaSyntaxKind::Else,
330            "enum" => JavaSyntaxKind::Enum,
331            "extends" => JavaSyntaxKind::Extends,
332            "final" => JavaSyntaxKind::Final,
333            "finally" => JavaSyntaxKind::Finally,
334            "float" => JavaSyntaxKind::Float,
335            "for" => JavaSyntaxKind::For,
336            "goto" => JavaSyntaxKind::Goto,
337            "if" => JavaSyntaxKind::If,
338            "implements" => JavaSyntaxKind::Implements,
339            "import" => JavaSyntaxKind::Import,
340            "instanceof" => JavaSyntaxKind::Instanceof,
341            "int" => JavaSyntaxKind::Int,
342            "interface" => JavaSyntaxKind::Interface,
343            "long" => JavaSyntaxKind::Long,
344            "native" => JavaSyntaxKind::Native,
345            "new" => JavaSyntaxKind::New,
346            "package" => JavaSyntaxKind::Package,
347            "private" => JavaSyntaxKind::Private,
348            "protected" => JavaSyntaxKind::Protected,
349            "public" => JavaSyntaxKind::Public,
350            "return" => JavaSyntaxKind::Return,
351            "short" => JavaSyntaxKind::Short,
352            "static" => JavaSyntaxKind::Static,
353            "strictfp" => JavaSyntaxKind::Strictfp,
354            "super" => JavaSyntaxKind::Super,
355            "switch" => JavaSyntaxKind::Switch,
356            "synchronized" => JavaSyntaxKind::Synchronized,
357            "this" => JavaSyntaxKind::This,
358            "throw" => JavaSyntaxKind::Throw,
359            "throws" => JavaSyntaxKind::Throws,
360            "transient" => JavaSyntaxKind::Transient,
361            "try" => JavaSyntaxKind::Try,
362            "void" => JavaSyntaxKind::Void,
363            "volatile" => JavaSyntaxKind::Volatile,
364            "while" => JavaSyntaxKind::While,
365            "true" | "false" => JavaSyntaxKind::BooleanLiteral,
366            "null" => JavaSyntaxKind::NullLiteral,
367            _ => JavaSyntaxKind::Identifier,
368        }
369    }
370
371    /// 处理操作符和分隔�?
372    fn lex_operator_or_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
373        let start = state.get_position();
374
375        if let Some(ch) = state.peek() {
376            let token_kind = match ch {
377                '+' => {
378                    state.advance(1);
379                    if state.peek() == Some('+') {
380                        state.advance(1);
381                        JavaSyntaxKind::PlusPlus
382                    }
383                    else if state.peek() == Some('=') {
384                        state.advance(1);
385                        JavaSyntaxKind::PlusEquals
386                    }
387                    else {
388                        JavaSyntaxKind::Plus
389                    }
390                }
391                '-' => {
392                    state.advance(1);
393                    if state.peek() == Some('-') {
394                        state.advance(1);
395                        JavaSyntaxKind::MinusMinus
396                    }
397                    else if state.peek() == Some('=') {
398                        state.advance(1);
399                        JavaSyntaxKind::MinusEquals
400                    }
401                    else {
402                        JavaSyntaxKind::Minus
403                    }
404                }
405                '*' => {
406                    state.advance(1);
407                    if state.peek() == Some('=') {
408                        state.advance(1);
409                        JavaSyntaxKind::AsteriskEquals
410                    }
411                    else {
412                        JavaSyntaxKind::Asterisk
413                    }
414                }
415                '/' => {
416                    state.advance(1);
417                    if state.peek() == Some('=') {
418                        state.advance(1);
419                        JavaSyntaxKind::SlashEquals
420                    }
421                    else {
422                        JavaSyntaxKind::Slash
423                    }
424                }
425                '%' => {
426                    state.advance(1);
427                    if state.peek() == Some('=') {
428                        state.advance(1);
429                        JavaSyntaxKind::PercentEquals
430                    }
431                    else {
432                        JavaSyntaxKind::Percent
433                    }
434                }
435                '=' => {
436                    state.advance(1);
437                    if state.peek() == Some('=') {
438                        state.advance(1);
439                        JavaSyntaxKind::Equals
440                    }
441                    else {
442                        JavaSyntaxKind::Assign
443                    }
444                }
445                '!' => {
446                    state.advance(1);
447                    if state.peek() == Some('=') {
448                        state.advance(1);
449                        JavaSyntaxKind::BangEquals
450                    }
451                    else {
452                        JavaSyntaxKind::Bang
453                    }
454                }
455                '<' => {
456                    state.advance(1);
457                    if state.peek() == Some('=') {
458                        state.advance(1);
459                        JavaSyntaxKind::LessThanEquals
460                    }
461                    else if state.peek() == Some('<') {
462                        state.advance(1);
463                        if state.peek() == Some('=') {
464                            state.advance(1);
465                            JavaSyntaxKind::LeftShiftEquals
466                        }
467                        else {
468                            JavaSyntaxKind::LeftShift
469                        }
470                    }
471                    else {
472                        JavaSyntaxKind::LessThan
473                    }
474                }
475                '>' => {
476                    state.advance(1);
477                    if state.peek() == Some('=') {
478                        state.advance(1);
479                        JavaSyntaxKind::GreaterThanEquals
480                    }
481                    else if state.peek() == Some('>') {
482                        state.advance(1);
483                        if state.peek() == Some('>') {
484                            state.advance(1);
485                            if state.peek() == Some('=') {
486                                state.advance(1);
487                                JavaSyntaxKind::UnsignedRightShiftEquals
488                            }
489                            else {
490                                JavaSyntaxKind::UnsignedRightShift
491                            }
492                        }
493                        else if state.peek() == Some('=') {
494                            state.advance(1);
495                            JavaSyntaxKind::RightShiftEquals
496                        }
497                        else {
498                            JavaSyntaxKind::RightShift
499                        }
500                    }
501                    else {
502                        JavaSyntaxKind::GreaterThan
503                    }
504                }
505                '&' => {
506                    state.advance(1);
507                    if state.peek() == Some('&') {
508                        state.advance(1);
509                        JavaSyntaxKind::AmpersandAmpersand
510                    }
511                    else if state.peek() == Some('=') {
512                        state.advance(1);
513                        JavaSyntaxKind::AmpersandEquals
514                    }
515                    else {
516                        JavaSyntaxKind::Ampersand
517                    }
518                }
519                '|' => {
520                    state.advance(1);
521                    if state.peek() == Some('|') {
522                        state.advance(1);
523                        JavaSyntaxKind::PipePipe
524                    }
525                    else if state.peek() == Some('=') {
526                        state.advance(1);
527                        JavaSyntaxKind::PipeEquals
528                    }
529                    else {
530                        JavaSyntaxKind::Pipe
531                    }
532                }
533                '^' => {
534                    state.advance(1);
535                    if state.peek() == Some('=') {
536                        state.advance(1);
537                        JavaSyntaxKind::CaretEquals
538                    }
539                    else {
540                        JavaSyntaxKind::Caret
541                    }
542                }
543                '~' => {
544                    state.advance(1);
545                    JavaSyntaxKind::Tilde
546                }
547                '?' => {
548                    state.advance(1);
549                    JavaSyntaxKind::Question
550                }
551                ':' => {
552                    state.advance(1);
553                    JavaSyntaxKind::Colon
554                }
555                ';' => {
556                    state.advance(1);
557                    JavaSyntaxKind::Semicolon
558                }
559                ',' => {
560                    state.advance(1);
561                    JavaSyntaxKind::Comma
562                }
563                '.' => {
564                    state.advance(1);
565                    if state.peek() == Some('.') && state.peek_next_n(1) == Some('.') {
566                        state.advance(2);
567                        JavaSyntaxKind::Ellipsis
568                    }
569                    else {
570                        JavaSyntaxKind::Dot
571                    }
572                }
573                '(' => {
574                    state.advance(1);
575                    JavaSyntaxKind::LeftParen
576                }
577                ')' => {
578                    state.advance(1);
579                    JavaSyntaxKind::RightParen
580                }
581                '{' => {
582                    state.advance(1);
583                    JavaSyntaxKind::LeftBrace
584                }
585                '}' => {
586                    state.advance(1);
587                    JavaSyntaxKind::RightBrace
588                }
589                '[' => {
590                    state.advance(1);
591                    JavaSyntaxKind::LeftBracket
592                }
593                ']' => {
594                    state.advance(1);
595                    JavaSyntaxKind::RightBracket
596                }
597                '@' => {
598                    state.advance(1);
599                    JavaSyntaxKind::At
600                }
601                _ => return false,
602            };
603
604            state.add_token(token_kind, start, state.get_position());
605            true
606        }
607        else {
608            false
609        }
610    }
611}