1use crate::{kind::JavaSyntaxKind, language::JavaLanguage};
2use oak_core::{
3 IncrementalCache, Lexer, LexerState, OakError,
4 lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
/// Shorthand for the shared lexer state specialised to [`JavaLanguage`].
type State<S> = LexerState<S, JavaLanguage>;

/// Whitespace configuration with Unicode whitespace recognition switched on.
///
/// NOTE(review): not referenced by any code visible in this file; the scanner
/// methods match whitespace by hand. Confirm before relying on or removing it.
static JAVA_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Line-comment configuration using `//` as the only marker.
///
/// NOTE(review): not referenced by any code visible in this file.
static JAVA_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["//"] });
/// String configuration: double-quote delimiter with backslash as the escape character.
///
/// NOTE(review): not referenced by any code visible in this file.
static JAVA_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
/// Lexer for Java source text that borrows its language configuration.
#[derive(Clone)]
pub struct JavaLexer<'config> {
    /// Borrowed language settings supplied to `JavaLexer::new`.
    ///
    /// NOTE(review): stored but never read by the code visible in this file.
    config: &'config JavaLanguage,
}
19
20impl<'config> Lexer<JavaLanguage> for JavaLexer<'config> {
21 fn lex_incremental(
22 &self,
23 source: impl Source,
24 changed: usize,
25 cache: IncrementalCache<JavaLanguage>,
26 ) -> LexOutput<JavaLanguage> {
27 let mut state = LexerState::new_with_cache(source, changed, cache);
28 let result = self.run(&mut state);
29 state.finish(result)
30 }
31}
32
33impl<'config> JavaLexer<'config> {
34 pub fn new(config: &'config JavaLanguage) -> Self {
35 Self { config }
36 }
37
38 fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
40 while state.not_at_end() {
41 let safe_point = state.get_position();
42
43 if self.skip_whitespace(state) {
44 continue;
45 }
46
47 if self.lex_newline(state) {
48 continue;
49 }
50
51 if self.skip_comment(state) {
52 continue;
53 }
54
55 if self.lex_string_literal(state) {
56 continue;
57 }
58
59 if self.lex_char_literal(state) {
60 continue;
61 }
62
63 if self.lex_number_literal(state) {
64 continue;
65 }
66
67 if self.lex_identifier_or_keyword(state) {
68 continue;
69 }
70
71 if self.lex_operator_or_delimiter(state) {
72 continue;
73 }
74
75 state.safe_check(safe_point);
76 }
77
78 let eof_pos = state.get_position();
80 state.add_token(JavaSyntaxKind::Eof, eof_pos, eof_pos);
81 Ok(())
82 }
83
84 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
86 let start = state.get_position();
87
88 while let Some(ch) = state.peek() {
89 if ch == ' ' || ch == '\t' || ch == '\r' {
90 state.advance(ch.len_utf8());
91 }
92 else {
93 break;
94 }
95 }
96
97 if state.get_position() > start {
98 state.add_token(JavaSyntaxKind::Whitespace, start, state.get_position());
99 return true;
100 }
101 false
102 }
103
104 fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
106 let start = state.get_position();
107
108 if let Some('\n') = state.peek() {
109 state.advance(1);
110 state.add_token(JavaSyntaxKind::Whitespace, start, state.get_position());
111 true
112 }
113 else {
114 false
115 }
116 }
117
118 fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
120 let start = state.get_position();
121
122 if state.peek() == Some('/') && state.peek_next_n(1) == Some('/') {
124 state.advance(2);
125 while let Some(ch) = state.peek() {
126 if ch == '\n' {
127 break;
128 }
129 state.advance(ch.len_utf8());
130 }
131 state.add_token(JavaSyntaxKind::LineComment, start, state.get_position());
132 return true;
133 }
134
135 if state.peek() == Some('/') && state.peek_next_n(1) == Some('*') {
137 state.advance(2);
138 while let Some(ch) = state.peek() {
139 if ch == '*' && state.peek_next_n(1) == Some('/') {
140 state.advance(2);
141 break;
142 }
143 state.advance(ch.len_utf8());
144 }
145 state.add_token(JavaSyntaxKind::BlockComment, start, state.get_position());
146 return true;
147 }
148
149 false
150 }
151
152 fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
154 let start = state.get_position();
155
156 if let Some('"') = state.peek() {
157 state.advance(1);
158
159 while let Some(ch) = state.peek() {
160 if ch == '"' {
161 state.advance(1);
162 break;
163 }
164 else if ch == '\\' {
165 state.advance(1);
166 if let Some(escaped) = state.peek() {
167 state.advance(escaped.len_utf8());
168 }
169 }
170 else if ch == '\n' {
171 break;
173 }
174 else {
175 state.advance(ch.len_utf8());
176 }
177 }
178
179 state.add_token(JavaSyntaxKind::StringLiteral, start, state.get_position());
180 return true;
181 }
182
183 false
184 }
185
186 fn lex_char_literal<S: Source>(&self, state: &mut State<S>) -> bool {
188 let start = state.get_position();
189
190 if let Some('\'') = state.peek() {
191 state.advance(1);
192
193 if let Some(ch) = state.peek() {
194 if ch == '\\' {
195 state.advance(1);
196 if let Some(escaped) = state.peek() {
197 state.advance(escaped.len_utf8());
198 }
199 }
200 else if ch != '\'' && ch != '\n' {
201 state.advance(ch.len_utf8());
202 }
203 }
204
205 if let Some('\'') = state.peek() {
206 state.advance(1);
207 }
208
209 state.add_token(JavaSyntaxKind::CharacterLiteral, start, state.get_position());
210 return true;
211 }
212
213 false
214 }
215
216 fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
218 let start = state.get_position();
219
220 if let Some(ch) = state.peek() {
221 if ch.is_ascii_digit() {
222 while let Some(ch) = state.peek() {
224 if ch.is_ascii_digit() {
225 state.advance(ch.len_utf8());
226 }
227 else {
228 break;
229 }
230 }
231
232 if state.peek() == Some('.') && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
234 state.advance(1); while let Some(ch) = state.peek() {
236 if ch.is_ascii_digit() {
237 state.advance(ch.len_utf8());
238 }
239 else {
240 break;
241 }
242 }
243 }
244
245 if let Some(ch) = state.peek() {
247 if ch == 'e' || ch == 'E' {
248 state.advance(1);
249 if let Some(sign) = state.peek() {
250 if sign == '+' || sign == '-' {
251 state.advance(1);
252 }
253 }
254 while let Some(ch) = state.peek() {
255 if ch.is_ascii_digit() {
256 state.advance(ch.len_utf8());
257 }
258 else {
259 break;
260 }
261 }
262 }
263 }
264
265 if let Some(suffix) = state.peek() {
267 if suffix == 'f' || suffix == 'F' || suffix == 'd' || suffix == 'D' || suffix == 'l' || suffix == 'L' {
268 state.advance(1);
269 }
270 }
271
272 state.add_token(JavaSyntaxKind::IntegerLiteral, start, state.get_position());
273 return true;
274 }
275 }
276
277 false
278 }
279
280 fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
282 let start = state.get_position();
283
284 if let Some(ch) = state.peek() {
285 if ch.is_ascii_alphabetic() || ch == '_' || ch == '$' {
286 state.advance(ch.len_utf8());
287
288 while let Some(ch) = state.peek() {
289 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' {
290 state.advance(ch.len_utf8());
291 }
292 else {
293 break;
294 }
295 }
296
297 let text = state.get_text_in((start..state.get_position()).into());
298 let token_kind = self.classify_identifier(&text);
299
300 state.add_token(token_kind, start, state.get_position());
301 true
302 }
303 else {
304 false
305 }
306 }
307 else {
308 false
309 }
310 }
311
312 fn classify_identifier(&self, text: &str) -> JavaSyntaxKind {
314 match text {
315 "abstract" => JavaSyntaxKind::Abstract,
316 "assert" => JavaSyntaxKind::Assert,
317 "boolean" => JavaSyntaxKind::Boolean,
318 "break" => JavaSyntaxKind::Break,
319 "byte" => JavaSyntaxKind::Byte,
320 "case" => JavaSyntaxKind::Case,
321 "catch" => JavaSyntaxKind::Catch,
322 "char" => JavaSyntaxKind::Char,
323 "class" => JavaSyntaxKind::Class,
324 "const" => JavaSyntaxKind::Const,
325 "continue" => JavaSyntaxKind::Continue,
326 "default" => JavaSyntaxKind::Default,
327 "do" => JavaSyntaxKind::Do,
328 "double" => JavaSyntaxKind::Double,
329 "else" => JavaSyntaxKind::Else,
330 "enum" => JavaSyntaxKind::Enum,
331 "extends" => JavaSyntaxKind::Extends,
332 "final" => JavaSyntaxKind::Final,
333 "finally" => JavaSyntaxKind::Finally,
334 "float" => JavaSyntaxKind::Float,
335 "for" => JavaSyntaxKind::For,
336 "goto" => JavaSyntaxKind::Goto,
337 "if" => JavaSyntaxKind::If,
338 "implements" => JavaSyntaxKind::Implements,
339 "import" => JavaSyntaxKind::Import,
340 "instanceof" => JavaSyntaxKind::Instanceof,
341 "int" => JavaSyntaxKind::Int,
342 "interface" => JavaSyntaxKind::Interface,
343 "long" => JavaSyntaxKind::Long,
344 "native" => JavaSyntaxKind::Native,
345 "new" => JavaSyntaxKind::New,
346 "package" => JavaSyntaxKind::Package,
347 "private" => JavaSyntaxKind::Private,
348 "protected" => JavaSyntaxKind::Protected,
349 "public" => JavaSyntaxKind::Public,
350 "return" => JavaSyntaxKind::Return,
351 "short" => JavaSyntaxKind::Short,
352 "static" => JavaSyntaxKind::Static,
353 "strictfp" => JavaSyntaxKind::Strictfp,
354 "super" => JavaSyntaxKind::Super,
355 "switch" => JavaSyntaxKind::Switch,
356 "synchronized" => JavaSyntaxKind::Synchronized,
357 "this" => JavaSyntaxKind::This,
358 "throw" => JavaSyntaxKind::Throw,
359 "throws" => JavaSyntaxKind::Throws,
360 "transient" => JavaSyntaxKind::Transient,
361 "try" => JavaSyntaxKind::Try,
362 "void" => JavaSyntaxKind::Void,
363 "volatile" => JavaSyntaxKind::Volatile,
364 "while" => JavaSyntaxKind::While,
365 "true" | "false" => JavaSyntaxKind::BooleanLiteral,
366 "null" => JavaSyntaxKind::NullLiteral,
367 _ => JavaSyntaxKind::Identifier,
368 }
369 }
370
371 fn lex_operator_or_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
373 let start = state.get_position();
374
375 if let Some(ch) = state.peek() {
376 let token_kind = match ch {
377 '+' => {
378 state.advance(1);
379 if state.peek() == Some('+') {
380 state.advance(1);
381 JavaSyntaxKind::PlusPlus
382 }
383 else if state.peek() == Some('=') {
384 state.advance(1);
385 JavaSyntaxKind::PlusEquals
386 }
387 else {
388 JavaSyntaxKind::Plus
389 }
390 }
391 '-' => {
392 state.advance(1);
393 if state.peek() == Some('-') {
394 state.advance(1);
395 JavaSyntaxKind::MinusMinus
396 }
397 else if state.peek() == Some('=') {
398 state.advance(1);
399 JavaSyntaxKind::MinusEquals
400 }
401 else {
402 JavaSyntaxKind::Minus
403 }
404 }
405 '*' => {
406 state.advance(1);
407 if state.peek() == Some('=') {
408 state.advance(1);
409 JavaSyntaxKind::AsteriskEquals
410 }
411 else {
412 JavaSyntaxKind::Asterisk
413 }
414 }
415 '/' => {
416 state.advance(1);
417 if state.peek() == Some('=') {
418 state.advance(1);
419 JavaSyntaxKind::SlashEquals
420 }
421 else {
422 JavaSyntaxKind::Slash
423 }
424 }
425 '%' => {
426 state.advance(1);
427 if state.peek() == Some('=') {
428 state.advance(1);
429 JavaSyntaxKind::PercentEquals
430 }
431 else {
432 JavaSyntaxKind::Percent
433 }
434 }
435 '=' => {
436 state.advance(1);
437 if state.peek() == Some('=') {
438 state.advance(1);
439 JavaSyntaxKind::Equals
440 }
441 else {
442 JavaSyntaxKind::Assign
443 }
444 }
445 '!' => {
446 state.advance(1);
447 if state.peek() == Some('=') {
448 state.advance(1);
449 JavaSyntaxKind::BangEquals
450 }
451 else {
452 JavaSyntaxKind::Bang
453 }
454 }
455 '<' => {
456 state.advance(1);
457 if state.peek() == Some('=') {
458 state.advance(1);
459 JavaSyntaxKind::LessThanEquals
460 }
461 else if state.peek() == Some('<') {
462 state.advance(1);
463 if state.peek() == Some('=') {
464 state.advance(1);
465 JavaSyntaxKind::LeftShiftEquals
466 }
467 else {
468 JavaSyntaxKind::LeftShift
469 }
470 }
471 else {
472 JavaSyntaxKind::LessThan
473 }
474 }
475 '>' => {
476 state.advance(1);
477 if state.peek() == Some('=') {
478 state.advance(1);
479 JavaSyntaxKind::GreaterThanEquals
480 }
481 else if state.peek() == Some('>') {
482 state.advance(1);
483 if state.peek() == Some('>') {
484 state.advance(1);
485 if state.peek() == Some('=') {
486 state.advance(1);
487 JavaSyntaxKind::UnsignedRightShiftEquals
488 }
489 else {
490 JavaSyntaxKind::UnsignedRightShift
491 }
492 }
493 else if state.peek() == Some('=') {
494 state.advance(1);
495 JavaSyntaxKind::RightShiftEquals
496 }
497 else {
498 JavaSyntaxKind::RightShift
499 }
500 }
501 else {
502 JavaSyntaxKind::GreaterThan
503 }
504 }
505 '&' => {
506 state.advance(1);
507 if state.peek() == Some('&') {
508 state.advance(1);
509 JavaSyntaxKind::AmpersandAmpersand
510 }
511 else if state.peek() == Some('=') {
512 state.advance(1);
513 JavaSyntaxKind::AmpersandEquals
514 }
515 else {
516 JavaSyntaxKind::Ampersand
517 }
518 }
519 '|' => {
520 state.advance(1);
521 if state.peek() == Some('|') {
522 state.advance(1);
523 JavaSyntaxKind::PipePipe
524 }
525 else if state.peek() == Some('=') {
526 state.advance(1);
527 JavaSyntaxKind::PipeEquals
528 }
529 else {
530 JavaSyntaxKind::Pipe
531 }
532 }
533 '^' => {
534 state.advance(1);
535 if state.peek() == Some('=') {
536 state.advance(1);
537 JavaSyntaxKind::CaretEquals
538 }
539 else {
540 JavaSyntaxKind::Caret
541 }
542 }
543 '~' => {
544 state.advance(1);
545 JavaSyntaxKind::Tilde
546 }
547 '?' => {
548 state.advance(1);
549 JavaSyntaxKind::Question
550 }
551 ':' => {
552 state.advance(1);
553 JavaSyntaxKind::Colon
554 }
555 ';' => {
556 state.advance(1);
557 JavaSyntaxKind::Semicolon
558 }
559 ',' => {
560 state.advance(1);
561 JavaSyntaxKind::Comma
562 }
563 '.' => {
564 state.advance(1);
565 if state.peek() == Some('.') && state.peek_next_n(1) == Some('.') {
566 state.advance(2);
567 JavaSyntaxKind::Ellipsis
568 }
569 else {
570 JavaSyntaxKind::Dot
571 }
572 }
573 '(' => {
574 state.advance(1);
575 JavaSyntaxKind::LeftParen
576 }
577 ')' => {
578 state.advance(1);
579 JavaSyntaxKind::RightParen
580 }
581 '{' => {
582 state.advance(1);
583 JavaSyntaxKind::LeftBrace
584 }
585 '}' => {
586 state.advance(1);
587 JavaSyntaxKind::RightBrace
588 }
589 '[' => {
590 state.advance(1);
591 JavaSyntaxKind::LeftBracket
592 }
593 ']' => {
594 state.advance(1);
595 JavaSyntaxKind::RightBracket
596 }
597 '@' => {
598 state.advance(1);
599 JavaSyntaxKind::At
600 }
601 _ => return false,
602 };
603
604 state.add_token(token_kind, start, state.get_position());
605 true
606 }
607 else {
608 false
609 }
610 }
611}