1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::JavaLanguage, lexer::token_type::JavaTokenType};
5use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
6
7type State<'a, S> = LexerState<'a, S, JavaLanguage>;
8
9#[derive(Clone, Debug)]
10pub struct JavaLexer<'config> {
11 _config: &'config JavaLanguage,
12}
13
14impl<'config> Lexer<JavaLanguage> for JavaLexer<'config> {
15 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<JavaLanguage>) -> LexOutput<JavaLanguage> {
16 let mut state = State::new(source);
17 let result = self.run(&mut state);
18 if result.is_ok() {
19 state.add_eof();
20 }
21 state.finish_with_cache(result, cache)
22 }
23}
24
25impl<'config> JavaLexer<'config> {
26 pub fn new(config: &'config JavaLanguage) -> Self {
27 Self { _config: config }
28 }
29
30 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
32 while state.not_at_end() {
33 let safe_point = state.get_position();
34
35 if self.skip_whitespace(state) {
36 continue;
37 }
38
39 if self.lex_newline(state) {
40 continue;
41 }
42
43 if self.skip_comment(state) {
44 continue;
45 }
46
47 if self.lex_string_literal(state) {
48 continue;
49 }
50
51 if self.lex_char_literal(state) {
52 continue;
53 }
54
55 if self.lex_number_literal(state) {
56 continue;
57 }
58
59 if self.lex_identifier_or_keyword(state) {
60 continue;
61 }
62
63 if self.lex_operator_or_delimiter(state) {
64 continue;
65 }
66
67 let start_pos = state.get_position();
69 if let Some(ch) = state.peek() {
70 state.advance(ch.len_utf8());
71 state.add_token(JavaTokenType::Error, start_pos, state.get_position());
72 }
73
74 state.advance_if_dead_lock(safe_point);
75 }
76
77 Ok(())
78 }
79
80 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
82 let start = state.get_position();
83
84 while let Some(ch) = state.peek() {
85 if ch == ' ' || ch == '\t' || ch == '\r' {
86 state.advance(ch.len_utf8());
87 }
88 else {
89 break;
90 }
91 }
92
93 if state.get_position() > start {
94 state.add_token(JavaTokenType::Whitespace, start, state.get_position());
95 return true;
96 }
97 false
98 }
99
100 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
102 let start = state.get_position();
103
104 if let Some('\n') = state.peek() {
105 state.advance(1);
106 state.add_token(JavaTokenType::Whitespace, start, state.get_position());
107 true
108 }
109 else {
110 false
111 }
112 }
113
114 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
116 let start = state.get_position();
117
118 if state.peek() == Some('/') && state.peek_next_n(1) == Some('/') {
120 state.advance(2);
121 while let Some(ch) = state.peek() {
122 if ch == '\n' {
123 break;
124 }
125 state.advance(ch.len_utf8());
126 }
127 state.add_token(JavaTokenType::LineComment, start, state.get_position());
128 return true;
129 }
130
131 if state.peek() == Some('/') && state.peek_next_n(1) == Some('*') {
133 let start = state.get_position();
134 state.advance(2);
135 while let Some(ch) = state.peek() {
136 if ch == '*' && state.peek_next_n(1) == Some('/') {
137 state.advance(2);
138 break;
139 }
140 state.advance(ch.len_utf8());
141 }
142 state.add_token(JavaTokenType::BlockComment, start, state.get_position());
143 return true;
144 }
145
146 false
147 }
148
149 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
151 let start = state.get_position();
152
153 if let Some('"') = state.peek() {
154 state.advance(1);
155
156 while let Some(ch) = state.peek() {
157 if ch == '"' {
158 state.advance(1);
159 break;
160 }
161 else if ch == '\\' {
162 state.advance(1);
163 if let Some(escaped) = state.peek() {
164 state.advance(escaped.len_utf8());
165 }
166 }
167 else if ch == '\n' {
168 break;
170 }
171 else {
172 state.advance(ch.len_utf8());
173 }
174 }
175
176 state.add_token(JavaTokenType::StringLiteral, start, state.get_position());
177 return true;
178 }
179
180 false
181 }
182
183 fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
185 let start = state.get_position();
186
187 if let Some('\'') = state.peek() {
188 state.advance(1);
189
190 if let Some(ch) = state.peek() {
191 if ch == '\\' {
192 state.advance(1);
193 if let Some(escaped) = state.peek() {
194 state.advance(escaped.len_utf8());
195 }
196 }
197 else if ch != '\'' && ch != '\n' {
198 state.advance(ch.len_utf8());
199 }
200 }
201
202 if let Some('\'') = state.peek() {
203 state.advance(1);
204 }
205
206 state.add_token(JavaTokenType::CharacterLiteral, start, state.get_position());
207 return true;
208 }
209
210 false
211 }
212
213 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
215 let start = state.get_position();
216
217 if let Some(ch) = state.peek() {
218 if ch.is_ascii_digit() {
219 while let Some(ch) = state.peek() {
221 if ch.is_ascii_digit() {
222 state.advance(ch.len_utf8());
223 }
224 else {
225 break;
226 }
227 }
228
229 if state.peek() == Some('.') && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
231 state.advance(1); while let Some(ch) = state.peek() {
233 if ch.is_ascii_digit() {
234 state.advance(ch.len_utf8());
235 }
236 else {
237 break;
238 }
239 }
240 }
241
242 if let Some(ch) = state.peek() {
244 if ch == 'e' || ch == 'E' {
245 state.advance(1);
246 if let Some(sign) = state.peek() {
247 if sign == '+' || sign == '-' {
248 state.advance(1);
249 }
250 }
251 while let Some(ch) = state.peek() {
252 if ch.is_ascii_digit() {
253 state.advance(ch.len_utf8());
254 }
255 else {
256 break;
257 }
258 }
259 }
260 }
261
262 if let Some(suffix) = state.peek() {
264 if suffix == 'f' || suffix == 'F' || suffix == 'd' || suffix == 'D' || suffix == 'l' || suffix == 'L' {
265 state.advance(1);
266 }
267 }
268
269 let text = state.get_text_in((start..state.get_position()).into());
270 let kind = if text.contains('.') || text.contains('e') || text.contains('E') || text.ends_with('f') || text.ends_with('F') || text.ends_with('d') || text.ends_with('D') {
271 JavaTokenType::FloatingPointLiteral
272 }
273 else {
274 JavaTokenType::IntegerLiteral
275 };
276
277 eprintln!("DEBUG: Lexer classified '{}' as {:?} at {}..{}", text, kind, start, state.get_position());
278 state.add_token(kind, start, state.get_position());
279 return true;
280 }
281 }
282 false
283 }
284
285 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
287 let start = state.get_position();
288
289 if let Some(ch) = state.peek() {
290 if ch.is_ascii_alphabetic() || ch == '_' || ch == '$' {
291 state.advance(ch.len_utf8());
292
293 while let Some(ch) = state.peek() {
294 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' {
295 state.advance(ch.len_utf8());
296 }
297 else {
298 break;
299 }
300 }
301
302 let text = state.get_text_in((start..state.get_position()).into());
303 let token_kind = self.classify_identifier(text.as_ref());
304
305 eprintln!("DEBUG: Lexer classified '{}' as {:?} at {}..{}", text, token_kind, start, state.get_position());
306 state.add_token(token_kind, start, state.get_position());
307 true
308 }
309 else {
310 false
311 }
312 }
313 else {
314 false
315 }
316 }
317
318 fn classify_identifier(&self, text: &str) -> JavaTokenType {
320 match text {
321 "abstract" => JavaTokenType::Abstract,
322 "assert" => JavaTokenType::Assert,
323 "boolean" => JavaTokenType::Boolean,
324 "break" => JavaTokenType::Break,
325 "byte" => JavaTokenType::Byte,
326 "case" => JavaTokenType::Case,
327 "catch" => JavaTokenType::Catch,
328 "char" => JavaTokenType::Char,
329 "class" => JavaTokenType::Class,
330 "const" => JavaTokenType::Const,
331 "continue" => JavaTokenType::Continue,
332 "default" => JavaTokenType::Default,
333 "do" => JavaTokenType::Do,
334 "double" => JavaTokenType::Double,
335 "else" => JavaTokenType::Else,
336 "enum" => JavaTokenType::Enum,
337 "extends" => JavaTokenType::Extends,
338 "final" => JavaTokenType::Final,
339 "finally" => JavaTokenType::Finally,
340 "float" => JavaTokenType::Float,
341 "for" => JavaTokenType::For,
342 "goto" => JavaTokenType::Goto,
343 "if" => JavaTokenType::If,
344 "implements" => JavaTokenType::Implements,
345 "import" => JavaTokenType::Import,
346 "instanceof" => JavaTokenType::Instanceof,
347 "int" => JavaTokenType::Int,
348 "interface" => JavaTokenType::Interface,
349 "long" => JavaTokenType::Long,
350 "native" => JavaTokenType::Native,
351 "new" => JavaTokenType::New,
352 "package" => JavaTokenType::Package,
353 "private" => JavaTokenType::Private,
354 "protected" => JavaTokenType::Protected,
355 "public" => JavaTokenType::Public,
356 "record" => JavaTokenType::Record,
357 "return" => JavaTokenType::Return,
358 "short" => JavaTokenType::Short,
359 "static" => JavaTokenType::Static,
360 "strictfp" => JavaTokenType::Strictfp,
361 "struct" => JavaTokenType::Struct,
362 "super" => JavaTokenType::Super,
363 "switch" => JavaTokenType::Switch,
364 "synchronized" => JavaTokenType::Synchronized,
365 "this" => JavaTokenType::This,
366 "throw" => JavaTokenType::Throw,
367 "throws" => JavaTokenType::Throws,
368 "transient" => JavaTokenType::Transient,
369 "try" => JavaTokenType::Try,
370 "void" => JavaTokenType::Void,
371 "volatile" => JavaTokenType::Volatile,
372 "while" => JavaTokenType::While,
373 "true" | "false" => JavaTokenType::BooleanLiteral,
374 "null" => JavaTokenType::NullLiteral,
375 _ => JavaTokenType::Identifier,
376 }
377 }
378
379 fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
381 let start = state.get_position();
382
383 if let Some(ch) = state.peek() {
384 let token_kind = match ch {
385 '+' => {
386 state.advance(1);
387 if state.peek() == Some('+') {
388 state.advance(1);
389 JavaTokenType::PlusPlus
390 }
391 else if state.peek() == Some('=') {
392 state.advance(1);
393 JavaTokenType::PlusEquals
394 }
395 else {
396 JavaTokenType::Plus
397 }
398 }
399 '-' => {
400 state.advance(1);
401 if state.peek() == Some('-') {
402 state.advance(1);
403 JavaTokenType::MinusMinus
404 }
405 else if state.peek() == Some('=') {
406 state.advance(1);
407 JavaTokenType::MinusEquals
408 }
409 else {
410 JavaTokenType::Minus
411 }
412 }
413 '*' => {
414 state.advance(1);
415 if state.peek() == Some('=') {
416 state.advance(1);
417 JavaTokenType::AsteriskEquals
418 }
419 else {
420 JavaTokenType::Asterisk
421 }
422 }
423 '/' => {
424 state.advance(1);
425 if state.peek() == Some('=') {
426 state.advance(1);
427 JavaTokenType::SlashEquals
428 }
429 else {
430 JavaTokenType::Slash
431 }
432 }
433 '%' => {
434 state.advance(1);
435 if state.peek() == Some('=') {
436 state.advance(1);
437 JavaTokenType::PercentEquals
438 }
439 else {
440 JavaTokenType::Percent
441 }
442 }
443 '=' => {
444 state.advance(1);
445 if state.peek() == Some('=') {
446 state.advance(1);
447 JavaTokenType::Equals
448 }
449 else {
450 JavaTokenType::Assign
451 }
452 }
453 '!' => {
454 state.advance(1);
455 if state.peek() == Some('=') {
456 state.advance(1);
457 JavaTokenType::BangEquals
458 }
459 else {
460 JavaTokenType::Bang
461 }
462 }
463 '<' => {
464 state.advance(1);
465 if state.peek() == Some('=') {
466 state.advance(1);
467 JavaTokenType::LessThanEquals
468 }
469 else if state.peek() == Some('<') {
470 state.advance(1);
471 if state.peek() == Some('=') {
472 state.advance(1);
473 JavaTokenType::LeftShiftEquals
474 }
475 else {
476 JavaTokenType::LeftShift
477 }
478 }
479 else {
480 JavaTokenType::LessThan
481 }
482 }
483 '>' => {
484 state.advance(1);
485 if state.peek() == Some('=') {
486 state.advance(1);
487 JavaTokenType::GreaterThanEquals
488 }
489 else if state.peek() == Some('>') {
490 state.advance(1);
491 if state.peek() == Some('>') {
492 state.advance(1);
493 if state.peek() == Some('=') {
494 state.advance(1);
495 JavaTokenType::UnsignedRightShiftEquals
496 }
497 else {
498 JavaTokenType::UnsignedRightShift
499 }
500 }
501 else if state.peek() == Some('=') {
502 state.advance(1);
503 JavaTokenType::RightShiftEquals
504 }
505 else {
506 JavaTokenType::RightShift
507 }
508 }
509 else {
510 JavaTokenType::GreaterThan
511 }
512 }
513 '&' => {
514 state.advance(1);
515 if state.peek() == Some('&') {
516 state.advance(1);
517 JavaTokenType::AmpersandAmpersand
518 }
519 else if state.peek() == Some('=') {
520 state.advance(1);
521 JavaTokenType::AmpersandEquals
522 }
523 else {
524 JavaTokenType::Ampersand
525 }
526 }
527 '|' => {
528 state.advance(1);
529 if state.peek() == Some('|') {
530 state.advance(1);
531 JavaTokenType::PipePipe
532 }
533 else if state.peek() == Some('=') {
534 state.advance(1);
535 JavaTokenType::PipeEquals
536 }
537 else {
538 JavaTokenType::Pipe
539 }
540 }
541 '^' => {
542 state.advance(1);
543 if state.peek() == Some('=') {
544 state.advance(1);
545 JavaTokenType::CaretEquals
546 }
547 else {
548 JavaTokenType::Caret
549 }
550 }
551 '~' => {
552 state.advance(1);
553 JavaTokenType::Tilde
554 }
555 '?' => {
556 state.advance(1);
557 JavaTokenType::Question
558 }
559 ':' => {
560 state.advance(1);
561 JavaTokenType::Colon
562 }
563 ';' => {
564 state.advance(1);
565 JavaTokenType::Semicolon
566 }
567 ',' => {
568 state.advance(1);
569 JavaTokenType::Comma
570 }
571 '.' => {
572 state.advance(1);
573 if state.peek() == Some('.') && state.peek_next_n(1) == Some('.') {
574 state.advance(2);
575 JavaTokenType::Ellipsis
576 }
577 else {
578 JavaTokenType::Dot
579 }
580 }
581 '(' => {
582 state.advance(1);
583 JavaTokenType::LeftParen
584 }
585 ')' => {
586 state.advance(1);
587 JavaTokenType::RightParen
588 }
589 '{' => {
590 state.advance(1);
591 JavaTokenType::LeftBrace
592 }
593 '}' => {
594 state.advance(1);
595 JavaTokenType::RightBrace
596 }
597 '[' => {
598 state.advance(1);
599 JavaTokenType::LeftBracket
600 }
601 ']' => {
602 state.advance(1);
603 JavaTokenType::RightBracket
604 }
605 '@' => {
606 state.advance(1);
607 JavaTokenType::At
608 }
609 _ => return false,
610 };
611
612 state.add_token(token_kind, start, state.get_position());
613 true
614 }
615 else {
616 false
617 }
618 }
619}