1pub mod token_type;
5
6use crate::{language::JavaLanguage, lexer::token_type::JavaTokenType};
7use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
8
9pub(crate) type State<'a, S> = LexerState<'a, S, JavaLanguage>;
10
11#[derive(Clone, Debug)]
13pub struct JavaLexer<'config> {
14 _config: &'config JavaLanguage,
15}
16
17impl<'config> Lexer<JavaLanguage> for JavaLexer<'config> {
18 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<JavaLanguage>) -> LexOutput<JavaLanguage> {
19 let mut state = State::new(source);
20 let result = self.run(&mut state);
21 if result.is_ok() {
22 state.add_eof();
23 }
24 state.finish_with_cache(result, cache)
25 }
26}
27
28impl<'config> JavaLexer<'config> {
29 pub fn new(config: &'config JavaLanguage) -> Self {
31 Self { _config: config }
32 }
33
34 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
36 while state.not_at_end() {
37 let safe_point = state.get_position();
38
39 if self.skip_whitespace(state) {
40 continue;
41 }
42
43 if self.lex_newline(state) {
44 continue;
45 }
46
47 if self.skip_comment(state) {
48 continue;
49 }
50
51 if self.lex_string_literal(state) {
52 continue;
53 }
54
55 if self.lex_char_literal(state) {
56 continue;
57 }
58
59 if self.lex_number_literal(state) {
60 continue;
61 }
62
63 if self.lex_identifier_or_keyword(state) {
64 continue;
65 }
66
67 if self.lex_operator_or_delimiter(state) {
68 continue;
69 }
70
71 let start_pos = state.get_position();
73 if let Some(ch) = state.peek() {
74 state.advance(ch.len_utf8());
75 state.add_token(JavaTokenType::Error, start_pos, state.get_position());
76 }
77
78 state.advance_if_dead_lock(safe_point);
79 }
80
81 Ok(())
82 }
83
84 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
86 let start = state.get_position();
87
88 while let Some(ch) = state.peek() {
89 if ch == ' ' || ch == '\t' || ch == '\r' {
90 state.advance(ch.len_utf8());
91 }
92 else {
93 break;
94 }
95 }
96
97 if state.get_position() > start {
98 state.add_token(JavaTokenType::Whitespace, start, state.get_position());
99 return true;
100 }
101 false
102 }
103
104 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
106 let start = state.get_position();
107
108 if let Some('\n') = state.peek() {
109 state.advance(1);
110 state.add_token(JavaTokenType::Whitespace, start, state.get_position());
111 true
112 }
113 else {
114 false
115 }
116 }
117
118 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
120 let start = state.get_position();
121
122 if state.peek() == Some('/') && state.peek_next_n(1) == Some('/') {
124 state.advance(2);
125 while let Some(ch) = state.peek() {
126 if ch == '\n' {
127 break;
128 }
129 state.advance(ch.len_utf8());
130 }
131 state.add_token(JavaTokenType::LineComment, start, state.get_position());
132 return true;
133 }
134
135 if state.peek() == Some('/') && state.peek_next_n(1) == Some('*') {
137 let start = state.get_position();
138 state.advance(2);
139 while let Some(ch) = state.peek() {
140 if ch == '*' && state.peek_next_n(1) == Some('/') {
141 state.advance(2);
142 break;
143 }
144 state.advance(ch.len_utf8());
145 }
146 state.add_token(JavaTokenType::BlockComment, start, state.get_position());
147 return true;
148 }
149
150 false
151 }
152
153 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
155 let start = state.get_position();
156
157 if let Some('"') = state.peek() {
158 state.advance(1);
159
160 while let Some(ch) = state.peek() {
161 if ch == '"' {
162 state.advance(1);
163 break;
164 }
165 else if ch == '\\' {
166 state.advance(1);
167 if let Some(escaped) = state.peek() {
168 state.advance(escaped.len_utf8());
169 }
170 }
171 else if ch == '\n' {
172 break;
174 }
175 else {
176 state.advance(ch.len_utf8());
177 }
178 }
179
180 state.add_token(JavaTokenType::StringLiteral, start, state.get_position());
181 return true;
182 }
183
184 false
185 }
186
187 fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
189 let start = state.get_position();
190
191 if let Some('\'') = state.peek() {
192 state.advance(1);
193
194 if let Some(ch) = state.peek() {
195 if ch == '\\' {
196 state.advance(1);
197 if let Some(escaped) = state.peek() {
198 state.advance(escaped.len_utf8());
199 }
200 }
201 else if ch != '\'' && ch != '\n' {
202 state.advance(ch.len_utf8());
203 }
204 }
205
206 if let Some('\'') = state.peek() {
207 state.advance(1);
208 }
209
210 state.add_token(JavaTokenType::CharacterLiteral, start, state.get_position());
211 return true;
212 }
213
214 false
215 }
216
217 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
219 let start = state.get_position();
220
221 if let Some(ch) = state.peek() {
222 if ch.is_ascii_digit() {
223 while let Some(ch) = state.peek() {
225 if ch.is_ascii_digit() {
226 state.advance(ch.len_utf8());
227 }
228 else {
229 break;
230 }
231 }
232
233 if state.peek() == Some('.') && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
235 state.advance(1); while let Some(ch) = state.peek() {
237 if ch.is_ascii_digit() {
238 state.advance(ch.len_utf8());
239 }
240 else {
241 break;
242 }
243 }
244 }
245
246 if let Some(ch) = state.peek() {
248 if ch == 'e' || ch == 'E' {
249 state.advance(1);
250 if let Some(sign) = state.peek() {
251 if sign == '+' || sign == '-' {
252 state.advance(1);
253 }
254 }
255 while let Some(ch) = state.peek() {
256 if ch.is_ascii_digit() {
257 state.advance(ch.len_utf8());
258 }
259 else {
260 break;
261 }
262 }
263 }
264 }
265
266 if let Some(suffix) = state.peek() {
268 if suffix == 'f' || suffix == 'F' || suffix == 'd' || suffix == 'D' || suffix == 'l' || suffix == 'L' {
269 state.advance(1);
270 }
271 }
272
273 let text = state.get_text_in((start..state.get_position()).into());
274 let kind = if text.contains('.') || text.contains('e') || text.contains('E') || text.ends_with('f') || text.ends_with('F') || text.ends_with('d') || text.ends_with('D') {
275 JavaTokenType::FloatingPointLiteral
276 }
277 else {
278 JavaTokenType::IntegerLiteral
279 };
280
281 eprintln!("DEBUG: Lexer classified '{}' as {:?} at {}..{}", text, kind, start, state.get_position());
282 state.add_token(kind, start, state.get_position());
283 return true;
284 }
285 }
286 false
287 }
288
289 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
291 let start = state.get_position();
292
293 if let Some(ch) = state.peek() {
294 if ch.is_ascii_alphabetic() || ch == '_' || ch == '$' {
295 state.advance(ch.len_utf8());
296
297 while let Some(ch) = state.peek() {
298 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '$' {
299 state.advance(ch.len_utf8());
300 }
301 else {
302 break;
303 }
304 }
305
306 let text = state.get_text_in((start..state.get_position()).into());
307 let token_kind = self.classify_identifier(text.as_ref());
308
309 eprintln!("DEBUG: Lexer classified '{}' as {:?} at {}..{}", text, token_kind, start, state.get_position());
310 state.add_token(token_kind, start, state.get_position());
311 true
312 }
313 else {
314 false
315 }
316 }
317 else {
318 false
319 }
320 }
321
322 fn classify_identifier(&self, text: &str) -> JavaTokenType {
324 match text {
325 "abstract" => JavaTokenType::Abstract,
326 "assert" => JavaTokenType::Assert,
327 "boolean" => JavaTokenType::Boolean,
328 "break" => JavaTokenType::Break,
329 "byte" => JavaTokenType::Byte,
330 "case" => JavaTokenType::Case,
331 "catch" => JavaTokenType::Catch,
332 "char" => JavaTokenType::Char,
333 "class" => JavaTokenType::Class,
334 "const" => JavaTokenType::Const,
335 "continue" => JavaTokenType::Continue,
336 "default" => JavaTokenType::Default,
337 "do" => JavaTokenType::Do,
338 "double" => JavaTokenType::Double,
339 "else" => JavaTokenType::Else,
340 "enum" => JavaTokenType::Enum,
341 "extends" => JavaTokenType::Extends,
342 "final" => JavaTokenType::Final,
343 "finally" => JavaTokenType::Finally,
344 "float" => JavaTokenType::Float,
345 "for" => JavaTokenType::For,
346 "goto" => JavaTokenType::Goto,
347 "if" => JavaTokenType::If,
348 "implements" => JavaTokenType::Implements,
349 "import" => JavaTokenType::Import,
350 "instanceof" => JavaTokenType::Instanceof,
351 "int" => JavaTokenType::Int,
352 "interface" => JavaTokenType::Interface,
353 "long" => JavaTokenType::Long,
354 "native" => JavaTokenType::Native,
355 "new" => JavaTokenType::New,
356 "package" => JavaTokenType::Package,
357 "private" => JavaTokenType::Private,
358 "protected" => JavaTokenType::Protected,
359 "public" => JavaTokenType::Public,
360 "record" => JavaTokenType::Record,
361 "return" => JavaTokenType::Return,
362 "short" => JavaTokenType::Short,
363 "static" => JavaTokenType::Static,
364 "strictfp" => JavaTokenType::Strictfp,
365 "struct" => JavaTokenType::Struct,
366 "super" => JavaTokenType::Super,
367 "switch" => JavaTokenType::Switch,
368 "synchronized" => JavaTokenType::Synchronized,
369 "this" => JavaTokenType::This,
370 "throw" => JavaTokenType::Throw,
371 "throws" => JavaTokenType::Throws,
372 "transient" => JavaTokenType::Transient,
373 "try" => JavaTokenType::Try,
374 "void" => JavaTokenType::Void,
375 "volatile" => JavaTokenType::Volatile,
376 "while" => JavaTokenType::While,
377 "true" | "false" => JavaTokenType::BooleanLiteral,
378 "null" => JavaTokenType::NullLiteral,
379 _ => JavaTokenType::Identifier,
380 }
381 }
382
383 fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
385 let start = state.get_position();
386
387 if let Some(ch) = state.peek() {
388 let token_kind = match ch {
389 '+' => {
390 state.advance(1);
391 if state.peek() == Some('+') {
392 state.advance(1);
393 JavaTokenType::PlusPlus
394 }
395 else if state.peek() == Some('=') {
396 state.advance(1);
397 JavaTokenType::PlusEquals
398 }
399 else {
400 JavaTokenType::Plus
401 }
402 }
403 '-' => {
404 state.advance(1);
405 if state.peek() == Some('-') {
406 state.advance(1);
407 JavaTokenType::MinusMinus
408 }
409 else if state.peek() == Some('=') {
410 state.advance(1);
411 JavaTokenType::MinusEquals
412 }
413 else {
414 JavaTokenType::Minus
415 }
416 }
417 '*' => {
418 state.advance(1);
419 if state.peek() == Some('=') {
420 state.advance(1);
421 JavaTokenType::AsteriskEquals
422 }
423 else {
424 JavaTokenType::Asterisk
425 }
426 }
427 '/' => {
428 state.advance(1);
429 if state.peek() == Some('=') {
430 state.advance(1);
431 JavaTokenType::SlashEquals
432 }
433 else {
434 JavaTokenType::Slash
435 }
436 }
437 '%' => {
438 state.advance(1);
439 if state.peek() == Some('=') {
440 state.advance(1);
441 JavaTokenType::PercentEquals
442 }
443 else {
444 JavaTokenType::Percent
445 }
446 }
447 '=' => {
448 state.advance(1);
449 if state.peek() == Some('=') {
450 state.advance(1);
451 JavaTokenType::Equals
452 }
453 else {
454 JavaTokenType::Assign
455 }
456 }
457 '!' => {
458 state.advance(1);
459 if state.peek() == Some('=') {
460 state.advance(1);
461 JavaTokenType::BangEquals
462 }
463 else {
464 JavaTokenType::Bang
465 }
466 }
467 '<' => {
468 state.advance(1);
469 if state.peek() == Some('=') {
470 state.advance(1);
471 JavaTokenType::LessThanEquals
472 }
473 else if state.peek() == Some('<') {
474 state.advance(1);
475 if state.peek() == Some('=') {
476 state.advance(1);
477 JavaTokenType::LeftShiftEquals
478 }
479 else {
480 JavaTokenType::LeftShift
481 }
482 }
483 else {
484 JavaTokenType::LessThan
485 }
486 }
487 '>' => {
488 state.advance(1);
489 if state.peek() == Some('=') {
490 state.advance(1);
491 JavaTokenType::GreaterThanEquals
492 }
493 else if state.peek() == Some('>') {
494 state.advance(1);
495 if state.peek() == Some('>') {
496 state.advance(1);
497 if state.peek() == Some('=') {
498 state.advance(1);
499 JavaTokenType::UnsignedRightShiftEquals
500 }
501 else {
502 JavaTokenType::UnsignedRightShift
503 }
504 }
505 else if state.peek() == Some('=') {
506 state.advance(1);
507 JavaTokenType::RightShiftEquals
508 }
509 else {
510 JavaTokenType::RightShift
511 }
512 }
513 else {
514 JavaTokenType::GreaterThan
515 }
516 }
517 '&' => {
518 state.advance(1);
519 if state.peek() == Some('&') {
520 state.advance(1);
521 JavaTokenType::AmpersandAmpersand
522 }
523 else if state.peek() == Some('=') {
524 state.advance(1);
525 JavaTokenType::AmpersandEquals
526 }
527 else {
528 JavaTokenType::Ampersand
529 }
530 }
531 '|' => {
532 state.advance(1);
533 if state.peek() == Some('|') {
534 state.advance(1);
535 JavaTokenType::PipePipe
536 }
537 else if state.peek() == Some('=') {
538 state.advance(1);
539 JavaTokenType::PipeEquals
540 }
541 else {
542 JavaTokenType::Pipe
543 }
544 }
545 '^' => {
546 state.advance(1);
547 if state.peek() == Some('=') {
548 state.advance(1);
549 JavaTokenType::CaretEquals
550 }
551 else {
552 JavaTokenType::Caret
553 }
554 }
555 '~' => {
556 state.advance(1);
557 JavaTokenType::Tilde
558 }
559 '?' => {
560 state.advance(1);
561 JavaTokenType::Question
562 }
563 ':' => {
564 state.advance(1);
565 JavaTokenType::Colon
566 }
567 ';' => {
568 state.advance(1);
569 JavaTokenType::Semicolon
570 }
571 ',' => {
572 state.advance(1);
573 JavaTokenType::Comma
574 }
575 '.' => {
576 state.advance(1);
577 if state.peek() == Some('.') && state.peek_next_n(1) == Some('.') {
578 state.advance(2);
579 JavaTokenType::Ellipsis
580 }
581 else {
582 JavaTokenType::Dot
583 }
584 }
585 '(' => {
586 state.advance(1);
587 JavaTokenType::LeftParen
588 }
589 ')' => {
590 state.advance(1);
591 JavaTokenType::RightParen
592 }
593 '{' => {
594 state.advance(1);
595 JavaTokenType::LeftBrace
596 }
597 '}' => {
598 state.advance(1);
599 JavaTokenType::RightBrace
600 }
601 '[' => {
602 state.advance(1);
603 JavaTokenType::LeftBracket
604 }
605 ']' => {
606 state.advance(1);
607 JavaTokenType::RightBracket
608 }
609 '@' => {
610 state.advance(1);
611 JavaTokenType::At
612 }
613 _ => return false,
614 };
615
616 state.add_token(token_kind, start, state.get_position());
617 true
618 }
619 else {
620 false
621 }
622 }
623}