Skip to main content

oak_groovy/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2
3use oak_core::Source;
4pub mod token_type;
5
6use crate::{language::GroovyLanguage, lexer::token_type::GroovyTokenType};
7use oak_core::{
8    Lexer, LexerCache, LexerState, OakError,
9    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
10};
11use std::sync::LazyLock;
12
13pub(crate) type State<'a, S> = LexerState<'a, S, GroovyLanguage>;
14
15static GROOVY_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
16static GROOVY_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: false });
17static GROOVY_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
18static GROOVY_CHAR: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: Some('\\') });
19
20/// Lexer for Groovy source code.
21#[derive(Clone)]
22pub struct GroovyLexer<'config> {
23    config: &'config GroovyLanguage,
24}
25
26impl<'config> Lexer<GroovyLanguage> for GroovyLexer<'config> {
27    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<GroovyLanguage>) -> LexOutput<GroovyLanguage> {
28        let mut state = LexerState::new(source);
29        let result = self.run(&mut state);
30        if result.is_ok() {
31            state.add_eof();
32        }
33        state.finish_with_cache(result, cache)
34    }
35}
36
37impl<'config> GroovyLexer<'config> {
38    /// Creates a new `GroovyLexer` with the given configuration.
39    pub fn new(config: &'config GroovyLanguage) -> Self {
40        Self { config }
41    }
42
43    /// Runs the lexer on the given state.
44    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
45        while state.not_at_end() {
46            let safe_point = state.get_position();
47
48            if self.skip_whitespace(state) {
49                continue;
50            }
51
52            if self.skip_comment(state) {
53                continue;
54            }
55
56            if self.lex_string_literal(state) {
57                continue;
58            }
59
60            if self.lex_char_literal(state) {
61                continue;
62            }
63
64            if self.lex_number_literal(state) {
65                continue;
66            }
67
68            if self.lex_identifier_or_keyword(state) {
69                continue;
70            }
71
72            if self.lex_operators(state) {
73                continue;
74            }
75
76            if self.lex_single_char_tokens(state) {
77                continue;
78            }
79
80            state.advance_if_dead_lock(safe_point);
81        }
82
83        Ok(())
84    }
85
86    /// Skips whitespace characters.
87    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88        GROOVY_WHITESPACE.scan(state, GroovyTokenType::Whitespace)
89    }
90
91    /// Skips comments.
92    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
93        // Line comment // and Block comment /* ... */
94        if GROOVY_COMMENT.scan(state, GroovyTokenType::Comment, GroovyTokenType::Comment) {
95            return true;
96        }
97
98        false
99    }
100
101    /// Lexes string literals.
102    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
103        // Normal string "..."
104        if GROOVY_STRING.scan(state, GroovyTokenType::StringLiteral) {
105            return true;
106        }
107
108        // Triple-quoted string """..."""
109        if state.consume_if_starts_with("\"\"\"") {
110            let start = state.get_position() - 3;
111
112            while state.not_at_end() {
113                if state.consume_if_starts_with("\"\"\"") {
114                    break;
115                }
116                if let Some(ch) = state.peek() {
117                    state.advance(ch.len_utf8());
118                }
119            }
120
121            let end = state.get_position();
122            state.add_token(GroovyTokenType::StringLiteral, start, end);
123            return true;
124        }
125
126        // GString $/.../$
127        if state.consume_if_starts_with("$/") {
128            let start = state.get_position() - 2;
129
130            while state.not_at_end() {
131                if state.consume_if_starts_with("/$") {
132                    break;
133                }
134                if let Some(ch) = state.peek() {
135                    state.advance(ch.len_utf8());
136                }
137            }
138
139            let end = state.get_position();
140            state.add_token(GroovyTokenType::StringLiteral, start, end);
141            return true;
142        }
143
144        false
145    }
146
147    /// Lexes character literals.
148    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
149        GROOVY_CHAR.scan(state, GroovyTokenType::CharLiteral)
150    }
151
152    /// Lexes number literals.
153    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
154        let start = state.get_position();
155        let mut has_digits = false;
156        let mut _is_float = false;
157
158        // Handle negative sign
159        if state.consume_if_starts_with("-") {
160            // Negative sign
161        }
162
163        // Handle hex 0x...
164        if state.consume_if_starts_with("0x") || state.consume_if_starts_with("0X") {
165            while let Some(ch) = state.peek() {
166                if ch.is_ascii_hexdigit() {
167                    state.advance(ch.len_utf8());
168                    has_digits = true;
169                }
170                else {
171                    break;
172                }
173            }
174        }
175        // Handle octal 0...
176        else if state.peek() == Some('0') {
177            state.advance(1);
178            has_digits = true;
179            while let Some(ch) = state.peek() {
180                if ch >= '0' && ch <= '7' {
181                    state.advance(ch.len_utf8());
182                }
183                else {
184                    break;
185                }
186            }
187        }
188        // Handle decimal
189        else {
190            // Handle integer part
191            while let Some(ch) = state.peek() {
192                if ch.is_ascii_digit() {
193                    state.advance(ch.len_utf8());
194                    has_digits = true;
195                }
196                else {
197                    break;
198                }
199            }
200
201            // Handle fractional part
202            if state.peek() == Some('.') && has_digits {
203                if let Some(next_ch) = state.peek_next_n(1) {
204                    if next_ch.is_ascii_digit() {
205                        state.advance(1); // skip .
206                        _is_float = true;
207
208                        while let Some(ch) = state.peek() {
209                            if ch.is_ascii_digit() {
210                                state.advance(ch.len_utf8());
211                            }
212                            else {
213                                break;
214                            }
215                        }
216                    }
217                }
218            }
219
220            // Handle exponent part
221            if let Some(ch) = state.peek() {
222                if (ch == 'e' || ch == 'E') && has_digits {
223                    state.advance(1);
224                    _is_float = true;
225
226                    // Handle exponent sign
227                    if let Some(next) = state.peek() {
228                        if next == '+' || next == '-' {
229                            state.advance(1);
230                        }
231                    }
232
233                    // Handle exponent digits
234                    let mut exp_digits = false;
235                    while let Some(ch) = state.peek() {
236                        if ch.is_ascii_digit() {
237                            state.advance(ch.len_utf8());
238                            exp_digits = true;
239                        }
240                        else {
241                            break;
242                        }
243                    }
244
245                    if !exp_digits {
246                        // Exponent part must have digits
247                        return false;
248                    }
249                }
250            }
251        }
252
253        // Handle number suffixes (G, L, F, D)
254        if has_digits {
255            if let Some(ch) = state.peek() {
256                if matches!(ch, 'G' | 'g' | 'L' | 'l' | 'F' | 'f' | 'D' | 'd') {
257                    state.advance(ch.len_utf8());
258                    _is_float = matches!(ch, 'F' | 'f' | 'D' | 'd' | 'G' | 'g');
259                }
260            }
261        }
262
263        if has_digits {
264            let end = state.get_position();
265            let kind = if _is_float { GroovyTokenType::FloatLiteral } else { GroovyTokenType::IntLiteral };
266            state.add_token(kind, start, end);
267            true
268        }
269        else {
270            false
271        }
272    }
273
274    /// Lexes identifiers or keywords.
275    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
276        let start = state.get_position();
277
278        // Identifier must start with a letter or underscore or $
279        if let Some(first_ch) = state.peek() {
280            if !first_ch.is_alphabetic() && first_ch != '_' && first_ch != '$' {
281                return false;
282            }
283
284            state.advance(first_ch.len_utf8());
285
286            // Subsequent characters can be letters, digits, underscores, or $
287            while let Some(ch) = state.peek() {
288                if ch.is_alphanumeric() || ch == '_' || ch == '$' { state.advance(ch.len_utf8()) } else { break }
289            }
290
291            let end = state.get_position();
292            let text = state.get_text_in((start..end).into());
293            let kind = self.keyword_or_identifier(text.as_ref());
294            state.add_token(kind, start, end);
295            true
296        }
297        else {
298            false
299        }
300    }
301
302    /// Returns the token type for the given text, which can be a keyword or an identifier.
303    fn keyword_or_identifier(&self, text: &str) -> GroovyTokenType {
304        match text {
305            // Keywords
306            "abstract" => GroovyTokenType::AbstractKeyword,
307            "as" => GroovyTokenType::AsKeyword,
308            "assert" => GroovyTokenType::AssertKeyword,
309            "break" => GroovyTokenType::BreakKeyword,
310            "case" => GroovyTokenType::CaseKeyword,
311            "catch" => GroovyTokenType::CatchKeyword,
312            "class" => GroovyTokenType::ClassKeyword,
313            "const" => GroovyTokenType::ConstKeyword,
314            "continue" => GroovyTokenType::ContinueKeyword,
315            "def" => GroovyTokenType::DefKeyword,
316            "default" => GroovyTokenType::DefaultKeyword,
317            "do" => GroovyTokenType::DoKeyword,
318            "else" => GroovyTokenType::ElseKeyword,
319            "enum" => GroovyTokenType::EnumKeyword,
320            "extends" => GroovyTokenType::ExtendsKeyword,
321            "final" => GroovyTokenType::FinalKeyword,
322            "finally" => GroovyTokenType::FinallyKeyword,
323            "for" => GroovyTokenType::ForKeyword,
324            "goto" => GroovyTokenType::GotoKeyword,
325            "if" => GroovyTokenType::IfKeyword,
326            "implements" => GroovyTokenType::ImplementsKeyword,
327            "import" => GroovyTokenType::ImportKeyword,
328            "in" => GroovyTokenType::InKeyword,
329            "instanceof" => GroovyTokenType::InstanceofKeyword,
330            "interface" => GroovyTokenType::InterfaceKeyword,
331            "native" => GroovyTokenType::NativeKeyword,
332            "new" => GroovyTokenType::NewKeyword,
333            "package" => GroovyTokenType::PackageKeyword,
334            "private" => GroovyTokenType::PrivateKeyword,
335            "protected" => GroovyTokenType::ProtectedKeyword,
336            "public" => GroovyTokenType::PublicKeyword,
337            "return" => GroovyTokenType::ReturnKeyword,
338            "static" => GroovyTokenType::StaticKeyword,
339            "strictfp" => GroovyTokenType::StrictfpKeyword,
340            "super" => GroovyTokenType::SuperKeyword,
341            "switch" => GroovyTokenType::SwitchKeyword,
342            "synchronized" => GroovyTokenType::SynchronizedKeyword,
343            "this" => GroovyTokenType::ThisKeyword,
344            "throw" => GroovyTokenType::ThrowKeyword,
345            "throws" => GroovyTokenType::ThrowsKeyword,
346            "trait" => GroovyTokenType::TraitKeyword,
347            "transient" => GroovyTokenType::TransientKeyword,
348            "try" => GroovyTokenType::TryKeyword,
349            "void" => GroovyTokenType::VoidKeyword,
350            "volatile" => GroovyTokenType::VolatileKeyword,
351            "while" => GroovyTokenType::WhileKeyword,
352
353            // Special literals
354            "true" | "false" => GroovyTokenType::BooleanLiteral,
355            "null" => GroovyTokenType::NullLiteral,
356
357            // Default to identifier
358            _ => GroovyTokenType::Identifier,
359        }
360    }
361
362    /// Lexes operators.
363    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
364        let start = state.get_position();
365
366        // Three-character operators
367        if state.consume_if_starts_with(">>>") {
368            state.add_token(GroovyTokenType::UnsignedRightShift, start, state.get_position());
369            return true;
370        }
371        if state.consume_if_starts_with("<=>") {
372            state.add_token(GroovyTokenType::Spaceship, start, state.get_position());
373            return true;
374        }
375
376        // Two-character operators
377        if state.consume_if_starts_with("**") {
378            state.add_token(GroovyTokenType::Power, start, state.get_position());
379            return true;
380        }
381        if state.consume_if_starts_with("+=") {
382            state.add_token(GroovyTokenType::PlusAssign, start, state.get_position());
383            return true;
384        }
385        if state.consume_if_starts_with("-=") {
386            state.add_token(GroovyTokenType::MinusAssign, start, state.get_position());
387            return true;
388        }
389        if state.consume_if_starts_with("*=") {
390            state.add_token(GroovyTokenType::StarAssign, start, state.get_position());
391            return true;
392        }
393        if state.consume_if_starts_with("/=") {
394            state.add_token(GroovyTokenType::SlashAssign, start, state.get_position());
395            return true;
396        }
397        if state.consume_if_starts_with("%=") {
398            state.add_token(GroovyTokenType::PercentAssign, start, state.get_position());
399            return true;
400        }
401        if state.consume_if_starts_with("**=") {
402            state.add_token(GroovyTokenType::PowerAssign, start, state.get_position());
403            return true;
404        }
405        if state.consume_if_starts_with("==") {
406            state.add_token(GroovyTokenType::Equal, start, state.get_position());
407            return true;
408        }
409        if state.consume_if_starts_with("!=") {
410            state.add_token(GroovyTokenType::NotEqual, start, state.get_position());
411            return true;
412        }
413        if state.consume_if_starts_with("<=") {
414            state.add_token(GroovyTokenType::LessEqual, start, state.get_position());
415            return true;
416        }
417        if state.consume_if_starts_with(">=") {
418            state.add_token(GroovyTokenType::GreaterEqual, start, state.get_position());
419            return true;
420        }
421        if state.consume_if_starts_with("&&") {
422            state.add_token(GroovyTokenType::LogicalAnd, start, state.get_position());
423            return true;
424        }
425        if state.consume_if_starts_with("||") {
426            state.add_token(GroovyTokenType::LogicalOr, start, state.get_position());
427            return true;
428        }
429        if state.consume_if_starts_with("<<") {
430            state.add_token(GroovyTokenType::LeftShift, start, state.get_position());
431            return true;
432        }
433        if state.consume_if_starts_with(">>") {
434            state.add_token(GroovyTokenType::RightShift, start, state.get_position());
435            return true;
436        }
437        if state.consume_if_starts_with("++") {
438            state.add_token(GroovyTokenType::Increment, start, state.get_position());
439            return true;
440        }
441        if state.consume_if_starts_with("--") {
442            state.add_token(GroovyTokenType::Decrement, start, state.get_position());
443            return true;
444        }
445        if state.consume_if_starts_with("?:") {
446            state.add_token(GroovyTokenType::Elvis, start, state.get_position());
447            return true;
448        }
449        if state.consume_if_starts_with("?.") {
450            state.add_token(GroovyTokenType::SafeNavigation, start, state.get_position());
451            return true;
452        }
453
454        false
455    }
456
457    /// Lexes single-character tokens.
458    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
459        if let Some(ch) = state.peek() {
460            let start = state.get_position();
461            let kind = match ch {
462                '+' => Some(GroovyTokenType::Plus),
463                '-' => Some(GroovyTokenType::Minus),
464                '*' => Some(GroovyTokenType::Star),
465                '/' => Some(GroovyTokenType::Slash),
466                '%' => Some(GroovyTokenType::Percent),
467                '=' => Some(GroovyTokenType::Assign),
468                '<' => Some(GroovyTokenType::Less),
469                '>' => Some(GroovyTokenType::Greater),
470                '!' => Some(GroovyTokenType::LogicalNot),
471                '&' => Some(GroovyTokenType::BitAnd),
472                '|' => Some(GroovyTokenType::BitOr),
473                '^' => Some(GroovyTokenType::BitXor),
474                '~' => Some(GroovyTokenType::BitNot),
475                '?' => Some(GroovyTokenType::Question),
476                ':' => Some(GroovyTokenType::Colon),
477                '(' => Some(GroovyTokenType::LeftParen),
478                ')' => Some(GroovyTokenType::RightParen),
479                '[' => Some(GroovyTokenType::LeftBracket),
480                ']' => Some(GroovyTokenType::RightBracket),
481                '{' => Some(GroovyTokenType::LeftBrace),
482                '}' => Some(GroovyTokenType::RightBrace),
483                ',' => Some(GroovyTokenType::Comma),
484                '.' => Some(GroovyTokenType::Period),
485                ';' => Some(GroovyTokenType::Semicolon),
486                '@' => Some(GroovyTokenType::At),
487                _ => None,
488            };
489
490            if let Some(token_kind) = kind {
491                state.advance(ch.len_utf8());
492                let end = state.get_position();
493                state.add_token(token_kind, start, end);
494                true
495            }
496            else {
497                false
498            }
499        }
500        else {
501            false
502        }
503    }
504}