Skip to main content

oak_ruby/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token types for the Ruby language.
3pub mod token_type;
4
5use crate::{language::RubyLanguage, lexer::token_type::RubyTokenType};
6use oak_core::{LexOutput, Lexer, LexerCache, LexerState, OakError, Source, TextEdit};
7
8pub(crate) type State<'a, S> = LexerState<'a, S, RubyLanguage>;
9
10/// A lexer for the Ruby language.
11#[derive(Clone, Debug)]
12pub struct RubyLexer<'config> {
13    config: &'config RubyLanguage,
14}
15
16impl<'config> Lexer<RubyLanguage> for RubyLexer<'config> {
17    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<RubyLanguage>) -> LexOutput<RubyLanguage> {
18        let mut state: State<'_, S> = LexerState::new(source);
19        let result = self.run(&mut state);
20        if result.is_ok() {
21            state.add_eof()
22        }
23        state.finish_with_cache(result, cache)
24    }
25}
26
27impl<'config> RubyLexer<'config> {
28    /// Creates a new `RubyLexer` with the given configuration.
29    pub fn new(config: &'config RubyLanguage) -> Self {
30        Self { config }
31    }
32
33    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
34        while state.not_at_end() {
35            let safe_point = state.get_position();
36
37            if self.skip_whitespace(state) {
38                continue;
39            }
40
41            if self.lex_newline(state) {
42                continue;
43            }
44
45            if self.skip_comment(state) {
46                continue;
47            }
48
49            if self.lex_string_literal(state) {
50                continue;
51            }
52
53            if self.lex_symbol(state) {
54                continue;
55            }
56
57            if self.lex_number_literal(state) {
58                continue;
59            }
60
61            if self.lex_identifier_or_keyword(state) {
62                continue;
63            }
64
65            if self.lex_operators(state) {
66                continue;
67            }
68
69            if self.lex_single_char_tokens(state) {
70                continue;
71            }
72
73            state.advance_if_dead_lock(safe_point)
74        }
75
76        Ok(())
77    }
78
79    /// Skips whitespace characters
80    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
81        let start_pos = state.get_position();
82
83        while let Some(ch) = state.peek() {
84            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
85        }
86
87        if state.get_position() > start_pos {
88            state.add_token(RubyTokenType::Whitespace, start_pos, state.get_position());
89            true
90        }
91        else {
92            false
93        }
94    }
95
96    /// Handles newlines
97    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
98        let start_pos = state.get_position();
99
100        if let Some('\n') = state.peek() {
101            state.advance(1);
102            state.add_token(RubyTokenType::Newline, start_pos, state.get_position());
103            true
104        }
105        else if let Some('\r') = state.peek() {
106            state.advance(1);
107            if let Some('\n') = state.peek() {
108                state.advance(1)
109            }
110            state.add_token(RubyTokenType::Newline, start_pos, state.get_position());
111            true
112        }
113        else {
114            false
115        }
116    }
117
118    /// Handles comments
119    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
120        if let Some('#') = state.peek() {
121            let start_pos = state.get_position();
122            state.advance(1); // Skip '#'
123
124            // Read to end of line
125            while let Some(ch) = state.peek() {
126                if ch == '\n' || ch == '\r' {
127                    break;
128                }
129                state.advance(ch.len_utf8())
130            }
131
132            state.add_token(RubyTokenType::Comment, start_pos, state.get_position());
133            true
134        }
135        else {
136            false
137        }
138    }
139
140    /// Handles string literals
141    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
142        let start_pos = state.get_position();
143
144        // Check if it's the start of a string
145        let quote_char = match state.peek() {
146            Some('"') => '"',
147            Some('\'') => '\'',
148            Some('`') => '`',
149            _ => return false,
150        };
151
152        state.advance(1); // Skip the starting quote
153        let mut escaped = false;
154        while let Some(ch) = state.peek() {
155            if escaped {
156                escaped = false;
157                state.advance(ch.len_utf8());
158                continue;
159            }
160
161            if ch == '\\' {
162                escaped = true;
163                state.advance(1);
164                continue;
165            }
166
167            if ch == quote_char {
168                state.advance(1); // Skip the ending quote
169                break;
170            }
171            else if ch == '\n' || ch == '\r' {
172                // Ruby strings can span multiple lines
173                state.advance(ch.len_utf8())
174            }
175            else {
176                state.advance(ch.len_utf8())
177            }
178        }
179
180        state.add_token(RubyTokenType::StringLiteral, start_pos, state.get_position());
181        true
182    }
183
184    /// Handles symbols
185    fn lex_symbol<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
186        if let Some(':') = state.peek() {
187            let start_pos = state.get_position();
188            state.advance(1); // Skip ':'
189
190            // Check if the next character is the start of an identifier
191            if let Some(ch) = state.peek() {
192                if ch.is_ascii_alphabetic() || ch == '_' {
193                    // Read identifier
194                    while let Some(ch) = state.peek() {
195                        if ch.is_ascii_alphanumeric() || ch == '_' || ch == '?' || ch == '!' { state.advance(1) } else { break }
196                    }
197                    state.add_token(RubyTokenType::Symbol, start_pos, state.get_position());
198                    return true;
199                }
200                else if ch == '"' || ch == '\'' {
201                    // Quoted symbol
202                    let quote = ch;
203                    state.advance(1);
204
205                    let mut escaped = false;
206                    while let Some(ch) = state.peek() {
207                        if escaped {
208                            escaped = false;
209                            state.advance(ch.len_utf8());
210                            continue;
211                        }
212
213                        if ch == '\\' {
214                            escaped = true;
215                            state.advance(1);
216                            continue;
217                        }
218
219                        if ch == quote {
220                            state.advance(1);
221                            break;
222                        }
223                        else {
224                            state.advance(ch.len_utf8())
225                        }
226                    }
227                    state.add_token(RubyTokenType::Symbol, start_pos, state.get_position());
228                    return true;
229                }
230            }
231        }
232        false
233    }
234
235    /// Handles number literals
236    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
237        let start_pos = state.get_position();
238
239        if !state.peek().map_or(false, |c| c.is_ascii_digit()) {
240            return false;
241        }
242
243        let mut is_float = false;
244
245        // Check for base prefix
246        if state.peek() == Some('0') {
247            let next_char = state.peek_next_n(1);
248            match next_char {
249                Some('b') | Some('B') => {
250                    state.advance(2); // Skip '0b' or '0B'
251                    // Read binary number
252                    while let Some(ch) = state.peek() {
253                        if ch == '0' || ch == '1' {
254                            state.advance(1);
255                        }
256                        else if ch == '_' {
257                            state.advance(1); // Digit separator
258                        }
259                        else {
260                            break;
261                        }
262                    }
263                }
264                Some('o') | Some('O') => {
265                    state.advance(2); // Skip '0o' or '0O'
266                    // Read octal number
267                    while let Some(ch) = state.peek() {
268                        if ch.is_ascii_digit() && ch < '8' {
269                            state.advance(1);
270                        }
271                        else if ch == '_' {
272                            state.advance(1); // Digit separator
273                        }
274                        else {
275                            break;
276                        }
277                    }
278                }
279                Some('x') | Some('X') => {
280                    state.advance(2); // Skip '0x' or '0X'
281                    // Read hexadecimal number
282                    while let Some(ch) = state.peek() {
283                        if ch.is_ascii_hexdigit() {
284                            state.advance(1);
285                        }
286                        else if ch == '_' {
287                            state.advance(1); // Digit separator
288                        }
289                        else {
290                            break;
291                        }
292                    }
293                }
294                _ => {
295                    // Decimal number
296                    self.lex_decimal_number(state, &mut is_float)
297                }
298            }
299        }
300        else {
301            // Decimal number
302            self.lex_decimal_number(state, &mut is_float)
303        }
304
305        let kind = if is_float { RubyTokenType::FloatLiteral } else { RubyTokenType::IntegerLiteral };
306
307        state.add_token(kind, start_pos, state.get_position());
308        true
309    }
310
311    /// Handles decimal numbers
312    fn lex_decimal_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, is_float: &mut bool) {
313        // Read integer part
314        while let Some(ch) = state.peek() {
315            if ch.is_ascii_digit() {
316                state.advance(1);
317            }
318            else if ch == '_' {
319                state.advance(1); // Digit separator
320            }
321            else {
322                break;
323            }
324        }
325
326        // Check for decimal point
327        if state.peek() == Some('.') && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
328            *is_float = true;
329            state.advance(1); // Skip the decimal point
330            // Read fractional part
331            while let Some(ch) = state.peek() {
332                if ch.is_ascii_digit() {
333                    state.advance(1);
334                }
335                else if ch == '_' {
336                    state.advance(1); // Digit separator
337                }
338                else {
339                    break;
340                }
341            }
342        }
343
344        // Check for scientific notation
345        if let Some('e') | Some('E') = state.peek() {
346            *is_float = true;
347            state.advance(1);
348
349            // Optional sign
350            if let Some('+') | Some('-') = state.peek() {
351                state.advance(1);
352            }
353
354            // Exponent part
355            while let Some(ch) = state.peek() {
356                if ch.is_ascii_digit() {
357                    state.advance(1);
358                }
359                else if ch == '_' {
360                    state.advance(1); // Digit separator
361                }
362                else {
363                    break;
364                }
365            }
366        }
367    }
368
369    /// Handles identifiers or keywords
370    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
371        let start_pos = state.get_position();
372
373        // Check the first character
374        if !state.peek().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
375            return false;
376        }
377
378        // Build identifier string
379        let mut buf = String::new();
380
381        // Read identifier
382        while let Some(ch) = state.peek() {
383            if ch.is_ascii_alphanumeric() || ch == '_' || ch == '?' || ch == '!' {
384                buf.push(ch);
385                state.advance(1);
386            }
387            else {
388                break;
389            }
390        }
391
392        // Check if it's a keyword
393        let kind = match buf.as_str() {
394            "if" => RubyTokenType::If,
395            "unless" => RubyTokenType::Unless,
396            "elsif" => RubyTokenType::Elsif,
397            "else" => RubyTokenType::Else,
398            "case" => RubyTokenType::Case,
399            "when" => RubyTokenType::When,
400            "then" => RubyTokenType::Then,
401            "for" => RubyTokenType::For,
402            "while" => RubyTokenType::While,
403            "until" => RubyTokenType::Until,
404            "break" => RubyTokenType::Break,
405            "next" => RubyTokenType::Next,
406            "redo" => RubyTokenType::Redo,
407            "retry" => RubyTokenType::Retry,
408            "return" => RubyTokenType::Return,
409            "yield" => RubyTokenType::Yield,
410            "def" => RubyTokenType::Def,
411            "class" => RubyTokenType::Class,
412            "module" => RubyTokenType::Module,
413            "end" => RubyTokenType::End,
414            "lambda" => RubyTokenType::Lambda,
415            "proc" => RubyTokenType::Proc,
416            "begin" => RubyTokenType::Begin,
417            "rescue" => RubyTokenType::Rescue,
418            "ensure" => RubyTokenType::Ensure,
419            "raise" => RubyTokenType::Raise,
420            "require" => RubyTokenType::Require,
421            "load" => RubyTokenType::Load,
422            "include" => RubyTokenType::Include,
423            "extend" => RubyTokenType::Extend,
424            "prepend" => RubyTokenType::Prepend,
425            "and" => RubyTokenType::And,
426            "or" => RubyTokenType::Or,
427            "not" => RubyTokenType::Not,
428            "in" => RubyTokenType::In,
429            "true" => RubyTokenType::True,
430            "false" => RubyTokenType::False,
431            "nil" => RubyTokenType::Nil,
432            "super" => RubyTokenType::Super,
433            "self" => RubyTokenType::Self_,
434            "alias" => RubyTokenType::Alias,
435            "undef" => RubyTokenType::Undef,
436            "defined?" => RubyTokenType::Defined,
437            "do" => RubyTokenType::Do,
438            _ => RubyTokenType::Identifier,
439        };
440
441        state.add_token(kind, start_pos, state.get_position());
442        true
443    }
444
445    /// Handles operators
446    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
447        let start_pos = state.get_position();
448
449        // Try to match multi-character operators
450        let three_char_ops = ["<=>", "===", "**=", "<<=", ">>=", "||=", "&&=", "..."];
451        for op in &three_char_ops {
452            if state.peek() == op.chars().nth(0) && state.peek_next_n(1) == op.chars().nth(1) && state.peek_next_n(2) == op.chars().nth(2) {
453                state.advance(3);
454                let kind = match *op {
455                    "<=>" => RubyTokenType::Spaceship,
456                    "===" => RubyTokenType::EqualEqualEqual,
457                    "**=" => RubyTokenType::PowerAssign,
458                    "<<=" => RubyTokenType::LeftShiftAssign,
459                    ">>=" => RubyTokenType::RightShiftAssign,
460                    "||=" => RubyTokenType::OrOrAssign,
461                    "&&=" => RubyTokenType::AndAndAssign,
462                    "..." => RubyTokenType::DotDotDot,
463                    _ => RubyTokenType::Invalid,
464                };
465                state.add_token(kind, start_pos, state.get_position());
466                return true;
467            }
468        }
469
470        let two_char_ops = ["**", "<<", ">>", "<=", ">=", "==", "!=", "=~", "!~", "&&", "||", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "..", "=>"];
471        for op in &two_char_ops {
472            if state.peek() == op.chars().nth(0) && state.peek_next_n(1) == op.chars().nth(1) {
473                state.advance(2);
474                let kind = match *op {
475                    "**" => RubyTokenType::Power,
476                    "<<" => RubyTokenType::LeftShift,
477                    ">>" => RubyTokenType::RightShift,
478                    "<=" => RubyTokenType::LessEqual,
479                    ">=" => RubyTokenType::GreaterEqual,
480                    "==" => RubyTokenType::EqualEqual,
481                    "!=" => RubyTokenType::NotEqual,
482                    "=~" => RubyTokenType::Match,
483                    "!~" => RubyTokenType::NotMatch,
484                    "&&" => RubyTokenType::AndAnd,
485                    "||" => RubyTokenType::OrOr,
486                    "+=" => RubyTokenType::PlusAssign,
487                    "-=" => RubyTokenType::MinusAssign,
488                    "*=" => RubyTokenType::MultiplyAssign,
489                    "/=" => RubyTokenType::DivideAssign,
490                    "%=" => RubyTokenType::ModuloAssign,
491                    "&=" => RubyTokenType::AndAssign,
492                    "|=" => RubyTokenType::OrAssign,
493                    "^=" => RubyTokenType::XorAssign,
494                    ".." => RubyTokenType::DotDot,
495                    "=>" => RubyTokenType::EqualGreater,
496                    _ => RubyTokenType::Invalid,
497                };
498                state.add_token(kind, start_pos, state.get_position());
499                return true;
500            }
501        }
502
503        // Try to match single-character operators
504        let single_char_ops = ['+', '-', '*', '/', '%', '=', '<', '>', '&', '|', '^', '!', '~', '?'];
505
506        if let Some(ch) = state.peek() {
507            if single_char_ops.contains(&ch) {
508                state.advance(1);
509                let kind = match ch {
510                    '+' => RubyTokenType::Plus,
511                    '-' => RubyTokenType::Minus,
512                    '*' => RubyTokenType::Multiply,
513                    '/' => RubyTokenType::Divide,
514                    '%' => RubyTokenType::Modulo,
515                    '=' => RubyTokenType::Assign,
516                    '<' => RubyTokenType::Less,
517                    '>' => RubyTokenType::Greater,
518                    '&' => RubyTokenType::BitAnd,
519                    '|' => RubyTokenType::BitOr,
520                    '^' => RubyTokenType::Xor,
521                    '!' => RubyTokenType::LogicalNot,
522                    '~' => RubyTokenType::Tilde,
523                    '?' => RubyTokenType::Question,
524                    _ => RubyTokenType::Invalid,
525                };
526                state.add_token(kind, start_pos, state.get_position());
527                return true;
528            }
529        }
530
531        false
532    }
533
534    /// Handles delimiters
535    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
536        let start_pos = state.get_position();
537
538        // Check for double colon
539        if state.peek() == Some(':') && state.peek_next_n(1) == Some(':') {
540            state.advance(2);
541            state.add_token(RubyTokenType::DoubleColon, start_pos, state.get_position());
542            return true;
543        }
544
545        // Single-character delimiters
546        let delimiters = ['(', ')', '[', ']', '{', '}', ',', ';', '.', ':', '@', '$'];
547
548        if let Some(ch) = state.peek() {
549            if delimiters.contains(&ch) {
550                state.advance(1);
551                let kind = match ch {
552                    '(' => RubyTokenType::LeftParen,
553                    ')' => RubyTokenType::RightParen,
554                    '[' => RubyTokenType::LeftBracket,
555                    ']' => RubyTokenType::RightBracket,
556                    '{' => RubyTokenType::LeftBrace,
557                    '}' => RubyTokenType::RightBrace,
558                    ',' => RubyTokenType::Comma,
559                    ';' => RubyTokenType::Semicolon,
560                    '.' => RubyTokenType::Dot,
561                    ':' => RubyTokenType::Colon,
562                    '@' => RubyTokenType::At,
563                    '$' => RubyTokenType::Dollar,
564                    _ => RubyTokenType::Invalid,
565                };
566                state.add_token(kind, start_pos, state.get_position());
567                return true;
568            }
569        }
570
571        // If no known characters are matched, mark as Invalid and advance the position
572        if let Some(_ch) = state.peek() {
573            state.advance(1);
574            state.add_token(RubyTokenType::Invalid, start_pos, state.get_position());
575            return true;
576        }
577
578        false
579    }
580}