oak_ruby/lexer/
mod.rs

1use crate::{kind::RubySyntaxKind, language::RubyLanguage};
2use oak_core::{LexOutput, Lexer, LexerCache, LexerState, OakError, Source, TextEdit};
3
/// Shorthand for the shared lexer state specialised to [`RubyLanguage`].
type State<'a, S> = LexerState<'a, S, RubyLanguage>;
5
/// Lexer that splits Ruby source text into tokens.
///
/// The type has no fields, so constructing, cloning, or defaulting it is free.
#[derive(Clone, Debug, Default)]
pub struct RubyLexer {}
8
9impl Lexer<RubyLanguage> for RubyLexer {
10    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<RubyLanguage>) -> LexOutput<RubyLanguage> {
11        let mut state: State<'_, S> = LexerState::new(source);
12        let result = self.run(&mut state);
13        if result.is_ok() {
14            state.add_eof();
15        }
16        state.finish_with_cache(result, cache)
17    }
18}
19
20impl RubyLexer {
21    pub fn new(_config: &RubyLanguage) -> Self {
22        Self {}
23    }
24
25    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
26        while state.not_at_end() {
27            let safe_point = state.get_position();
28
29            if self.skip_whitespace(state) {
30                continue;
31            }
32
33            if self.lex_newline(state) {
34                continue;
35            }
36
37            if self.skip_comment(state) {
38                continue;
39            }
40
41            if self.lex_string_literal(state) {
42                continue;
43            }
44
45            if self.lex_symbol(state) {
46                continue;
47            }
48
49            if self.lex_number_literal(state) {
50                continue;
51            }
52
53            if self.lex_identifier_or_keyword(state) {
54                continue;
55            }
56
57            if self.lex_operators(state) {
58                continue;
59            }
60
61            if self.lex_single_char_tokens(state) {
62                continue;
63            }
64
65            state.advance_if_dead_lock(safe_point);
66        }
67
68        Ok(())
69    }
70
71    /// 跳过空白字符
72    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
73        let start_pos = state.get_position();
74
75        while let Some(ch) = state.peek() {
76            if ch == ' ' || ch == '\t' {
77                state.advance(ch.len_utf8());
78            }
79            else {
80                break;
81            }
82        }
83
84        if state.get_position() > start_pos {
85            state.add_token(RubySyntaxKind::Whitespace, start_pos, state.get_position());
86            true
87        }
88        else {
89            false
90        }
91    }
92
93    /// 处理换行
94    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
95        let start_pos = state.get_position();
96
97        if let Some('\n') = state.peek() {
98            state.advance(1);
99            state.add_token(RubySyntaxKind::Newline, start_pos, state.get_position());
100            true
101        }
102        else if let Some('\r') = state.peek() {
103            state.advance(1);
104            if let Some('\n') = state.peek() {
105                state.advance(1);
106            }
107            state.add_token(RubySyntaxKind::Newline, start_pos, state.get_position());
108            true
109        }
110        else {
111            false
112        }
113    }
114
115    /// 处理注释
116    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
117        if let Some('#') = state.peek() {
118            let start_pos = state.get_position();
119            state.advance(1); // 跳过 '#'
120
121            // 读取到行
122            while let Some(ch) = state.peek() {
123                if ch == '\n' || ch == '\r' {
124                    break;
125                }
126                state.advance(ch.len_utf8());
127            }
128
129            state.add_token(RubySyntaxKind::Comment, start_pos, state.get_position());
130            true
131        }
132        else {
133            false
134        }
135    }
136
137    /// 处理字符串字面量
138    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
139        let start_pos = state.get_position();
140
141        // 检查是否是字符串开
142        let quote_char = match state.peek() {
143            Some('"') => '"',
144            Some('\'') => '\'',
145            Some('`') => '`',
146            _ => return false,
147        };
148
149        state.advance(1); // 跳过开始引
150        let mut escaped = false;
151        while let Some(ch) = state.peek() {
152            if escaped {
153                escaped = false;
154                state.advance(ch.len_utf8());
155                continue;
156            }
157
158            if ch == '\\' {
159                escaped = true;
160                state.advance(1);
161                continue;
162            }
163
164            if ch == quote_char {
165                state.advance(1); // 跳过结束引号
166                break;
167            }
168            else if ch == '\n' || ch == '\r' {
169                // Ruby 字符串可以跨多行
170                state.advance(ch.len_utf8());
171            }
172            else {
173                state.advance(ch.len_utf8());
174            }
175        }
176
177        state.add_token(RubySyntaxKind::StringLiteral, start_pos, state.get_position());
178        true
179    }
180
181    /// 处理符号
182    fn lex_symbol<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
183        if let Some(':') = state.peek() {
184            let start_pos = state.get_position();
185            state.advance(1); // 跳过 ':'
186
187            // 检查下一个字符是否是标识符开
188            if let Some(ch) = state.peek() {
189                if ch.is_ascii_alphabetic() || ch == '_' {
190                    // 读取标识
191                    while let Some(ch) = state.peek() {
192                        if ch.is_ascii_alphanumeric() || ch == '_' || ch == '?' || ch == '!' {
193                            state.advance(1);
194                        }
195                        else {
196                            break;
197                        }
198                    }
199                    state.add_token(RubySyntaxKind::Symbol, start_pos, state.get_position());
200                    return true;
201                }
202                else if ch == '"' || ch == '\'' {
203                    // 引号符号
204                    let quote = ch;
205                    state.advance(1);
206
207                    let mut escaped = false;
208                    while let Some(ch) = state.peek() {
209                        if escaped {
210                            escaped = false;
211                            state.advance(ch.len_utf8());
212                            continue;
213                        }
214
215                        if ch == '\\' {
216                            escaped = true;
217                            state.advance(1);
218                            continue;
219                        }
220
221                        if ch == quote {
222                            state.advance(1);
223                            break;
224                        }
225                        else {
226                            state.advance(ch.len_utf8());
227                        }
228                    }
229                    state.add_token(RubySyntaxKind::Symbol, start_pos, state.get_position());
230                    return true;
231                }
232            }
233        }
234        false
235    }
236
237    /// 处理数字字面
238    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
239        let start_pos = state.get_position();
240
241        if !state.peek().map_or(false, |c| c.is_ascii_digit()) {
242            return false;
243        }
244
245        let mut is_float = false;
246
247        // 检查进制前缀
248        if state.peek() == Some('0') {
249            let next_char = state.peek_next_n(1);
250            match next_char {
251                Some('b') | Some('B') => {
252                    state.advance(2); // 跳过 '0b' '0B'
253                    // 读取二进制数
254                    while let Some(ch) = state.peek() {
255                        if ch == '0' || ch == '1' {
256                            state.advance(1);
257                        }
258                        else if ch == '_' {
259                            state.advance(1); // 数字分隔
260                        }
261                        else {
262                            break;
263                        }
264                    }
265                }
266                Some('o') | Some('O') => {
267                    state.advance(2); // 跳过 '0o' '0O'
268                    // 读取八进制数
269                    while let Some(ch) = state.peek() {
270                        if ch.is_ascii_digit() && ch < '8' {
271                            state.advance(1);
272                        }
273                        else if ch == '_' {
274                            state.advance(1); // 数字分隔
275                        }
276                        else {
277                            break;
278                        }
279                    }
280                }
281                Some('x') | Some('X') => {
282                    state.advance(2); // 跳过 '0x' '0X'
283                    // 读取十六进制数字
284                    while let Some(ch) = state.peek() {
285                        if ch.is_ascii_hexdigit() {
286                            state.advance(1);
287                        }
288                        else if ch == '_' {
289                            state.advance(1); // 数字分隔
290                        }
291                        else {
292                            break;
293                        }
294                    }
295                }
296                _ => {
297                    // 十进制数
298                    self.lex_decimal_number(state, &mut is_float);
299                }
300            }
301        }
302        else {
303            // 十进制数
304            self.lex_decimal_number(state, &mut is_float);
305        }
306
307        let kind = if is_float { RubySyntaxKind::FloatLiteral } else { RubySyntaxKind::IntegerLiteral };
308
309        state.add_token(kind, start_pos, state.get_position());
310        true
311    }
312
313    /// 处理十进制数
314    fn lex_decimal_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, is_float: &mut bool) {
315        // 读取整数部分
316        while let Some(ch) = state.peek() {
317            if ch.is_ascii_digit() {
318                state.advance(1);
319            }
320            else if ch == '_' {
321                state.advance(1); // 数字分隔            } else {
322                break;
323            }
324        }
325
326        // 检查小数点
327        if state.peek() == Some('.') && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
328            *is_float = true;
329            state.advance(1); // 跳过小数
330            // 读取小数部分
331            while let Some(ch) = state.peek() {
332                if ch.is_ascii_digit() {
333                    state.advance(1);
334                }
335                else if ch == '_' {
336                    state.advance(1); // 数字分隔
337                }
338                else {
339                    break;
340                }
341            }
342        }
343
344        // 检查科学计数法
345        if let Some('e') | Some('E') = state.peek() {
346            *is_float = true;
347            state.advance(1);
348
349            // 可选的符号
350            if let Some('+') | Some('-') = state.peek() {
351                state.advance(1);
352            }
353
354            // 指数部分
355            while let Some(ch) = state.peek() {
356                if ch.is_ascii_digit() {
357                    state.advance(1);
358                }
359                else if ch == '_' {
360                    state.advance(1); // 数字分隔                } else {
361                    break;
362                }
363            }
364        }
365    }
366
367    /// 处理标识符或关键
368    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
369        let start_pos = state.get_position();
370
371        // 检查第一个字
372        if !state.peek().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
373            return false;
374        }
375
376        // 构建标识符字符串
377        let mut buf = String::new();
378
379        // 读取标识
380        while let Some(ch) = state.peek() {
381            if ch.is_ascii_alphanumeric() || ch == '_' || ch == '?' || ch == '!' {
382                buf.push(ch);
383                state.advance(1);
384            }
385            else {
386                break;
387            }
388        }
389
390        // 检查是否是关键字
391        let kind = match buf.as_str() {
392            "if" => RubySyntaxKind::If,
393            "unless" => RubySyntaxKind::Unless,
394            "elsif" => RubySyntaxKind::Elsif,
395            "else" => RubySyntaxKind::Else,
396            "case" => RubySyntaxKind::Case,
397            "when" => RubySyntaxKind::When,
398            "then" => RubySyntaxKind::Then,
399            "for" => RubySyntaxKind::For,
400            "while" => RubySyntaxKind::While,
401            "until" => RubySyntaxKind::Until,
402            "break" => RubySyntaxKind::Break,
403            "next" => RubySyntaxKind::Next,
404            "redo" => RubySyntaxKind::Redo,
405            "retry" => RubySyntaxKind::Retry,
406            "return" => RubySyntaxKind::Return,
407            "yield" => RubySyntaxKind::Yield,
408            "def" => RubySyntaxKind::Def,
409            "class" => RubySyntaxKind::Class,
410            "module" => RubySyntaxKind::Module,
411            "end" => RubySyntaxKind::End,
412            "lambda" => RubySyntaxKind::Lambda,
413            "proc" => RubySyntaxKind::Proc,
414            "begin" => RubySyntaxKind::Begin,
415            "rescue" => RubySyntaxKind::Rescue,
416            "ensure" => RubySyntaxKind::Ensure,
417            "raise" => RubySyntaxKind::Raise,
418            "require" => RubySyntaxKind::Require,
419            "load" => RubySyntaxKind::Load,
420            "include" => RubySyntaxKind::Include,
421            "extend" => RubySyntaxKind::Extend,
422            "prepend" => RubySyntaxKind::Prepend,
423            "and" => RubySyntaxKind::And,
424            "or" => RubySyntaxKind::Or,
425            "not" => RubySyntaxKind::Not,
426            "in" => RubySyntaxKind::In,
427            "true" => RubySyntaxKind::True,
428            "false" => RubySyntaxKind::False,
429            "nil" => RubySyntaxKind::Nil,
430            "super" => RubySyntaxKind::Super,
431            "self" => RubySyntaxKind::Self_,
432            "alias" => RubySyntaxKind::Alias,
433            "undef" => RubySyntaxKind::Undef,
434            "defined?" => RubySyntaxKind::Defined,
435            "do" => RubySyntaxKind::Do,
436            _ => RubySyntaxKind::Identifier,
437        };
438
439        state.add_token(kind, start_pos, state.get_position());
440        true
441    }
442
443    /// 处理操作
444    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
445        let start_pos = state.get_position();
446
447        // 尝试匹配多字符操作符
448        let three_char_ops = ["<=>", "===", "**=", "<<=", ">>=", "||=", "&&=", "..."];
449        for op in &three_char_ops {
450            if state.peek() == op.chars().nth(0) && state.peek_next_n(1) == op.chars().nth(1) && state.peek_next_n(2) == op.chars().nth(2) {
451                state.advance(3);
452                let kind = match *op {
453                    "<=>" => RubySyntaxKind::Spaceship,
454                    "===" => RubySyntaxKind::EqualEqualEqual,
455                    "**=" => RubySyntaxKind::PowerAssign,
456                    "<<=" => RubySyntaxKind::LeftShiftAssign,
457                    ">>=" => RubySyntaxKind::RightShiftAssign,
458                    "||=" => RubySyntaxKind::OrOrAssign,
459                    "&&=" => RubySyntaxKind::AndAndAssign,
460                    "..." => RubySyntaxKind::DotDotDot,
461                    _ => RubySyntaxKind::Invalid,
462                };
463                state.add_token(kind, start_pos, state.get_position());
464                return true;
465            }
466        }
467
468        let two_char_ops = ["**", "<<", ">>", "<=", ">=", "==", "!=", "=~", "!~", "&&", "||", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", ".."];
469        for op in &two_char_ops {
470            if state.peek() == op.chars().nth(0) && state.peek_next_n(1) == op.chars().nth(1) {
471                state.advance(2);
472                let kind = match *op {
473                    "**" => RubySyntaxKind::Power,
474                    "<<" => RubySyntaxKind::LeftShift,
475                    ">>" => RubySyntaxKind::RightShift,
476                    "<=" => RubySyntaxKind::LessEqual,
477                    ">=" => RubySyntaxKind::GreaterEqual,
478                    "==" => RubySyntaxKind::EqualEqual,
479                    "!=" => RubySyntaxKind::NotEqual,
480                    "=~" => RubySyntaxKind::Match,
481                    "!~" => RubySyntaxKind::NotMatch,
482                    "&&" => RubySyntaxKind::AndAnd,
483                    "||" => RubySyntaxKind::OrOr,
484                    "+=" => RubySyntaxKind::PlusAssign,
485                    "-=" => RubySyntaxKind::MinusAssign,
486                    "*=" => RubySyntaxKind::MultiplyAssign,
487                    "/=" => RubySyntaxKind::DivideAssign,
488                    "%=" => RubySyntaxKind::ModuloAssign,
489                    "&=" => RubySyntaxKind::AndAssign,
490                    "|=" => RubySyntaxKind::OrAssign,
491                    "^=" => RubySyntaxKind::XorAssign,
492                    ".." => RubySyntaxKind::DotDot,
493                    _ => RubySyntaxKind::Invalid,
494                };
495                state.add_token(kind, start_pos, state.get_position());
496                return true;
497            }
498        }
499
500        // 尝试匹配单字符操作符
501        let single_char_ops = ['+', '-', '*', '/', '%', '=', '<', '>', '&', '|', '^', '!', '~', '?'];
502
503        if let Some(ch) = state.peek() {
504            if single_char_ops.contains(&ch) {
505                state.advance(1);
506                let kind = match ch {
507                    '+' => RubySyntaxKind::Plus,
508                    '-' => RubySyntaxKind::Minus,
509                    '*' => RubySyntaxKind::Multiply,
510                    '/' => RubySyntaxKind::Divide,
511                    '%' => RubySyntaxKind::Modulo,
512                    '=' => RubySyntaxKind::Assign,
513                    '<' => RubySyntaxKind::Less,
514                    '>' => RubySyntaxKind::Greater,
515                    '&' => RubySyntaxKind::BitAnd,
516                    '|' => RubySyntaxKind::BitOr,
517                    '^' => RubySyntaxKind::Xor,
518                    '!' => RubySyntaxKind::LogicalNot,
519                    '~' => RubySyntaxKind::Tilde,
520                    '?' => RubySyntaxKind::Question,
521                    _ => RubySyntaxKind::Invalid,
522                };
523                state.add_token(kind, start_pos, state.get_position());
524                return true;
525            }
526        }
527
528        false
529    }
530
531    /// 处理分隔
532    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
533        let start_pos = state.get_position();
534
535        // 检查双冒号
536        if state.peek() == Some(':') && state.peek_next_n(1) == Some(':') {
537            state.advance(2);
538            state.add_token(RubySyntaxKind::DoubleColon, start_pos, state.get_position());
539            return true;
540        }
541
542        // 单字符分隔符
543        let delimiters = ['(', ')', '[', ']', '{', '}', ',', ';', '.', ':', '@', '$'];
544
545        if let Some(ch) = state.peek() {
546            if delimiters.contains(&ch) {
547                state.advance(1);
548                let kind = match ch {
549                    '(' => RubySyntaxKind::LeftParen,
550                    ')' => RubySyntaxKind::RightParen,
551                    '[' => RubySyntaxKind::LeftBracket,
552                    ']' => RubySyntaxKind::RightBracket,
553                    '{' => RubySyntaxKind::LeftBrace,
554                    '}' => RubySyntaxKind::RightBrace,
555                    ',' => RubySyntaxKind::Comma,
556                    ';' => RubySyntaxKind::Semicolon,
557                    '.' => RubySyntaxKind::Dot,
558                    ':' => RubySyntaxKind::Colon,
559                    '@' => RubySyntaxKind::At,
560                    '$' => RubySyntaxKind::Dollar,
561                    _ => RubySyntaxKind::Invalid,
562                };
563                state.add_token(kind, start_pos, state.get_position());
564                return true;
565            }
566        }
567
568        // 如果没有匹配任何已知字符,将其标记为 Invalid 并推进位置
569        if let Some(_ch) = state.peek() {
570            state.advance(1);
571            state.add_token(RubySyntaxKind::Invalid, start_pos, state.get_position());
572            return true;
573        }
574
575        false
576    }
577}