Skip to main content

oak_ruby/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::RubyLanguage, lexer::token_type::RubyTokenType};
5use oak_core::{LexOutput, Lexer, LexerCache, LexerState, OakError, Source, TextEdit};
6
7type State<'a, S> = LexerState<'a, S, RubyLanguage>;
8
9#[derive(Clone, Debug)]
10pub struct RubyLexer<'config> {
11    _config: &'config RubyLanguage,
12}
13
14impl<'config> Lexer<RubyLanguage> for RubyLexer<'config> {
15    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<RubyLanguage>) -> LexOutput<RubyLanguage> {
16        let mut state: State<'_, S> = LexerState::new(source);
17        let result = self.run(&mut state);
18        if result.is_ok() {
19            state.add_eof()
20        }
21        state.finish_with_cache(result, cache)
22    }
23}
24
25impl<'config> RubyLexer<'config> {
26    pub fn new(config: &'config RubyLanguage) -> Self {
27        Self { _config: config }
28    }
29
30    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
31        while state.not_at_end() {
32            let safe_point = state.get_position();
33
34            if self.skip_whitespace(state) {
35                continue;
36            }
37
38            if self.lex_newline(state) {
39                continue;
40            }
41
42            if self.skip_comment(state) {
43                continue;
44            }
45
46            if self.lex_string_literal(state) {
47                continue;
48            }
49
50            if self.lex_symbol(state) {
51                continue;
52            }
53
54            if self.lex_number_literal(state) {
55                continue;
56            }
57
58            if self.lex_identifier_or_keyword(state) {
59                continue;
60            }
61
62            if self.lex_operators(state) {
63                continue;
64            }
65
66            if self.lex_single_char_tokens(state) {
67                continue;
68            }
69
70            state.advance_if_dead_lock(safe_point)
71        }
72
73        Ok(())
74    }
75
76    /// 跳过空白字符
77    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
78        let start_pos = state.get_position();
79
80        while let Some(ch) = state.peek() {
81            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
82        }
83
84        if state.get_position() > start_pos {
85            state.add_token(RubyTokenType::Whitespace, start_pos, state.get_position());
86            true
87        }
88        else {
89            false
90        }
91    }
92
93    /// 处理换行
94    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
95        let start_pos = state.get_position();
96
97        if let Some('\n') = state.peek() {
98            state.advance(1);
99            state.add_token(RubyTokenType::Newline, start_pos, state.get_position());
100            true
101        }
102        else if let Some('\r') = state.peek() {
103            state.advance(1);
104            if let Some('\n') = state.peek() {
105                state.advance(1)
106            }
107            state.add_token(RubyTokenType::Newline, start_pos, state.get_position());
108            true
109        }
110        else {
111            false
112        }
113    }
114
115    /// 处理注释
116    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
117        if let Some('#') = state.peek() {
118            let start_pos = state.get_position();
119            state.advance(1); // 跳过 '#'
120
121            // 读取到行
122            while let Some(ch) = state.peek() {
123                if ch == '\n' || ch == '\r' {
124                    break;
125                }
126                state.advance(ch.len_utf8())
127            }
128
129            state.add_token(RubyTokenType::Comment, start_pos, state.get_position());
130            true
131        }
132        else {
133            false
134        }
135    }
136
137    /// 处理字符串字面量
138    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
139        let start_pos = state.get_position();
140
141        // 检查是否是字符串开
142        let quote_char = match state.peek() {
143            Some('"') => '"',
144            Some('\'') => '\'',
145            Some('`') => '`',
146            _ => return false,
147        };
148
149        state.advance(1); // 跳过开始引号
150        let mut escaped = false;
151        while let Some(ch) = state.peek() {
152            if escaped {
153                escaped = false;
154                state.advance(ch.len_utf8());
155                continue;
156            }
157
158            if ch == '\\' {
159                escaped = true;
160                state.advance(1);
161                continue;
162            }
163
164            if ch == quote_char {
165                state.advance(1); // 跳过结束引号
166                break;
167            }
168            else if ch == '\n' || ch == '\r' {
169                // Ruby 字符串可以跨多行
170                state.advance(ch.len_utf8())
171            }
172            else {
173                state.advance(ch.len_utf8())
174            }
175        }
176
177        state.add_token(RubyTokenType::StringLiteral, start_pos, state.get_position());
178        true
179    }
180
181    /// 处理符号
182    fn lex_symbol<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
183        if let Some(':') = state.peek() {
184            let start_pos = state.get_position();
185            state.advance(1); // 跳过 ':'
186
187            // 检查下一个字符是否是标识符开
188            if let Some(ch) = state.peek() {
189                if ch.is_ascii_alphabetic() || ch == '_' {
190                    // 读取标识
191                    while let Some(ch) = state.peek() {
192                        if ch.is_ascii_alphanumeric() || ch == '_' || ch == '?' || ch == '!' { state.advance(1) } else { break }
193                    }
194                    state.add_token(RubyTokenType::Symbol, start_pos, state.get_position());
195                    return true;
196                }
197                else if ch == '"' || ch == '\'' {
198                    // 引号符号
199                    let quote = ch;
200                    state.advance(1);
201
202                    let mut escaped = false;
203                    while let Some(ch) = state.peek() {
204                        if escaped {
205                            escaped = false;
206                            state.advance(ch.len_utf8());
207                            continue;
208                        }
209
210                        if ch == '\\' {
211                            escaped = true;
212                            state.advance(1);
213                            continue;
214                        }
215
216                        if ch == quote {
217                            state.advance(1);
218                            break;
219                        }
220                        else {
221                            state.advance(ch.len_utf8())
222                        }
223                    }
224                    state.add_token(RubyTokenType::Symbol, start_pos, state.get_position());
225                    return true;
226                }
227            }
228        }
229        false
230    }
231
232    /// 处理数字字面
233    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
234        let start_pos = state.get_position();
235
236        if !state.peek().map_or(false, |c| c.is_ascii_digit()) {
237            return false;
238        }
239
240        let mut is_float = false;
241
242        // 检查进制前缀
243        if state.peek() == Some('0') {
244            let next_char = state.peek_next_n(1);
245            match next_char {
246                Some('b') | Some('B') => {
247                    state.advance(2); // 跳过 '0b' '0B'
248                    // 读取二进制数
249                    while let Some(ch) = state.peek() {
250                        if ch == '0' || ch == '1' {
251                            state.advance(1);
252                        }
253                        else if ch == '_' {
254                            state.advance(1); // 数字分隔
255                        }
256                        else {
257                            break;
258                        }
259                    }
260                }
261                Some('o') | Some('O') => {
262                    state.advance(2); // 跳过 '0o' '0O'
263                    // 读取八进制数
264                    while let Some(ch) = state.peek() {
265                        if ch.is_ascii_digit() && ch < '8' {
266                            state.advance(1);
267                        }
268                        else if ch == '_' {
269                            state.advance(1); // 数字分隔
270                        }
271                        else {
272                            break;
273                        }
274                    }
275                }
276                Some('x') | Some('X') => {
277                    state.advance(2); // 跳过 '0x' '0X'
278                    // 读取十六进制数字
279                    while let Some(ch) = state.peek() {
280                        if ch.is_ascii_hexdigit() {
281                            state.advance(1);
282                        }
283                        else if ch == '_' {
284                            state.advance(1); // 数字分隔
285                        }
286                        else {
287                            break;
288                        }
289                    }
290                }
291                _ => {
292                    // 十进制数
293                    self.lex_decimal_number(state, &mut is_float)
294                }
295            }
296        }
297        else {
298            // 十进制数
299            self.lex_decimal_number(state, &mut is_float)
300        }
301
302        let kind = if is_float { RubyTokenType::FloatLiteral } else { RubyTokenType::IntegerLiteral };
303
304        state.add_token(kind, start_pos, state.get_position());
305        true
306    }
307
308    /// 处理十进制数
309    fn lex_decimal_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, is_float: &mut bool) {
310        // 读取整数部分
311        while let Some(ch) = state.peek() {
312            if ch.is_ascii_digit() {
313                state.advance(1);
314            }
315            else if ch == '_' {
316                state.advance(1); // 数字分隔
317            }
318            else {
319                break;
320            }
321        }
322
323        // 检查小数点
324        if state.peek() == Some('.') && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
325            *is_float = true;
326            state.advance(1); // 跳过小数
327            // 读取小数部分
328            while let Some(ch) = state.peek() {
329                if ch.is_ascii_digit() {
330                    state.advance(1);
331                }
332                else if ch == '_' {
333                    state.advance(1); // 数字分隔
334                }
335                else {
336                    break;
337                }
338            }
339        }
340
341        // 检查科学计数法
342        if let Some('e') | Some('E') = state.peek() {
343            *is_float = true;
344            state.advance(1);
345
346            // 可选的符号
347            if let Some('+') | Some('-') = state.peek() {
348                state.advance(1);
349            }
350
351            // 指数部分
352            while let Some(ch) = state.peek() {
353                if ch.is_ascii_digit() {
354                    state.advance(1);
355                }
356                else if ch == '_' {
357                    state.advance(1); // 数字分隔
358                }
359                else {
360                    break;
361                }
362            }
363        }
364    }
365
366    /// 处理标识符或关键
367    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
368        let start_pos = state.get_position();
369
370        // 检查第一个字符
371        if !state.peek().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
372            return false;
373        }
374
375        // 构建标识符字符串
376        let mut buf = String::new();
377
378        // 读取标识符
379        while let Some(ch) = state.peek() {
380            if ch.is_ascii_alphanumeric() || ch == '_' || ch == '?' || ch == '!' {
381                buf.push(ch);
382                state.advance(1);
383            }
384            else {
385                break;
386            }
387        }
388
389        // 检查是否是关键字
390        let kind = match buf.as_str() {
391            "if" => RubyTokenType::If,
392            "unless" => RubyTokenType::Unless,
393            "elsif" => RubyTokenType::Elsif,
394            "else" => RubyTokenType::Else,
395            "case" => RubyTokenType::Case,
396            "when" => RubyTokenType::When,
397            "then" => RubyTokenType::Then,
398            "for" => RubyTokenType::For,
399            "while" => RubyTokenType::While,
400            "until" => RubyTokenType::Until,
401            "break" => RubyTokenType::Break,
402            "next" => RubyTokenType::Next,
403            "redo" => RubyTokenType::Redo,
404            "retry" => RubyTokenType::Retry,
405            "return" => RubyTokenType::Return,
406            "yield" => RubyTokenType::Yield,
407            "def" => RubyTokenType::Def,
408            "class" => RubyTokenType::Class,
409            "module" => RubyTokenType::Module,
410            "end" => RubyTokenType::End,
411            "lambda" => RubyTokenType::Lambda,
412            "proc" => RubyTokenType::Proc,
413            "begin" => RubyTokenType::Begin,
414            "rescue" => RubyTokenType::Rescue,
415            "ensure" => RubyTokenType::Ensure,
416            "raise" => RubyTokenType::Raise,
417            "require" => RubyTokenType::Require,
418            "load" => RubyTokenType::Load,
419            "include" => RubyTokenType::Include,
420            "extend" => RubyTokenType::Extend,
421            "prepend" => RubyTokenType::Prepend,
422            "and" => RubyTokenType::And,
423            "or" => RubyTokenType::Or,
424            "not" => RubyTokenType::Not,
425            "in" => RubyTokenType::In,
426            "true" => RubyTokenType::True,
427            "false" => RubyTokenType::False,
428            "nil" => RubyTokenType::Nil,
429            "super" => RubyTokenType::Super,
430            "self" => RubyTokenType::Self_,
431            "alias" => RubyTokenType::Alias,
432            "undef" => RubyTokenType::Undef,
433            "defined?" => RubyTokenType::Defined,
434            "do" => RubyTokenType::Do,
435            _ => RubyTokenType::Identifier,
436        };
437
438        state.add_token(kind, start_pos, state.get_position());
439        true
440    }
441
442    /// 处理操作
443    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
444        let start_pos = state.get_position();
445
446        // 尝试匹配多字符操作符
447        let three_char_ops = ["<=>", "===", "**=", "<<=", ">>=", "||=", "&&=", "..."];
448        for op in &three_char_ops {
449            if state.peek() == op.chars().nth(0) && state.peek_next_n(1) == op.chars().nth(1) && state.peek_next_n(2) == op.chars().nth(2) {
450                state.advance(3);
451                let kind = match *op {
452                    "<=>" => RubyTokenType::Spaceship,
453                    "===" => RubyTokenType::EqualEqualEqual,
454                    "**=" => RubyTokenType::PowerAssign,
455                    "<<=" => RubyTokenType::LeftShiftAssign,
456                    ">>=" => RubyTokenType::RightShiftAssign,
457                    "||=" => RubyTokenType::OrOrAssign,
458                    "&&=" => RubyTokenType::AndAndAssign,
459                    "..." => RubyTokenType::DotDotDot,
460                    _ => RubyTokenType::Invalid,
461                };
462                state.add_token(kind, start_pos, state.get_position());
463                return true;
464            }
465        }
466
467        let two_char_ops = ["**", "<<", ">>", "<=", ">=", "==", "!=", "=~", "!~", "&&", "||", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", ".."];
468        for op in &two_char_ops {
469            if state.peek() == op.chars().nth(0) && state.peek_next_n(1) == op.chars().nth(1) {
470                state.advance(2);
471                let kind = match *op {
472                    "**" => RubyTokenType::Power,
473                    "<<" => RubyTokenType::LeftShift,
474                    ">>" => RubyTokenType::RightShift,
475                    "<=" => RubyTokenType::LessEqual,
476                    ">=" => RubyTokenType::GreaterEqual,
477                    "==" => RubyTokenType::EqualEqual,
478                    "!=" => RubyTokenType::NotEqual,
479                    "=~" => RubyTokenType::Match,
480                    "!~" => RubyTokenType::NotMatch,
481                    "&&" => RubyTokenType::AndAnd,
482                    "||" => RubyTokenType::OrOr,
483                    "+=" => RubyTokenType::PlusAssign,
484                    "-=" => RubyTokenType::MinusAssign,
485                    "*=" => RubyTokenType::MultiplyAssign,
486                    "/=" => RubyTokenType::DivideAssign,
487                    "%=" => RubyTokenType::ModuloAssign,
488                    "&=" => RubyTokenType::AndAssign,
489                    "|=" => RubyTokenType::OrAssign,
490                    "^=" => RubyTokenType::XorAssign,
491                    ".." => RubyTokenType::DotDot,
492                    _ => RubyTokenType::Invalid,
493                };
494                state.add_token(kind, start_pos, state.get_position());
495                return true;
496            }
497        }
498
499        // 尝试匹配单字符操作符
500        let single_char_ops = ['+', '-', '*', '/', '%', '=', '<', '>', '&', '|', '^', '!', '~', '?'];
501
502        if let Some(ch) = state.peek() {
503            if single_char_ops.contains(&ch) {
504                state.advance(1);
505                let kind = match ch {
506                    '+' => RubyTokenType::Plus,
507                    '-' => RubyTokenType::Minus,
508                    '*' => RubyTokenType::Multiply,
509                    '/' => RubyTokenType::Divide,
510                    '%' => RubyTokenType::Modulo,
511                    '=' => RubyTokenType::Assign,
512                    '<' => RubyTokenType::Less,
513                    '>' => RubyTokenType::Greater,
514                    '&' => RubyTokenType::BitAnd,
515                    '|' => RubyTokenType::BitOr,
516                    '^' => RubyTokenType::Xor,
517                    '!' => RubyTokenType::LogicalNot,
518                    '~' => RubyTokenType::Tilde,
519                    '?' => RubyTokenType::Question,
520                    _ => RubyTokenType::Invalid,
521                };
522                state.add_token(kind, start_pos, state.get_position());
523                return true;
524            }
525        }
526
527        false
528    }
529
530    /// 处理分隔符
531    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
532        let start_pos = state.get_position();
533
534        // 检查双冒号
535        if state.peek() == Some(':') && state.peek_next_n(1) == Some(':') {
536            state.advance(2);
537            state.add_token(RubyTokenType::DoubleColon, start_pos, state.get_position());
538            return true;
539        }
540
541        // 单字符分隔符
542        let delimiters = ['(', ')', '[', ']', '{', '}', ',', ';', '.', ':', '@', '$'];
543
544        if let Some(ch) = state.peek() {
545            if delimiters.contains(&ch) {
546                state.advance(1);
547                let kind = match ch {
548                    '(' => RubyTokenType::LeftParen,
549                    ')' => RubyTokenType::RightParen,
550                    '[' => RubyTokenType::LeftBracket,
551                    ']' => RubyTokenType::RightBracket,
552                    '{' => RubyTokenType::LeftBrace,
553                    '}' => RubyTokenType::RightBrace,
554                    ',' => RubyTokenType::Comma,
555                    ';' => RubyTokenType::Semicolon,
556                    '.' => RubyTokenType::Dot,
557                    ':' => RubyTokenType::Colon,
558                    '@' => RubyTokenType::At,
559                    '$' => RubyTokenType::Dollar,
560                    _ => RubyTokenType::Invalid,
561                };
562                state.add_token(kind, start_pos, state.get_position());
563                return true;
564            }
565        }
566
567        // 如果没有匹配任何已知字符,将其标记为 Invalid 并推进位置
568        if let Some(_ch) = state.peek() {
569            state.advance(1);
570            state.add_token(RubyTokenType::Invalid, start_pos, state.get_position());
571            return true;
572        }
573
574        false
575    }
576}