1use crate::{kind::RubySyntaxKind, language::RubyLanguage};
2use oak_core::{LexOutput, Lexer, LexerCache, LexerState, OakError, Source, TextEdit};
3
4type State<'a, S> = LexerState<'a, S, RubyLanguage>;
5
/// Stateless lexer that splits Ruby source text into `RubySyntaxKind` tokens.
///
/// The type has no fields, so values are free to create and clone.
#[derive(Debug, Clone, Default)]
pub struct RubyLexer {}
8
9impl Lexer<RubyLanguage> for RubyLexer {
10 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<RubyLanguage>) -> LexOutput<RubyLanguage> {
11 let mut state: State<'_, S> = LexerState::new(source);
12 let result = self.run(&mut state);
13 if result.is_ok() {
14 state.add_eof();
15 }
16 state.finish_with_cache(result, cache)
17 }
18}
19
20impl RubyLexer {
21 pub fn new(_config: &RubyLanguage) -> Self {
22 Self {}
23 }
24
25 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
26 while state.not_at_end() {
27 let safe_point = state.get_position();
28
29 if self.skip_whitespace(state) {
30 continue;
31 }
32
33 if self.lex_newline(state) {
34 continue;
35 }
36
37 if self.skip_comment(state) {
38 continue;
39 }
40
41 if self.lex_string_literal(state) {
42 continue;
43 }
44
45 if self.lex_symbol(state) {
46 continue;
47 }
48
49 if self.lex_number_literal(state) {
50 continue;
51 }
52
53 if self.lex_identifier_or_keyword(state) {
54 continue;
55 }
56
57 if self.lex_operators(state) {
58 continue;
59 }
60
61 if self.lex_single_char_tokens(state) {
62 continue;
63 }
64
65 state.advance_if_dead_lock(safe_point);
66 }
67
68 Ok(())
69 }
70
71 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
73 let start_pos = state.get_position();
74
75 while let Some(ch) = state.peek() {
76 if ch == ' ' || ch == '\t' {
77 state.advance(ch.len_utf8());
78 }
79 else {
80 break;
81 }
82 }
83
84 if state.get_position() > start_pos {
85 state.add_token(RubySyntaxKind::Whitespace, start_pos, state.get_position());
86 true
87 }
88 else {
89 false
90 }
91 }
92
93 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
95 let start_pos = state.get_position();
96
97 if let Some('\n') = state.peek() {
98 state.advance(1);
99 state.add_token(RubySyntaxKind::Newline, start_pos, state.get_position());
100 true
101 }
102 else if let Some('\r') = state.peek() {
103 state.advance(1);
104 if let Some('\n') = state.peek() {
105 state.advance(1);
106 }
107 state.add_token(RubySyntaxKind::Newline, start_pos, state.get_position());
108 true
109 }
110 else {
111 false
112 }
113 }
114
115 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
117 if let Some('#') = state.peek() {
118 let start_pos = state.get_position();
119 state.advance(1); while let Some(ch) = state.peek() {
123 if ch == '\n' || ch == '\r' {
124 break;
125 }
126 state.advance(ch.len_utf8());
127 }
128
129 state.add_token(RubySyntaxKind::Comment, start_pos, state.get_position());
130 true
131 }
132 else {
133 false
134 }
135 }
136
137 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
139 let start_pos = state.get_position();
140
141 let quote_char = match state.peek() {
143 Some('"') => '"',
144 Some('\'') => '\'',
145 Some('`') => '`',
146 _ => return false,
147 };
148
149 state.advance(1); let mut escaped = false;
151 while let Some(ch) = state.peek() {
152 if escaped {
153 escaped = false;
154 state.advance(ch.len_utf8());
155 continue;
156 }
157
158 if ch == '\\' {
159 escaped = true;
160 state.advance(1);
161 continue;
162 }
163
164 if ch == quote_char {
165 state.advance(1); break;
167 }
168 else if ch == '\n' || ch == '\r' {
169 state.advance(ch.len_utf8());
171 }
172 else {
173 state.advance(ch.len_utf8());
174 }
175 }
176
177 state.add_token(RubySyntaxKind::StringLiteral, start_pos, state.get_position());
178 true
179 }
180
181 fn lex_symbol<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
183 if let Some(':') = state.peek() {
184 let start_pos = state.get_position();
185 state.advance(1); if let Some(ch) = state.peek() {
189 if ch.is_ascii_alphabetic() || ch == '_' {
190 while let Some(ch) = state.peek() {
192 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '?' || ch == '!' {
193 state.advance(1);
194 }
195 else {
196 break;
197 }
198 }
199 state.add_token(RubySyntaxKind::Symbol, start_pos, state.get_position());
200 return true;
201 }
202 else if ch == '"' || ch == '\'' {
203 let quote = ch;
205 state.advance(1);
206
207 let mut escaped = false;
208 while let Some(ch) = state.peek() {
209 if escaped {
210 escaped = false;
211 state.advance(ch.len_utf8());
212 continue;
213 }
214
215 if ch == '\\' {
216 escaped = true;
217 state.advance(1);
218 continue;
219 }
220
221 if ch == quote {
222 state.advance(1);
223 break;
224 }
225 else {
226 state.advance(ch.len_utf8());
227 }
228 }
229 state.add_token(RubySyntaxKind::Symbol, start_pos, state.get_position());
230 return true;
231 }
232 }
233 }
234 false
235 }
236
237 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
239 let start_pos = state.get_position();
240
241 if !state.peek().map_or(false, |c| c.is_ascii_digit()) {
242 return false;
243 }
244
245 let mut is_float = false;
246
247 if state.peek() == Some('0') {
249 let next_char = state.peek_next_n(1);
250 match next_char {
251 Some('b') | Some('B') => {
252 state.advance(2); while let Some(ch) = state.peek() {
255 if ch == '0' || ch == '1' {
256 state.advance(1);
257 }
258 else if ch == '_' {
259 state.advance(1); }
261 else {
262 break;
263 }
264 }
265 }
266 Some('o') | Some('O') => {
267 state.advance(2); while let Some(ch) = state.peek() {
270 if ch.is_ascii_digit() && ch < '8' {
271 state.advance(1);
272 }
273 else if ch == '_' {
274 state.advance(1); }
276 else {
277 break;
278 }
279 }
280 }
281 Some('x') | Some('X') => {
282 state.advance(2); while let Some(ch) = state.peek() {
285 if ch.is_ascii_hexdigit() {
286 state.advance(1);
287 }
288 else if ch == '_' {
289 state.advance(1); }
291 else {
292 break;
293 }
294 }
295 }
296 _ => {
297 self.lex_decimal_number(state, &mut is_float);
299 }
300 }
301 }
302 else {
303 self.lex_decimal_number(state, &mut is_float);
305 }
306
307 let kind = if is_float { RubySyntaxKind::FloatLiteral } else { RubySyntaxKind::IntegerLiteral };
308
309 state.add_token(kind, start_pos, state.get_position());
310 true
311 }
312
313 fn lex_decimal_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, is_float: &mut bool) {
315 while let Some(ch) = state.peek() {
317 if ch.is_ascii_digit() {
318 state.advance(1);
319 }
320 else if ch == '_' {
321 state.advance(1); break;
323 }
324 }
325
326 if state.peek() == Some('.') && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
328 *is_float = true;
329 state.advance(1); while let Some(ch) = state.peek() {
332 if ch.is_ascii_digit() {
333 state.advance(1);
334 }
335 else if ch == '_' {
336 state.advance(1); }
338 else {
339 break;
340 }
341 }
342 }
343
344 if let Some('e') | Some('E') = state.peek() {
346 *is_float = true;
347 state.advance(1);
348
349 if let Some('+') | Some('-') = state.peek() {
351 state.advance(1);
352 }
353
354 while let Some(ch) = state.peek() {
356 if ch.is_ascii_digit() {
357 state.advance(1);
358 }
359 else if ch == '_' {
360 state.advance(1); break;
362 }
363 }
364 }
365 }
366
367 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
369 let start_pos = state.get_position();
370
371 if !state.peek().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
373 return false;
374 }
375
376 let mut buf = String::new();
378
379 while let Some(ch) = state.peek() {
381 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '?' || ch == '!' {
382 buf.push(ch);
383 state.advance(1);
384 }
385 else {
386 break;
387 }
388 }
389
390 let kind = match buf.as_str() {
392 "if" => RubySyntaxKind::If,
393 "unless" => RubySyntaxKind::Unless,
394 "elsif" => RubySyntaxKind::Elsif,
395 "else" => RubySyntaxKind::Else,
396 "case" => RubySyntaxKind::Case,
397 "when" => RubySyntaxKind::When,
398 "then" => RubySyntaxKind::Then,
399 "for" => RubySyntaxKind::For,
400 "while" => RubySyntaxKind::While,
401 "until" => RubySyntaxKind::Until,
402 "break" => RubySyntaxKind::Break,
403 "next" => RubySyntaxKind::Next,
404 "redo" => RubySyntaxKind::Redo,
405 "retry" => RubySyntaxKind::Retry,
406 "return" => RubySyntaxKind::Return,
407 "yield" => RubySyntaxKind::Yield,
408 "def" => RubySyntaxKind::Def,
409 "class" => RubySyntaxKind::Class,
410 "module" => RubySyntaxKind::Module,
411 "end" => RubySyntaxKind::End,
412 "lambda" => RubySyntaxKind::Lambda,
413 "proc" => RubySyntaxKind::Proc,
414 "begin" => RubySyntaxKind::Begin,
415 "rescue" => RubySyntaxKind::Rescue,
416 "ensure" => RubySyntaxKind::Ensure,
417 "raise" => RubySyntaxKind::Raise,
418 "require" => RubySyntaxKind::Require,
419 "load" => RubySyntaxKind::Load,
420 "include" => RubySyntaxKind::Include,
421 "extend" => RubySyntaxKind::Extend,
422 "prepend" => RubySyntaxKind::Prepend,
423 "and" => RubySyntaxKind::And,
424 "or" => RubySyntaxKind::Or,
425 "not" => RubySyntaxKind::Not,
426 "in" => RubySyntaxKind::In,
427 "true" => RubySyntaxKind::True,
428 "false" => RubySyntaxKind::False,
429 "nil" => RubySyntaxKind::Nil,
430 "super" => RubySyntaxKind::Super,
431 "self" => RubySyntaxKind::Self_,
432 "alias" => RubySyntaxKind::Alias,
433 "undef" => RubySyntaxKind::Undef,
434 "defined?" => RubySyntaxKind::Defined,
435 "do" => RubySyntaxKind::Do,
436 _ => RubySyntaxKind::Identifier,
437 };
438
439 state.add_token(kind, start_pos, state.get_position());
440 true
441 }
442
443 fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
445 let start_pos = state.get_position();
446
447 let three_char_ops = ["<=>", "===", "**=", "<<=", ">>=", "||=", "&&=", "..."];
449 for op in &three_char_ops {
450 if state.peek() == op.chars().nth(0) && state.peek_next_n(1) == op.chars().nth(1) && state.peek_next_n(2) == op.chars().nth(2) {
451 state.advance(3);
452 let kind = match *op {
453 "<=>" => RubySyntaxKind::Spaceship,
454 "===" => RubySyntaxKind::EqualEqualEqual,
455 "**=" => RubySyntaxKind::PowerAssign,
456 "<<=" => RubySyntaxKind::LeftShiftAssign,
457 ">>=" => RubySyntaxKind::RightShiftAssign,
458 "||=" => RubySyntaxKind::OrOrAssign,
459 "&&=" => RubySyntaxKind::AndAndAssign,
460 "..." => RubySyntaxKind::DotDotDot,
461 _ => RubySyntaxKind::Invalid,
462 };
463 state.add_token(kind, start_pos, state.get_position());
464 return true;
465 }
466 }
467
468 let two_char_ops = ["**", "<<", ">>", "<=", ">=", "==", "!=", "=~", "!~", "&&", "||", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", ".."];
469 for op in &two_char_ops {
470 if state.peek() == op.chars().nth(0) && state.peek_next_n(1) == op.chars().nth(1) {
471 state.advance(2);
472 let kind = match *op {
473 "**" => RubySyntaxKind::Power,
474 "<<" => RubySyntaxKind::LeftShift,
475 ">>" => RubySyntaxKind::RightShift,
476 "<=" => RubySyntaxKind::LessEqual,
477 ">=" => RubySyntaxKind::GreaterEqual,
478 "==" => RubySyntaxKind::EqualEqual,
479 "!=" => RubySyntaxKind::NotEqual,
480 "=~" => RubySyntaxKind::Match,
481 "!~" => RubySyntaxKind::NotMatch,
482 "&&" => RubySyntaxKind::AndAnd,
483 "||" => RubySyntaxKind::OrOr,
484 "+=" => RubySyntaxKind::PlusAssign,
485 "-=" => RubySyntaxKind::MinusAssign,
486 "*=" => RubySyntaxKind::MultiplyAssign,
487 "/=" => RubySyntaxKind::DivideAssign,
488 "%=" => RubySyntaxKind::ModuloAssign,
489 "&=" => RubySyntaxKind::AndAssign,
490 "|=" => RubySyntaxKind::OrAssign,
491 "^=" => RubySyntaxKind::XorAssign,
492 ".." => RubySyntaxKind::DotDot,
493 _ => RubySyntaxKind::Invalid,
494 };
495 state.add_token(kind, start_pos, state.get_position());
496 return true;
497 }
498 }
499
500 let single_char_ops = ['+', '-', '*', '/', '%', '=', '<', '>', '&', '|', '^', '!', '~', '?'];
502
503 if let Some(ch) = state.peek() {
504 if single_char_ops.contains(&ch) {
505 state.advance(1);
506 let kind = match ch {
507 '+' => RubySyntaxKind::Plus,
508 '-' => RubySyntaxKind::Minus,
509 '*' => RubySyntaxKind::Multiply,
510 '/' => RubySyntaxKind::Divide,
511 '%' => RubySyntaxKind::Modulo,
512 '=' => RubySyntaxKind::Assign,
513 '<' => RubySyntaxKind::Less,
514 '>' => RubySyntaxKind::Greater,
515 '&' => RubySyntaxKind::BitAnd,
516 '|' => RubySyntaxKind::BitOr,
517 '^' => RubySyntaxKind::Xor,
518 '!' => RubySyntaxKind::LogicalNot,
519 '~' => RubySyntaxKind::Tilde,
520 '?' => RubySyntaxKind::Question,
521 _ => RubySyntaxKind::Invalid,
522 };
523 state.add_token(kind, start_pos, state.get_position());
524 return true;
525 }
526 }
527
528 false
529 }
530
531 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
533 let start_pos = state.get_position();
534
535 if state.peek() == Some(':') && state.peek_next_n(1) == Some(':') {
537 state.advance(2);
538 state.add_token(RubySyntaxKind::DoubleColon, start_pos, state.get_position());
539 return true;
540 }
541
542 let delimiters = ['(', ')', '[', ']', '{', '}', ',', ';', '.', ':', '@', '$'];
544
545 if let Some(ch) = state.peek() {
546 if delimiters.contains(&ch) {
547 state.advance(1);
548 let kind = match ch {
549 '(' => RubySyntaxKind::LeftParen,
550 ')' => RubySyntaxKind::RightParen,
551 '[' => RubySyntaxKind::LeftBracket,
552 ']' => RubySyntaxKind::RightBracket,
553 '{' => RubySyntaxKind::LeftBrace,
554 '}' => RubySyntaxKind::RightBrace,
555 ',' => RubySyntaxKind::Comma,
556 ';' => RubySyntaxKind::Semicolon,
557 '.' => RubySyntaxKind::Dot,
558 ':' => RubySyntaxKind::Colon,
559 '@' => RubySyntaxKind::At,
560 '$' => RubySyntaxKind::Dollar,
561 _ => RubySyntaxKind::Invalid,
562 };
563 state.add_token(kind, start_pos, state.get_position());
564 return true;
565 }
566 }
567
568 if let Some(_ch) = state.peek() {
570 state.advance(1);
571 state.add_token(RubySyntaxKind::Invalid, start_pos, state.get_position());
572 return true;
573 }
574
575 false
576 }
577}