1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::RubyLanguage, lexer::token_type::RubyTokenType};
5use oak_core::{LexOutput, Lexer, LexerCache, LexerState, OakError, Source, TextEdit};
6
7type State<'a, S> = LexerState<'a, S, RubyLanguage>;
8
9#[derive(Clone, Debug)]
10pub struct RubyLexer<'config> {
11 _config: &'config RubyLanguage,
12}
13
14impl<'config> Lexer<RubyLanguage> for RubyLexer<'config> {
15 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<RubyLanguage>) -> LexOutput<RubyLanguage> {
16 let mut state: State<'_, S> = LexerState::new(source);
17 let result = self.run(&mut state);
18 if result.is_ok() {
19 state.add_eof()
20 }
21 state.finish_with_cache(result, cache)
22 }
23}
24
25impl<'config> RubyLexer<'config> {
26 pub fn new(config: &'config RubyLanguage) -> Self {
27 Self { _config: config }
28 }
29
30 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
31 while state.not_at_end() {
32 let safe_point = state.get_position();
33
34 if self.skip_whitespace(state) {
35 continue;
36 }
37
38 if self.lex_newline(state) {
39 continue;
40 }
41
42 if self.skip_comment(state) {
43 continue;
44 }
45
46 if self.lex_string_literal(state) {
47 continue;
48 }
49
50 if self.lex_symbol(state) {
51 continue;
52 }
53
54 if self.lex_number_literal(state) {
55 continue;
56 }
57
58 if self.lex_identifier_or_keyword(state) {
59 continue;
60 }
61
62 if self.lex_operators(state) {
63 continue;
64 }
65
66 if self.lex_single_char_tokens(state) {
67 continue;
68 }
69
70 state.advance_if_dead_lock(safe_point)
71 }
72
73 Ok(())
74 }
75
76 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
78 let start_pos = state.get_position();
79
80 while let Some(ch) = state.peek() {
81 if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
82 }
83
84 if state.get_position() > start_pos {
85 state.add_token(RubyTokenType::Whitespace, start_pos, state.get_position());
86 true
87 }
88 else {
89 false
90 }
91 }
92
93 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
95 let start_pos = state.get_position();
96
97 if let Some('\n') = state.peek() {
98 state.advance(1);
99 state.add_token(RubyTokenType::Newline, start_pos, state.get_position());
100 true
101 }
102 else if let Some('\r') = state.peek() {
103 state.advance(1);
104 if let Some('\n') = state.peek() {
105 state.advance(1)
106 }
107 state.add_token(RubyTokenType::Newline, start_pos, state.get_position());
108 true
109 }
110 else {
111 false
112 }
113 }
114
115 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
117 if let Some('#') = state.peek() {
118 let start_pos = state.get_position();
119 state.advance(1); while let Some(ch) = state.peek() {
123 if ch == '\n' || ch == '\r' {
124 break;
125 }
126 state.advance(ch.len_utf8())
127 }
128
129 state.add_token(RubyTokenType::Comment, start_pos, state.get_position());
130 true
131 }
132 else {
133 false
134 }
135 }
136
137 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
139 let start_pos = state.get_position();
140
141 let quote_char = match state.peek() {
143 Some('"') => '"',
144 Some('\'') => '\'',
145 Some('`') => '`',
146 _ => return false,
147 };
148
149 state.advance(1); let mut escaped = false;
151 while let Some(ch) = state.peek() {
152 if escaped {
153 escaped = false;
154 state.advance(ch.len_utf8());
155 continue;
156 }
157
158 if ch == '\\' {
159 escaped = true;
160 state.advance(1);
161 continue;
162 }
163
164 if ch == quote_char {
165 state.advance(1); break;
167 }
168 else if ch == '\n' || ch == '\r' {
169 state.advance(ch.len_utf8())
171 }
172 else {
173 state.advance(ch.len_utf8())
174 }
175 }
176
177 state.add_token(RubyTokenType::StringLiteral, start_pos, state.get_position());
178 true
179 }
180
181 fn lex_symbol<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
183 if let Some(':') = state.peek() {
184 let start_pos = state.get_position();
185 state.advance(1); if let Some(ch) = state.peek() {
189 if ch.is_ascii_alphabetic() || ch == '_' {
190 while let Some(ch) = state.peek() {
192 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '?' || ch == '!' { state.advance(1) } else { break }
193 }
194 state.add_token(RubyTokenType::Symbol, start_pos, state.get_position());
195 return true;
196 }
197 else if ch == '"' || ch == '\'' {
198 let quote = ch;
200 state.advance(1);
201
202 let mut escaped = false;
203 while let Some(ch) = state.peek() {
204 if escaped {
205 escaped = false;
206 state.advance(ch.len_utf8());
207 continue;
208 }
209
210 if ch == '\\' {
211 escaped = true;
212 state.advance(1);
213 continue;
214 }
215
216 if ch == quote {
217 state.advance(1);
218 break;
219 }
220 else {
221 state.advance(ch.len_utf8())
222 }
223 }
224 state.add_token(RubyTokenType::Symbol, start_pos, state.get_position());
225 return true;
226 }
227 }
228 }
229 false
230 }
231
232 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
234 let start_pos = state.get_position();
235
236 if !state.peek().map_or(false, |c| c.is_ascii_digit()) {
237 return false;
238 }
239
240 let mut is_float = false;
241
242 if state.peek() == Some('0') {
244 let next_char = state.peek_next_n(1);
245 match next_char {
246 Some('b') | Some('B') => {
247 state.advance(2); while let Some(ch) = state.peek() {
250 if ch == '0' || ch == '1' {
251 state.advance(1);
252 }
253 else if ch == '_' {
254 state.advance(1); }
256 else {
257 break;
258 }
259 }
260 }
261 Some('o') | Some('O') => {
262 state.advance(2); while let Some(ch) = state.peek() {
265 if ch.is_ascii_digit() && ch < '8' {
266 state.advance(1);
267 }
268 else if ch == '_' {
269 state.advance(1); }
271 else {
272 break;
273 }
274 }
275 }
276 Some('x') | Some('X') => {
277 state.advance(2); while let Some(ch) = state.peek() {
280 if ch.is_ascii_hexdigit() {
281 state.advance(1);
282 }
283 else if ch == '_' {
284 state.advance(1); }
286 else {
287 break;
288 }
289 }
290 }
291 _ => {
292 self.lex_decimal_number(state, &mut is_float)
294 }
295 }
296 }
297 else {
298 self.lex_decimal_number(state, &mut is_float)
300 }
301
302 let kind = if is_float { RubyTokenType::FloatLiteral } else { RubyTokenType::IntegerLiteral };
303
304 state.add_token(kind, start_pos, state.get_position());
305 true
306 }
307
308 fn lex_decimal_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, is_float: &mut bool) {
310 while let Some(ch) = state.peek() {
312 if ch.is_ascii_digit() {
313 state.advance(1);
314 }
315 else if ch == '_' {
316 state.advance(1); }
318 else {
319 break;
320 }
321 }
322
323 if state.peek() == Some('.') && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
325 *is_float = true;
326 state.advance(1); while let Some(ch) = state.peek() {
329 if ch.is_ascii_digit() {
330 state.advance(1);
331 }
332 else if ch == '_' {
333 state.advance(1); }
335 else {
336 break;
337 }
338 }
339 }
340
341 if let Some('e') | Some('E') = state.peek() {
343 *is_float = true;
344 state.advance(1);
345
346 if let Some('+') | Some('-') = state.peek() {
348 state.advance(1);
349 }
350
351 while let Some(ch) = state.peek() {
353 if ch.is_ascii_digit() {
354 state.advance(1);
355 }
356 else if ch == '_' {
357 state.advance(1); }
359 else {
360 break;
361 }
362 }
363 }
364 }
365
366 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
368 let start_pos = state.get_position();
369
370 if !state.peek().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
372 return false;
373 }
374
375 let mut buf = String::new();
377
378 while let Some(ch) = state.peek() {
380 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '?' || ch == '!' {
381 buf.push(ch);
382 state.advance(1);
383 }
384 else {
385 break;
386 }
387 }
388
389 let kind = match buf.as_str() {
391 "if" => RubyTokenType::If,
392 "unless" => RubyTokenType::Unless,
393 "elsif" => RubyTokenType::Elsif,
394 "else" => RubyTokenType::Else,
395 "case" => RubyTokenType::Case,
396 "when" => RubyTokenType::When,
397 "then" => RubyTokenType::Then,
398 "for" => RubyTokenType::For,
399 "while" => RubyTokenType::While,
400 "until" => RubyTokenType::Until,
401 "break" => RubyTokenType::Break,
402 "next" => RubyTokenType::Next,
403 "redo" => RubyTokenType::Redo,
404 "retry" => RubyTokenType::Retry,
405 "return" => RubyTokenType::Return,
406 "yield" => RubyTokenType::Yield,
407 "def" => RubyTokenType::Def,
408 "class" => RubyTokenType::Class,
409 "module" => RubyTokenType::Module,
410 "end" => RubyTokenType::End,
411 "lambda" => RubyTokenType::Lambda,
412 "proc" => RubyTokenType::Proc,
413 "begin" => RubyTokenType::Begin,
414 "rescue" => RubyTokenType::Rescue,
415 "ensure" => RubyTokenType::Ensure,
416 "raise" => RubyTokenType::Raise,
417 "require" => RubyTokenType::Require,
418 "load" => RubyTokenType::Load,
419 "include" => RubyTokenType::Include,
420 "extend" => RubyTokenType::Extend,
421 "prepend" => RubyTokenType::Prepend,
422 "and" => RubyTokenType::And,
423 "or" => RubyTokenType::Or,
424 "not" => RubyTokenType::Not,
425 "in" => RubyTokenType::In,
426 "true" => RubyTokenType::True,
427 "false" => RubyTokenType::False,
428 "nil" => RubyTokenType::Nil,
429 "super" => RubyTokenType::Super,
430 "self" => RubyTokenType::Self_,
431 "alias" => RubyTokenType::Alias,
432 "undef" => RubyTokenType::Undef,
433 "defined?" => RubyTokenType::Defined,
434 "do" => RubyTokenType::Do,
435 _ => RubyTokenType::Identifier,
436 };
437
438 state.add_token(kind, start_pos, state.get_position());
439 true
440 }
441
442 fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
444 let start_pos = state.get_position();
445
446 let three_char_ops = ["<=>", "===", "**=", "<<=", ">>=", "||=", "&&=", "..."];
448 for op in &three_char_ops {
449 if state.peek() == op.chars().nth(0) && state.peek_next_n(1) == op.chars().nth(1) && state.peek_next_n(2) == op.chars().nth(2) {
450 state.advance(3);
451 let kind = match *op {
452 "<=>" => RubyTokenType::Spaceship,
453 "===" => RubyTokenType::EqualEqualEqual,
454 "**=" => RubyTokenType::PowerAssign,
455 "<<=" => RubyTokenType::LeftShiftAssign,
456 ">>=" => RubyTokenType::RightShiftAssign,
457 "||=" => RubyTokenType::OrOrAssign,
458 "&&=" => RubyTokenType::AndAndAssign,
459 "..." => RubyTokenType::DotDotDot,
460 _ => RubyTokenType::Invalid,
461 };
462 state.add_token(kind, start_pos, state.get_position());
463 return true;
464 }
465 }
466
467 let two_char_ops = ["**", "<<", ">>", "<=", ">=", "==", "!=", "=~", "!~", "&&", "||", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", ".."];
468 for op in &two_char_ops {
469 if state.peek() == op.chars().nth(0) && state.peek_next_n(1) == op.chars().nth(1) {
470 state.advance(2);
471 let kind = match *op {
472 "**" => RubyTokenType::Power,
473 "<<" => RubyTokenType::LeftShift,
474 ">>" => RubyTokenType::RightShift,
475 "<=" => RubyTokenType::LessEqual,
476 ">=" => RubyTokenType::GreaterEqual,
477 "==" => RubyTokenType::EqualEqual,
478 "!=" => RubyTokenType::NotEqual,
479 "=~" => RubyTokenType::Match,
480 "!~" => RubyTokenType::NotMatch,
481 "&&" => RubyTokenType::AndAnd,
482 "||" => RubyTokenType::OrOr,
483 "+=" => RubyTokenType::PlusAssign,
484 "-=" => RubyTokenType::MinusAssign,
485 "*=" => RubyTokenType::MultiplyAssign,
486 "/=" => RubyTokenType::DivideAssign,
487 "%=" => RubyTokenType::ModuloAssign,
488 "&=" => RubyTokenType::AndAssign,
489 "|=" => RubyTokenType::OrAssign,
490 "^=" => RubyTokenType::XorAssign,
491 ".." => RubyTokenType::DotDot,
492 _ => RubyTokenType::Invalid,
493 };
494 state.add_token(kind, start_pos, state.get_position());
495 return true;
496 }
497 }
498
499 let single_char_ops = ['+', '-', '*', '/', '%', '=', '<', '>', '&', '|', '^', '!', '~', '?'];
501
502 if let Some(ch) = state.peek() {
503 if single_char_ops.contains(&ch) {
504 state.advance(1);
505 let kind = match ch {
506 '+' => RubyTokenType::Plus,
507 '-' => RubyTokenType::Minus,
508 '*' => RubyTokenType::Multiply,
509 '/' => RubyTokenType::Divide,
510 '%' => RubyTokenType::Modulo,
511 '=' => RubyTokenType::Assign,
512 '<' => RubyTokenType::Less,
513 '>' => RubyTokenType::Greater,
514 '&' => RubyTokenType::BitAnd,
515 '|' => RubyTokenType::BitOr,
516 '^' => RubyTokenType::Xor,
517 '!' => RubyTokenType::LogicalNot,
518 '~' => RubyTokenType::Tilde,
519 '?' => RubyTokenType::Question,
520 _ => RubyTokenType::Invalid,
521 };
522 state.add_token(kind, start_pos, state.get_position());
523 return true;
524 }
525 }
526
527 false
528 }
529
530 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
532 let start_pos = state.get_position();
533
534 if state.peek() == Some(':') && state.peek_next_n(1) == Some(':') {
536 state.advance(2);
537 state.add_token(RubyTokenType::DoubleColon, start_pos, state.get_position());
538 return true;
539 }
540
541 let delimiters = ['(', ')', '[', ']', '{', '}', ',', ';', '.', ':', '@', '$'];
543
544 if let Some(ch) = state.peek() {
545 if delimiters.contains(&ch) {
546 state.advance(1);
547 let kind = match ch {
548 '(' => RubyTokenType::LeftParen,
549 ')' => RubyTokenType::RightParen,
550 '[' => RubyTokenType::LeftBracket,
551 ']' => RubyTokenType::RightBracket,
552 '{' => RubyTokenType::LeftBrace,
553 '}' => RubyTokenType::RightBrace,
554 ',' => RubyTokenType::Comma,
555 ';' => RubyTokenType::Semicolon,
556 '.' => RubyTokenType::Dot,
557 ':' => RubyTokenType::Colon,
558 '@' => RubyTokenType::At,
559 '$' => RubyTokenType::Dollar,
560 _ => RubyTokenType::Invalid,
561 };
562 state.add_token(kind, start_pos, state.get_position());
563 return true;
564 }
565 }
566
567 if let Some(_ch) = state.peek() {
569 state.advance(1);
570 state.add_token(RubyTokenType::Invalid, start_pos, state.get_position());
571 return true;
572 }
573
574 false
575 }
576}