1#![doc = include_str!("readme.md")]
2pub mod token_type;
4
5use crate::{language::RubyLanguage, lexer::token_type::RubyTokenType};
6use oak_core::{LexOutput, Lexer, LexerCache, LexerState, OakError, Source, TextEdit};
7
8pub(crate) type State<'a, S> = LexerState<'a, S, RubyLanguage>;
9
10#[derive(Clone, Debug)]
12pub struct RubyLexer<'config> {
13 config: &'config RubyLanguage,
14}
15
16impl<'config> Lexer<RubyLanguage> for RubyLexer<'config> {
17 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<RubyLanguage>) -> LexOutput<RubyLanguage> {
18 let mut state: State<'_, S> = LexerState::new(source);
19 let result = self.run(&mut state);
20 if result.is_ok() {
21 state.add_eof()
22 }
23 state.finish_with_cache(result, cache)
24 }
25}
26
27impl<'config> RubyLexer<'config> {
28 pub fn new(config: &'config RubyLanguage) -> Self {
30 Self { config }
31 }
32
33 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
34 while state.not_at_end() {
35 let safe_point = state.get_position();
36
37 if self.skip_whitespace(state) {
38 continue;
39 }
40
41 if self.lex_newline(state) {
42 continue;
43 }
44
45 if self.skip_comment(state) {
46 continue;
47 }
48
49 if self.lex_string_literal(state) {
50 continue;
51 }
52
53 if self.lex_symbol(state) {
54 continue;
55 }
56
57 if self.lex_number_literal(state) {
58 continue;
59 }
60
61 if self.lex_identifier_or_keyword(state) {
62 continue;
63 }
64
65 if self.lex_operators(state) {
66 continue;
67 }
68
69 if self.lex_single_char_tokens(state) {
70 continue;
71 }
72
73 state.advance_if_dead_lock(safe_point)
74 }
75
76 Ok(())
77 }
78
79 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
81 let start_pos = state.get_position();
82
83 while let Some(ch) = state.peek() {
84 if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
85 }
86
87 if state.get_position() > start_pos {
88 state.add_token(RubyTokenType::Whitespace, start_pos, state.get_position());
89 true
90 }
91 else {
92 false
93 }
94 }
95
96 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
98 let start_pos = state.get_position();
99
100 if let Some('\n') = state.peek() {
101 state.advance(1);
102 state.add_token(RubyTokenType::Newline, start_pos, state.get_position());
103 true
104 }
105 else if let Some('\r') = state.peek() {
106 state.advance(1);
107 if let Some('\n') = state.peek() {
108 state.advance(1)
109 }
110 state.add_token(RubyTokenType::Newline, start_pos, state.get_position());
111 true
112 }
113 else {
114 false
115 }
116 }
117
118 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
120 if let Some('#') = state.peek() {
121 let start_pos = state.get_position();
122 state.advance(1); while let Some(ch) = state.peek() {
126 if ch == '\n' || ch == '\r' {
127 break;
128 }
129 state.advance(ch.len_utf8())
130 }
131
132 state.add_token(RubyTokenType::Comment, start_pos, state.get_position());
133 true
134 }
135 else {
136 false
137 }
138 }
139
140 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
142 let start_pos = state.get_position();
143
144 let quote_char = match state.peek() {
146 Some('"') => '"',
147 Some('\'') => '\'',
148 Some('`') => '`',
149 _ => return false,
150 };
151
152 state.advance(1); let mut escaped = false;
154 while let Some(ch) = state.peek() {
155 if escaped {
156 escaped = false;
157 state.advance(ch.len_utf8());
158 continue;
159 }
160
161 if ch == '\\' {
162 escaped = true;
163 state.advance(1);
164 continue;
165 }
166
167 if ch == quote_char {
168 state.advance(1); break;
170 }
171 else if ch == '\n' || ch == '\r' {
172 state.advance(ch.len_utf8())
174 }
175 else {
176 state.advance(ch.len_utf8())
177 }
178 }
179
180 state.add_token(RubyTokenType::StringLiteral, start_pos, state.get_position());
181 true
182 }
183
184 fn lex_symbol<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
186 if let Some(':') = state.peek() {
187 let start_pos = state.get_position();
188 state.advance(1); if let Some(ch) = state.peek() {
192 if ch.is_ascii_alphabetic() || ch == '_' {
193 while let Some(ch) = state.peek() {
195 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '?' || ch == '!' { state.advance(1) } else { break }
196 }
197 state.add_token(RubyTokenType::Symbol, start_pos, state.get_position());
198 return true;
199 }
200 else if ch == '"' || ch == '\'' {
201 let quote = ch;
203 state.advance(1);
204
205 let mut escaped = false;
206 while let Some(ch) = state.peek() {
207 if escaped {
208 escaped = false;
209 state.advance(ch.len_utf8());
210 continue;
211 }
212
213 if ch == '\\' {
214 escaped = true;
215 state.advance(1);
216 continue;
217 }
218
219 if ch == quote {
220 state.advance(1);
221 break;
222 }
223 else {
224 state.advance(ch.len_utf8())
225 }
226 }
227 state.add_token(RubyTokenType::Symbol, start_pos, state.get_position());
228 return true;
229 }
230 }
231 }
232 false
233 }
234
235 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
237 let start_pos = state.get_position();
238
239 if !state.peek().map_or(false, |c| c.is_ascii_digit()) {
240 return false;
241 }
242
243 let mut is_float = false;
244
245 if state.peek() == Some('0') {
247 let next_char = state.peek_next_n(1);
248 match next_char {
249 Some('b') | Some('B') => {
250 state.advance(2); while let Some(ch) = state.peek() {
253 if ch == '0' || ch == '1' {
254 state.advance(1);
255 }
256 else if ch == '_' {
257 state.advance(1); }
259 else {
260 break;
261 }
262 }
263 }
264 Some('o') | Some('O') => {
265 state.advance(2); while let Some(ch) = state.peek() {
268 if ch.is_ascii_digit() && ch < '8' {
269 state.advance(1);
270 }
271 else if ch == '_' {
272 state.advance(1); }
274 else {
275 break;
276 }
277 }
278 }
279 Some('x') | Some('X') => {
280 state.advance(2); while let Some(ch) = state.peek() {
283 if ch.is_ascii_hexdigit() {
284 state.advance(1);
285 }
286 else if ch == '_' {
287 state.advance(1); }
289 else {
290 break;
291 }
292 }
293 }
294 _ => {
295 self.lex_decimal_number(state, &mut is_float)
297 }
298 }
299 }
300 else {
301 self.lex_decimal_number(state, &mut is_float)
303 }
304
305 let kind = if is_float { RubyTokenType::FloatLiteral } else { RubyTokenType::IntegerLiteral };
306
307 state.add_token(kind, start_pos, state.get_position());
308 true
309 }
310
311 fn lex_decimal_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>, is_float: &mut bool) {
313 while let Some(ch) = state.peek() {
315 if ch.is_ascii_digit() {
316 state.advance(1);
317 }
318 else if ch == '_' {
319 state.advance(1); }
321 else {
322 break;
323 }
324 }
325
326 if state.peek() == Some('.') && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit()) {
328 *is_float = true;
329 state.advance(1); while let Some(ch) = state.peek() {
332 if ch.is_ascii_digit() {
333 state.advance(1);
334 }
335 else if ch == '_' {
336 state.advance(1); }
338 else {
339 break;
340 }
341 }
342 }
343
344 if let Some('e') | Some('E') = state.peek() {
346 *is_float = true;
347 state.advance(1);
348
349 if let Some('+') | Some('-') = state.peek() {
351 state.advance(1);
352 }
353
354 while let Some(ch) = state.peek() {
356 if ch.is_ascii_digit() {
357 state.advance(1);
358 }
359 else if ch == '_' {
360 state.advance(1); }
362 else {
363 break;
364 }
365 }
366 }
367 }
368
369 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
371 let start_pos = state.get_position();
372
373 if !state.peek().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
375 return false;
376 }
377
378 let mut buf = String::new();
380
381 while let Some(ch) = state.peek() {
383 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '?' || ch == '!' {
384 buf.push(ch);
385 state.advance(1);
386 }
387 else {
388 break;
389 }
390 }
391
392 let kind = match buf.as_str() {
394 "if" => RubyTokenType::If,
395 "unless" => RubyTokenType::Unless,
396 "elsif" => RubyTokenType::Elsif,
397 "else" => RubyTokenType::Else,
398 "case" => RubyTokenType::Case,
399 "when" => RubyTokenType::When,
400 "then" => RubyTokenType::Then,
401 "for" => RubyTokenType::For,
402 "while" => RubyTokenType::While,
403 "until" => RubyTokenType::Until,
404 "break" => RubyTokenType::Break,
405 "next" => RubyTokenType::Next,
406 "redo" => RubyTokenType::Redo,
407 "retry" => RubyTokenType::Retry,
408 "return" => RubyTokenType::Return,
409 "yield" => RubyTokenType::Yield,
410 "def" => RubyTokenType::Def,
411 "class" => RubyTokenType::Class,
412 "module" => RubyTokenType::Module,
413 "end" => RubyTokenType::End,
414 "lambda" => RubyTokenType::Lambda,
415 "proc" => RubyTokenType::Proc,
416 "begin" => RubyTokenType::Begin,
417 "rescue" => RubyTokenType::Rescue,
418 "ensure" => RubyTokenType::Ensure,
419 "raise" => RubyTokenType::Raise,
420 "require" => RubyTokenType::Require,
421 "load" => RubyTokenType::Load,
422 "include" => RubyTokenType::Include,
423 "extend" => RubyTokenType::Extend,
424 "prepend" => RubyTokenType::Prepend,
425 "and" => RubyTokenType::And,
426 "or" => RubyTokenType::Or,
427 "not" => RubyTokenType::Not,
428 "in" => RubyTokenType::In,
429 "true" => RubyTokenType::True,
430 "false" => RubyTokenType::False,
431 "nil" => RubyTokenType::Nil,
432 "super" => RubyTokenType::Super,
433 "self" => RubyTokenType::Self_,
434 "alias" => RubyTokenType::Alias,
435 "undef" => RubyTokenType::Undef,
436 "defined?" => RubyTokenType::Defined,
437 "do" => RubyTokenType::Do,
438 _ => RubyTokenType::Identifier,
439 };
440
441 state.add_token(kind, start_pos, state.get_position());
442 true
443 }
444
445 fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
447 let start_pos = state.get_position();
448
449 let three_char_ops = ["<=>", "===", "**=", "<<=", ">>=", "||=", "&&=", "..."];
451 for op in &three_char_ops {
452 if state.peek() == op.chars().nth(0) && state.peek_next_n(1) == op.chars().nth(1) && state.peek_next_n(2) == op.chars().nth(2) {
453 state.advance(3);
454 let kind = match *op {
455 "<=>" => RubyTokenType::Spaceship,
456 "===" => RubyTokenType::EqualEqualEqual,
457 "**=" => RubyTokenType::PowerAssign,
458 "<<=" => RubyTokenType::LeftShiftAssign,
459 ">>=" => RubyTokenType::RightShiftAssign,
460 "||=" => RubyTokenType::OrOrAssign,
461 "&&=" => RubyTokenType::AndAndAssign,
462 "..." => RubyTokenType::DotDotDot,
463 _ => RubyTokenType::Invalid,
464 };
465 state.add_token(kind, start_pos, state.get_position());
466 return true;
467 }
468 }
469
470 let two_char_ops = ["**", "<<", ">>", "<=", ">=", "==", "!=", "=~", "!~", "&&", "||", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "..", "=>"];
471 for op in &two_char_ops {
472 if state.peek() == op.chars().nth(0) && state.peek_next_n(1) == op.chars().nth(1) {
473 state.advance(2);
474 let kind = match *op {
475 "**" => RubyTokenType::Power,
476 "<<" => RubyTokenType::LeftShift,
477 ">>" => RubyTokenType::RightShift,
478 "<=" => RubyTokenType::LessEqual,
479 ">=" => RubyTokenType::GreaterEqual,
480 "==" => RubyTokenType::EqualEqual,
481 "!=" => RubyTokenType::NotEqual,
482 "=~" => RubyTokenType::Match,
483 "!~" => RubyTokenType::NotMatch,
484 "&&" => RubyTokenType::AndAnd,
485 "||" => RubyTokenType::OrOr,
486 "+=" => RubyTokenType::PlusAssign,
487 "-=" => RubyTokenType::MinusAssign,
488 "*=" => RubyTokenType::MultiplyAssign,
489 "/=" => RubyTokenType::DivideAssign,
490 "%=" => RubyTokenType::ModuloAssign,
491 "&=" => RubyTokenType::AndAssign,
492 "|=" => RubyTokenType::OrAssign,
493 "^=" => RubyTokenType::XorAssign,
494 ".." => RubyTokenType::DotDot,
495 "=>" => RubyTokenType::EqualGreater,
496 _ => RubyTokenType::Invalid,
497 };
498 state.add_token(kind, start_pos, state.get_position());
499 return true;
500 }
501 }
502
503 let single_char_ops = ['+', '-', '*', '/', '%', '=', '<', '>', '&', '|', '^', '!', '~', '?'];
505
506 if let Some(ch) = state.peek() {
507 if single_char_ops.contains(&ch) {
508 state.advance(1);
509 let kind = match ch {
510 '+' => RubyTokenType::Plus,
511 '-' => RubyTokenType::Minus,
512 '*' => RubyTokenType::Multiply,
513 '/' => RubyTokenType::Divide,
514 '%' => RubyTokenType::Modulo,
515 '=' => RubyTokenType::Assign,
516 '<' => RubyTokenType::Less,
517 '>' => RubyTokenType::Greater,
518 '&' => RubyTokenType::BitAnd,
519 '|' => RubyTokenType::BitOr,
520 '^' => RubyTokenType::Xor,
521 '!' => RubyTokenType::LogicalNot,
522 '~' => RubyTokenType::Tilde,
523 '?' => RubyTokenType::Question,
524 _ => RubyTokenType::Invalid,
525 };
526 state.add_token(kind, start_pos, state.get_position());
527 return true;
528 }
529 }
530
531 false
532 }
533
534 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
536 let start_pos = state.get_position();
537
538 if state.peek() == Some(':') && state.peek_next_n(1) == Some(':') {
540 state.advance(2);
541 state.add_token(RubyTokenType::DoubleColon, start_pos, state.get_position());
542 return true;
543 }
544
545 let delimiters = ['(', ')', '[', ']', '{', '}', ',', ';', '.', ':', '@', '$'];
547
548 if let Some(ch) = state.peek() {
549 if delimiters.contains(&ch) {
550 state.advance(1);
551 let kind = match ch {
552 '(' => RubyTokenType::LeftParen,
553 ')' => RubyTokenType::RightParen,
554 '[' => RubyTokenType::LeftBracket,
555 ']' => RubyTokenType::RightBracket,
556 '{' => RubyTokenType::LeftBrace,
557 '}' => RubyTokenType::RightBrace,
558 ',' => RubyTokenType::Comma,
559 ';' => RubyTokenType::Semicolon,
560 '.' => RubyTokenType::Dot,
561 ':' => RubyTokenType::Colon,
562 '@' => RubyTokenType::At,
563 '$' => RubyTokenType::Dollar,
564 _ => RubyTokenType::Invalid,
565 };
566 state.add_token(kind, start_pos, state.get_position());
567 return true;
568 }
569 }
570
571 if let Some(_ch) = state.peek() {
573 state.advance(1);
574 state.add_token(RubyTokenType::Invalid, start_pos, state.get_position());
575 return true;
576 }
577
578 false
579 }
580}