1use alloc_from_pool::{Factory as PoolFactory, PoolValue};
2
3use crate::lexer::*;
4use crate::maybe_byte::*;
5use crate::source::buffer::*;
6use crate::source::Comment;
7use crate::source::Decoder;
8use crate::source::MagicComment;
9use crate::str_term::{str_types::*, HeredocEnd, StrTerm, StringLiteral};
10use crate::Loc;
11use crate::SharedContext;
12use crate::StackState;
13use crate::StaticEnvironment;
14use crate::Token;
15use crate::TokenBuf;
16use crate::{error::Diagnostics, Bytes};
17use crate::{lex_states::*, LexState};
18use crate::{Diagnostic, DiagnosticMessage, ErrorLevel};
19
/// Tokenizer state: the input buffer plus all mutable state that is carried
/// from one `yylex` call to the next.
#[derive(Debug, Default)]
pub struct Lexer {
    /// Input buffer; `current_loc` reads its `ptok` (token start) and `pcur`
    /// (read position) offsets.
    pub(crate) buffer: Buffer,

    /// Token value captured explicitly while lexing. When `None`, `yylex`
    /// falls back to the raw bytes between the token's begin and end.
    pub(crate) lval: Option<Bytes>,
    /// Optional override of the token's begin offset, consumed by `yylex`.
    pub(crate) lval_start: Option<usize>,
    /// Optional override of the token's end offset, consumed by `yylex`.
    pub(crate) lval_end: Option<usize>,

    /// Active string/heredoc literal. While set, `parser_yylex` delegates to
    /// `here_document` or `parse_string` instead of scanning a regular token.
    pub(crate) strterm: Option<Box<StrTerm>>,
    /// Current lexer state (a combination of the `EXPR_*` flags).
    pub lex_state: LexState,
    /// Bracket nesting depth: incremented on `(`, `[`, `{` and decremented on
    /// the matching closers.
    pub(crate) paren_nest: i32,
    /// Compared against `paren_nest` by `is_lambda_beginning`; `new`
    /// initialises it to `-1`.
    pub(crate) lpar_beg: i32,
    /// Curly-brace depth: incremented on `{`, decremented on `}`. A `}` seen
    /// while this is `0` is reported as `tSTRING_DEND`.
    pub(crate) brace_nest: i32,

    /// Stack state created as `StackState::new("cond")`; `false` is pushed on
    /// opening brackets and popped on closing ones.
    pub cond: StackState,
    /// Stack state created as `StackState::new("cmdarg")`; pushed and popped
    /// together with `cond`.
    pub cmdarg: StackState,

    /// Accumulator for the token currently being built; reset by `newtok`.
    pub(crate) tokenbuf: TokenBuf,

    /// Shared context; queried here through `in_kwarg()` and `in_argdef()`.
    pub(crate) context: SharedContext,

    /// Read (as `cmd_state`) and cleared at the start of every `parser_yylex`
    /// call; set again after e.g. `;` or a significant newline.
    pub(crate) command_start: bool,
    /// Set to `true` by `parser_yylex`; restored to its previous value when
    /// only a comment or newline was consumed.
    pub(crate) token_seen: bool,

    /// Static environment consulted by `is_lvar_defined`.
    pub static_env: StaticEnvironment,

    /// Sink for the warnings and errors emitted while lexing.
    pub(crate) diagnostics: Diagnostics,
    /// Comments collected while lexing (`#` comments and `=begin`/`=end`
    /// blocks).
    pub(crate) comments: Vec<Comment>,
    /// Collected magic comments — presumably filled by `magic_comment`, which
    /// is defined outside this section (TODO confirm).
    pub(crate) magic_comments: Vec<MagicComment>,

    /// Pool factory used by `yylex` to allocate tokens.
    #[doc(hidden)]
    pub tokens_factory: PoolFactory<Token>,
}
86
87impl Lexer {
    /// NUL byte; `parser_yylex` treats it as end of input.
    pub(crate) const NULL_CHAR: u8 = 0x00;
    /// Ctrl-D (ASCII EOT); `parser_yylex` treats it as end of input.
    pub(crate) const CTRL_D_CHAR: u8 = 0x04;
    /// Ctrl-Z (ASCII SUB); `parser_yylex` treats it as end of input.
    pub(crate) const CTRL_Z_CHAR: u8 = 0x1a;
    /// NOTE: despite the name, 0x0c is ASCII form feed (`\f`), not line feed
    /// (0x0a). It is handled as horizontal whitespace and, after a backslash,
    /// produces `tSLASH_F`.
    pub(crate) const LF_CHAR: u8 = 0x0c;
    /// Vertical tab (`\v`); handled as horizontal whitespace.
    pub(crate) const VTAB_CHAR: u8 = 0x0b;
93
94 pub fn new<Bytes, Name>(bytes: Bytes, name: Name, decoder: Option<Decoder>) -> Self
96 where
97 Bytes: Into<Vec<u8>>,
98 Name: Into<String>,
99 {
100 Self {
101 cond: StackState::new("cond"),
102 cmdarg: StackState::new("cmdarg"),
103 lpar_beg: -1, buffer: Buffer::new(name.into(), bytes.into(), decoder),
105 ..Self::default()
106 }
107 }
108
109 pub fn tokenize_until_eof(&mut self) -> Vec<Token> {
117 let mut tokens = vec![];
118
119 loop {
120 let token = self.yylex().take_value();
121 match token.token_type {
122 Self::END_OF_INPUT => break,
123 _ => tokens.push(token),
124 }
125 }
126
127 tokens
128 }
129
130 pub(crate) fn yylex(&mut self) -> PoolValue<Token> {
131 self.lval = None;
132
133 let token_type = self.parser_yylex();
134
135 let begin = std::mem::take(&mut self.lval_start).unwrap_or(self.buffer.ptok);
136 let mut end = std::mem::take(&mut self.lval_end).unwrap_or(self.buffer.pcur);
137
138 let mut token_value = self
139 .lval
140 .take()
141 .or_else(|| {
142 self.buffer
144 .substr_at(begin, end)
145 .map(|s| Bytes::new(Vec::from(s)))
146 })
147 .unwrap_or_else(|| Bytes::new(vec![]));
148
149 if token_type == Self::tNL {
150 token_value = Bytes::new(vec![b'\n']);
151 end = begin + 1;
152 }
153
154 let token = self.tokens_factory.alloc(Token {
155 token_type,
156 token_value,
157 loc: Loc { begin, end },
158 });
159 println_if_debug_lexer!(
160 "yylex ({:?}, {:?}, {:?})",
161 token.token_name(),
162 token.token_value,
163 token.loc
164 );
165 token
166 }
167
    /// Reads the next byte from the input buffer (delegates to
    /// `Buffer::nextc`).
    pub(crate) fn nextc(&mut self) -> MaybeByte {
        self.buffer.nextc()
    }
    /// Returns the byte at offset `idx` of the input buffer (delegates to
    /// `Buffer::byte_at`).
    pub(crate) fn char_at(&self, idx: usize) -> MaybeByte {
        self.buffer.byte_at(idx)
    }
    /// Delegates to `Buffer::token_flush`; called before each new token is
    /// scanned.
    pub(crate) fn token_flush(&mut self) {
        self.buffer.token_flush()
    }
177
178 pub(crate) fn parser_yylex(&mut self) -> i32 {
179 let mut c: MaybeByte;
180 let mut space_seen: bool = false;
181 let label: usize;
182 let mut last_state: LexState;
183 let token_seen = self.token_seen;
184
185 if let Some(strterm) = self.strterm.as_ref().map(|i| i.as_ref()) {
186 match strterm {
187 StrTerm::HeredocLiteral(_) => {
188 return self.here_document();
189 }
190
191 StrTerm::StringLiteral(_) => {
192 self.token_flush();
193 return self.parse_string();
194 }
195 }
196 }
197
198 let cmd_state = self.command_start;
199 self.command_start = false;
200 self.token_seen = true;
201
202 'retrying: loop {
203 last_state = self.lex_state;
204 self.token_flush();
205
206 c = self.nextc();
208
209 if c.is_eof() {
210 return Self::END_OF_INPUT;
211 }
212
213 match c.as_option() {
214 None
215 | Some(Self::NULL_CHAR)
216 | Some(Self::CTRL_D_CHAR)
217 | Some(Self::CTRL_Z_CHAR) => return Self::END_OF_INPUT,
218
219 Some(b'\r') => {
221 if !self.buffer.cr_seen {
222 self.buffer.cr_seen = true;
223 self.warn(
224 DiagnosticMessage::SlashRAtMiddleOfLine {},
225 self.current_loc(),
226 );
227 }
228 }
229
230 Some(b' ') | Some(b'\t') | Some(Self::LF_CHAR) | Some(Self::VTAB_CHAR) => {
231 space_seen = true;
232 continue 'retrying;
233 }
234
235 Some(b'#') | Some(b'\n') => {
236 if c == b'#' {
237 self.token_seen = token_seen;
239 let magic_comment = self
241 .magic_comment(self.buffer.pcur, self.buffer.pend - self.buffer.pcur);
242 match magic_comment {
243 Ok(magic_comment) => {
244 if !magic_comment && self.comment_at_top() {
245 self.set_file_encoding(self.buffer.pcur, self.buffer.pend)
246 }
247 }
248 Err(_) => return Self::END_OF_INPUT,
249 }
250 self.buffer.goto_eol();
251 self.comments
252 .push(Comment::new(self.current_loc(), &self.buffer.input.decoded))
253 }
254 self.token_seen = token_seen;
255 let cc = self
256 .lex_state
257 .is_some(EXPR_BEG | EXPR_CLASS | EXPR_FNAME | EXPR_DOT)
258 && !self.lex_state.is_some(EXPR_LABELED);
259 if cc || self.lex_state.is_all(EXPR_ARG | EXPR_LABELED) {
260 if !cc && self.context.in_kwarg() {
261 return self.normal_newline_leaf_label();
262 }
263 continue 'retrying;
264 }
265
266 loop {
267 c = self.nextc();
269
270 #[allow(clippy::never_loop)]
271 loop {
273 if c == b' '
274 || c == b'\t'
275 || c == Self::LF_CHAR
276 || c == b'\r'
277 || c == Self::VTAB_CHAR
278 {
279 space_seen = true;
280 break;
281 }
282
283 if c == b'#' {
284 self.buffer.pushback(c);
285 continue 'retrying;
286 }
287
288 if c == b'&' || c == b'.' {
289 if self.buffer.peek(b'.') == (c == b'&') {
290 self.buffer.pushback(c);
291 continue 'retrying;
292 }
293 }
294
295 if c.is_eof() {
296 self.buffer.eof_no_decrement();
298 return self.normal_newline_leaf_label();
299 }
300
301 self.buffer.ruby_sourceline -= 1;
303 self.buffer.nextline = self.buffer.lastline;
304 self.buffer.eof_no_decrement();
306 return self.normal_newline_leaf_label();
307 }
308 }
309 }
310
311 Some(b'*') => {
312 let result: i32;
313
314 c = self.nextc();
315
316 if c == b'*' {
317 c = self.nextc();
318 if c == b'=' {
319 self.set_yylval_id("**=");
320 self.lex_state.set(EXPR_BEG);
321 return Self::tOP_ASGN;
322 }
323 self.buffer.pushback(c);
324 if self.lex_state.is_spacearg(c, space_seen) {
325 self.warn(
326 DiagnosticMessage::DStarInterpretedAsArgPrefix {},
327 self.current_loc(),
328 );
329 result = Self::tDSTAR;
330 } else if self.lex_state.is_beg() {
331 result = Self::tDSTAR;
332 } else {
333 result = self.warn_balanced(
334 Self::tPOW,
335 "**",
336 "argument prefix",
337 c,
338 space_seen,
339 last_state,
340 );
341 }
342 } else {
343 if c == b'=' {
344 self.set_yylval_id("*=");
345 self.lex_state.set(EXPR_BEG);
346 return Self::tOP_ASGN;
347 }
348 self.buffer.pushback(c);
349 if self.lex_state.is_spacearg(c, space_seen) {
350 self.warn(
351 DiagnosticMessage::StarInterpretedAsArgPrefix {},
352 self.current_loc(),
353 );
354 result = Self::tSTAR;
355 } else if self.lex_state.is_beg() {
356 result = Self::tSTAR;
357 } else {
358 result = self.warn_balanced(
359 Self::tSTAR2,
360 "*",
361 "argument prefix",
362 c,
363 space_seen,
364 last_state,
365 );
366 }
367 }
368
369 self.lex_state.set(if self.lex_state.is_after_operator() {
370 EXPR_ARG
371 } else {
372 EXPR_BEG
373 });
374 return result;
375 }
376
377 Some(b'!') => {
378 c = self.nextc();
379 if self.lex_state.is_after_operator() {
380 self.lex_state.set(EXPR_ARG);
381 if c == b'@' {
382 return Self::tBANG;
383 }
384 } else {
385 self.lex_state.set(EXPR_BEG);
386 }
387 if c == b'=' {
388 return Self::tNEQ;
389 }
390 if c == b'~' {
391 return Self::tNMATCH;
392 }
393 self.buffer.pushback(c);
394 return Self::tBANG;
395 }
396
397 Some(b'=') => {
398 if self.buffer.was_bol() {
399 if self.buffer.is_word_match("begin") {
401 let begin_loc = self.loc(self.buffer.pcur - 1, self.buffer.pcur + 5);
402 self.buffer.goto_eol();
403 loop {
404 self.buffer.goto_eol();
405 c = self.nextc();
406 if c.is_eof() {
407 self.compile_error(
408 DiagnosticMessage::EmbeddedDocumentMeetsEof {},
409 begin_loc,
410 );
411 return Self::END_OF_INPUT;
412 }
413 if c == b'=' && self.buffer.is_word_match("end") {
414 break;
415 }
416 self.buffer.pushback(c);
417 }
418 self.buffer.goto_eol();
419 self.comments.push(Comment::new(
420 begin_loc.with_end(self.buffer.pcur),
421 &self.buffer.input.decoded,
422 ));
423 continue 'retrying;
424 }
425 }
426
427 self.lex_state.set(if self.lex_state.is_after_operator() {
428 EXPR_ARG
429 } else {
430 EXPR_BEG
431 });
432 c = self.nextc();
433 if c == b'=' {
434 c = self.nextc();
435 if c == b'=' {
436 return Self::tEQQ;
437 }
438 self.buffer.pushback(c);
439 return Self::tEQ;
440 }
441 if c == b'~' {
442 return Self::tMATCH;
443 } else if c == b'>' {
444 return Self::tASSOC;
445 }
446 self.buffer.pushback(c);
447 return Self::tEQL;
448 }
449
450 Some(b'<') => {
451 c = self.nextc();
452 if c == b'<'
453 && !self.lex_state.is_some(EXPR_DOT | EXPR_CLASS)
454 && !self.lex_state.is_end()
455 && (!self.lex_state.is_arg()
456 || self.lex_state.is_some(EXPR_LABELED)
457 || space_seen)
458 {
459 if let Some(token) = self.heredoc_identifier() {
460 return token;
461 }
462 }
463 if self.lex_state.is_after_operator() {
464 self.lex_state.set(EXPR_ARG);
465 } else {
466 if self.lex_state.is_some(EXPR_CLASS) {
467 self.command_start = true;
468 }
469 self.lex_state.set(EXPR_BEG);
470 }
471 if c == b'=' {
472 c = self.nextc();
473 if c == b'>' {
474 return Self::tCMP;
475 }
476 self.buffer.pushback(c);
477 return Self::tLEQ;
478 }
479 if c == b'<' {
480 c = self.nextc();
481 if c == b'=' {
482 self.set_yylval_id("<<=");
483 self.lex_state.set(EXPR_BEG);
484 return Self::tOP_ASGN;
485 }
486 self.buffer.pushback(c);
487 return self.warn_balanced(
488 Self::tLSHFT,
489 "<<",
490 "here document",
491 c,
492 space_seen,
493 last_state,
494 );
495 }
496 self.buffer.pushback(c);
497 return Self::tLT;
498 }
499
500 Some(b'>') => {
501 self.lex_state.set(if self.lex_state.is_after_operator() {
502 EXPR_ARG
503 } else {
504 EXPR_BEG
505 });
506
507 c = self.nextc();
508 if c == b'=' {
509 return Self::tGEQ;
510 }
511
512 if c == b'>' {
513 c = self.nextc();
514 if c == b'=' {
515 self.set_yylval_id(">>=");
516 self.lex_state.set(EXPR_BEG);
517 return Self::tOP_ASGN;
518 }
519 self.buffer.pushback(c);
520 return Self::tRSHFT;
521 }
522 self.buffer.pushback(c);
523 return Self::tGT;
524 }
525
526 Some(b'"') => {
527 label = if self.lex_state.is_label_possible(cmd_state) {
528 str_label
529 } else {
530 0
531 };
532 self.strterm = self.new_strterm(str_dquote | label, b'"', None, None);
533 self.buffer.set_ptok(self.buffer.pcur - 1);
534 return Self::tSTRING_BEG;
535 }
536
537 Some(b'`') => {
538 if self.lex_state.is_some(EXPR_FNAME) {
539 self.lex_state.set(EXPR_ENDFN);
540 return Self::tBACK_REF2;
541 }
542 if self.lex_state.is_some(EXPR_DOT) {
543 if cmd_state {
544 self.lex_state.set(EXPR_CMDARG);
545 } else {
546 self.lex_state.set(EXPR_ARG);
547 }
548 return Self::tBACK_REF2;
549 }
550 self.strterm = self.new_strterm(str_xquote, b'`', None, None);
551 return Self::tXSTRING_BEG;
552 }
553
554 Some(b'\'') => {
555 label = if self.lex_state.is_label_possible(cmd_state) {
556 str_label
557 } else {
558 0
559 };
560 self.strterm = self.new_strterm(str_squote | label, b'\'', None, None);
561 self.buffer.set_ptok(self.buffer.pcur - 1);
562 return Self::tSTRING_BEG;
563 }
564
565 Some(b'?') => {
566 return self.parse_qmark(space_seen).unwrap_or(-1);
567 }
568
569 Some(b'&') => {
570 let result: i32;
571
572 c = self.nextc();
573 if c == b'&' {
574 self.lex_state.set(EXPR_BEG);
575 c = self.nextc();
576 if c == b'=' {
577 self.set_yylval_id("&&=");
578 self.lex_state.set(EXPR_BEG);
579 return Self::tOP_ASGN;
580 }
581 self.buffer.pushback(c);
582 return Self::tANDOP;
583 } else if c == b'=' {
584 self.set_yylval_id("&=");
585 self.lex_state.set(EXPR_BEG);
586 return Self::tOP_ASGN;
587 } else if c == b'.' {
588 self.set_yylval_id("&.");
589 self.lex_state.set(EXPR_DOT);
590 return Self::tANDDOT;
591 }
592 self.buffer.pushback(c);
593 if self.lex_state.is_spacearg(c, space_seen) {
594 if c != b':'
595 || {
596 c = self.buffer.peekc_n(1);
597 !c.is_eof()
598 }
599 || !(c == b'\''
600 || c == b'"'
601 || self
602 .buffer
603 .is_identchar(self.buffer.pcur + 1, self.buffer.pend))
604 {
605 self.warn(
606 DiagnosticMessage::AmpersandInterpretedAsArgPrefix {},
607 self.current_loc(),
608 );
609 }
610 result = Self::tAMPER;
611 } else if self.lex_state.is_beg() {
612 result = Self::tAMPER;
613 } else {
614 result = self.warn_balanced(
615 Self::tAMPER2,
616 "&",
617 "argument prefix",
618 c,
619 space_seen,
620 last_state,
621 );
622 }
623 self.lex_state.set(if self.lex_state.is_after_operator() {
624 EXPR_ARG
625 } else {
626 EXPR_BEG
627 });
628 return result;
629 }
630
631 Some(b'|') => {
632 c = self.nextc();
633 if c == b'|' {
634 self.lex_state.set(EXPR_BEG);
635 c = self.nextc();
636 if c == b'=' {
637 self.set_yylval_id("||=");
638 self.lex_state.set(EXPR_BEG);
639 return Self::tOP_ASGN;
640 }
641 self.buffer.pushback(c);
642 if last_state.is_some(EXPR_BEG) {
643 self.buffer.pushback(b'|');
644 return Self::tPIPE;
645 }
646 return Self::tOROP;
647 }
648 if c == b'=' {
649 self.set_yylval_id("|=");
650 self.lex_state.set(EXPR_BEG);
651 return Self::tOP_ASGN;
652 }
653 self.lex_state.set(if self.lex_state.is_after_operator() {
654 EXPR_ARG
655 } else {
656 EXPR_BEG | EXPR_LABEL
657 });
658 self.buffer.pushback(c);
659 return Self::tPIPE;
660 }
661
662 Some(b'+') => {
663 c = self.nextc();
664 if self.lex_state.is_after_operator() {
665 self.lex_state.set(EXPR_ARG);
666 if c == b'@' {
667 return Self::tUPLUS;
668 }
669 self.buffer.pushback(c);
670 return Self::tPLUS;
671 }
672 if c == b'=' {
673 self.set_yylval_id("+=");
674 self.lex_state.set(EXPR_BEG);
675 return Self::tOP_ASGN;
676 }
677 if self.lex_state.is_beg()
678 || (self.lex_state.is_spacearg(c, space_seen)
679 && self.arg_ambiguous(b'+', self.current_loc().adjust_end(-1)))
680 {
681 self.lex_state.set(EXPR_BEG);
682 self.buffer.pushback(c);
683 if !c.is_eof() && c.is_digit() {
684 return self.parse_numeric(b'+');
685 }
686 return Self::tUPLUS;
687 }
688 self.lex_state.set(EXPR_BEG);
689 self.buffer.pushback(c);
690 return self.warn_balanced(
691 Self::tPLUS,
692 "+",
693 "unary operator",
694 c,
695 space_seen,
696 last_state,
697 );
698 }
699
700 Some(b'-') => {
701 c = self.nextc();
702 if self.lex_state.is_after_operator() {
703 self.lex_state.set(EXPR_ARG);
704 if c == b'@' {
705 return Self::tUMINUS;
706 }
707 self.buffer.pushback(c);
708 return Self::tMINUS;
709 }
710 if c == b'=' {
711 self.set_yylval_id("-=");
712 self.lex_state.set(EXPR_BEG);
713 return Self::tOP_ASGN;
714 }
715 if c == b'>' {
716 self.lex_state.set(EXPR_ENDFN);
717 return Self::tLAMBDA;
718 }
719 if self.lex_state.is_beg()
720 || (self.lex_state.is_spacearg(c, space_seen)
721 && self.arg_ambiguous(b'-', self.current_loc().adjust_end(-1)))
722 {
723 self.lex_state.set(EXPR_BEG);
724 self.buffer.pushback(c);
725 if !c.is_eof() && c.is_digit() {
726 return Self::tUMINUS_NUM;
727 }
728 return Self::tUMINUS;
729 }
730 self.lex_state.set(EXPR_BEG);
731 self.buffer.pushback(c);
732 return self.warn_balanced(
733 Self::tMINUS,
734 "-",
735 "unary operator",
736 c,
737 space_seen,
738 last_state,
739 );
740 }
741
742 Some(b'.') => {
743 let is_beg = self.lex_state.is_beg();
744 self.lex_state.set(EXPR_BEG);
745 c = self.nextc();
746 if c == b'.' {
747 c = self.nextc();
748 if c == b'.' {
749 if self.context.in_argdef() {
750 self.lex_state.set(EXPR_ENDARG);
751 return Self::tBDOT3;
752 }
753 if self.paren_nest == 0 && self.buffer.is_looking_at_eol() {
754 self.warn(DiagnosticMessage::TripleDotAtEol {}, self.current_loc());
755 } else if self.lpar_beg >= 0
756 && self.lpar_beg + 1 == self.paren_nest
757 && last_state.is_some(EXPR_LABEL)
758 {
759 return Self::tDOT3;
760 }
761 return if is_beg { Self::tBDOT3 } else { Self::tDOT3 };
762 }
763 self.buffer.pushback(c);
764 return if is_beg { Self::tBDOT2 } else { Self::tDOT2 };
765 }
766 self.buffer.pushback(c);
767 if !c.is_eof() && c.is_digit() {
768 let prev = if self.buffer.pcur - 1 > self.buffer.pbeg {
769 self.buffer.byte_at(self.buffer.pcur - 2)
770 } else {
771 MaybeByte::EndOfInput
772 };
773 self.parse_numeric(b'.');
774 if prev.is_digit() {
775 self.yyerror0(DiagnosticMessage::FractionAfterNumeric {});
776 } else {
777 self.yyerror0(DiagnosticMessage::NoDigitsAfterDot {});
778 }
779 self.lex_state.set(EXPR_END);
780 self.buffer.set_ptok(self.buffer.pcur);
781 continue 'retrying;
782 }
783 self.set_yylval_id(".");
784 self.lex_state.set(EXPR_DOT);
785 return Self::tDOT;
786 }
787
788 Some(c) if c.is_ascii_digit() => {
789 return self.parse_numeric(c);
790 }
791
792 Some(b')') => {
793 self.cond.pop();
794 self.cmdarg.pop();
795 self.lex_state.set(EXPR_ENDFN);
796 self.paren_nest -= 1;
797
798 return Self::tRPAREN;
799 }
800
801 Some(b']') => {
802 self.cond.pop();
803 self.cmdarg.pop();
804 self.lex_state.set(EXPR_END);
805 self.paren_nest -= 1;
806
807 return Self::tRBRACK;
808 }
809
810 Some(b'}') => {
811 if self.brace_nest == 0 {
813 self.brace_nest -= 1;
814 return Self::tSTRING_DEND;
815 }
816 self.brace_nest -= 1;
817 self.cond.pop();
818 self.cmdarg.pop();
819 self.lex_state.set(EXPR_END);
820 self.paren_nest -= 1;
821
822 return Self::tRCURLY;
823 }
824
825 Some(b':') => {
826 c = self.nextc();
827 if c == b':' {
828 if self.lex_state.is_beg()
829 || self.lex_state.is_some(EXPR_CLASS)
830 || self
831 .lex_state
832 .is_spacearg(MaybeByte::EndOfInput, space_seen)
833 {
834 self.lex_state.set(EXPR_BEG);
835 return Self::tCOLON3;
836 }
837 self.set_yylval_id("::");
838 self.lex_state.set(EXPR_DOT);
839 return Self::tCOLON2;
840 }
841 if self.lex_state.is_end() || c.is_space() || c == Some(b'#') {
842 self.buffer.pushback(c);
843 let result = self.warn_balanced(
844 Self::tCOLON,
845 ":",
846 "symbol literal",
847 c,
848 space_seen,
849 last_state,
850 );
851 self.lex_state.set(EXPR_BEG);
852 return result;
853 }
854 match c.as_option() {
855 Some(c) if c == b'\'' => {
856 self.strterm = self.new_strterm(str_ssym, c, None, None)
857 }
858 Some(c) if c == b'"' => {
859 self.strterm = self.new_strterm(str_dsym, c, None, None)
860 }
861 _ => self.buffer.pushback(c),
862 }
863 self.lex_state.set(EXPR_FNAME);
864 return Self::tSYMBEG;
865 }
866
867 Some(b'/') => {
868 if self.lex_state.is_beg() {
869 self.strterm = self.new_strterm(str_regexp, b'/', None, None);
870 return Self::tREGEXP_BEG;
871 }
872 c = self.nextc();
873 if c == b'=' {
874 self.set_yylval_id("/=");
875 self.lex_state.set(EXPR_BEG);
876 return Self::tOP_ASGN;
877 }
878 self.buffer.pushback(c);
879 if self.lex_state.is_spacearg(c, space_seen) {
880 self.arg_ambiguous(b'/', self.current_loc());
881 self.strterm = self.new_strterm(str_regexp, b'/', None, None);
882 return Self::tREGEXP_BEG;
883 }
884 self.lex_state.set(if self.lex_state.is_after_operator() {
885 EXPR_ARG
886 } else {
887 EXPR_BEG
888 });
889 return self.warn_balanced(
890 Self::tDIVIDE,
891 "/",
892 "regexp literal",
893 c,
894 space_seen,
895 last_state,
896 );
897 }
898
899 Some(b'^') => {
900 c = self.nextc();
901 if c == b'=' {
902 self.set_yylval_id("^=");
903 self.lex_state.set(EXPR_BEG);
904 return Self::tOP_ASGN;
905 }
906 self.lex_state.set(if self.lex_state.is_after_operator() {
907 EXPR_ARG
908 } else {
909 EXPR_BEG
910 });
911 self.buffer.pushback(c);
912 return Self::tCARET;
913 }
914
915 Some(b';') => {
916 self.lex_state.set(EXPR_BEG);
917 self.command_start = true;
918 return Self::tSEMI;
919 }
920
921 Some(b',') => {
922 self.lex_state.set(EXPR_BEG | EXPR_LABEL);
923 return Self::tCOMMA;
924 }
925
926 Some(b'~') => {
927 if self.lex_state.is_after_operator() {
928 c = self.nextc();
929 if c != b'@' {
930 self.buffer.pushback(c);
931 }
932 self.lex_state.set(EXPR_ARG);
933 } else {
934 self.lex_state.set(EXPR_BEG);
935 }
936
937 return Self::tTILDE;
938 }
939
940 Some(b'(') => {
941 let mut result: i32 = Self::tLPAREN2;
942
943 if self.lex_state.is_beg() {
944 result = Self::tLPAREN;
945 } else if !space_seen {
946 } else if self.lex_state.is_arg()
948 || self.lex_state.is_all(EXPR_END | EXPR_LABEL)
949 {
950 result = Self::tLPAREN_ARG;
951 } else if self.lex_state.is_some(EXPR_ENDFN) && !self.is_lambda_beginning() {
952 self.warn(
953 DiagnosticMessage::ParenthesesIterpretedAsArglist {},
954 self.current_loc(),
955 );
956 }
957
958 self.paren_nest += 1;
959 self.cond.push(false);
960 self.cmdarg.push(false);
961 self.lex_state.set(EXPR_BEG | EXPR_LABEL);
962
963 return result;
964 }
965
966 Some(b'[') => {
967 let mut result: i32 = Self::tLBRACK2;
968
969 self.paren_nest += 1;
970 if self.lex_state.is_after_operator() {
971 c = self.nextc();
972 if c == b']' {
973 self.paren_nest -= 1;
974 self.lex_state.set(EXPR_ARG);
975 c = self.nextc();
976 if c == b'=' {
977 return Self::tASET;
978 }
979 self.buffer.pushback(c);
980 return Self::tAREF;
981 }
982 self.buffer.pushback(c);
983 self.lex_state.set(EXPR_ARG | EXPR_LABEL);
984 return Self::tLBRACK2;
985 } else if self.lex_state.is_beg()
986 || (self.lex_state.is_arg()
987 && (space_seen || self.lex_state.is_some(EXPR_LABELED)))
988 {
989 result = Self::tLBRACK;
990 }
991 self.lex_state.set(EXPR_BEG | EXPR_LABEL);
992 self.cond.push(false);
993 self.cmdarg.push(false);
994 return result;
995 }
996
997 Some(b'{') => {
998 self.brace_nest += 1;
999
1000 let result: i32;
1001
1002 if self.is_lambda_beginning() {
1003 result = Self::tLAMBEG;
1004 } else if self.lex_state.is_some(EXPR_LABELED) {
1005 result = Self::tLBRACE;
1006 } else if self.lex_state.is_some(EXPR_ARG_ANY | EXPR_END | EXPR_ENDFN) {
1007 result = Self::tLCURLY;
1008 } else if self.lex_state.is_some(EXPR_ENDARG) {
1009 result = Self::tLBRACE_ARG;
1010 } else {
1011 result = Self::tLBRACE;
1012 }
1013
1014 if result != Self::tLBRACE {
1015 self.command_start = true;
1016 self.lex_state.set(EXPR_BEG);
1017 } else {
1018 self.lex_state.set(EXPR_BEG | EXPR_LABEL);
1019 }
1020
1021 self.paren_nest += 1;
1022 self.cond.push(false);
1023 self.cmdarg.push(false);
1024 return result;
1025 }
1026
1027 Some(b'\\') => {
1028 c = self.nextc();
1029 if c == b'\n' {
1030 space_seen = true;
1031 continue 'retrying; }
1033 if c == b' ' {
1034 return Self::tSP;
1035 }
1036 if c.is_space() {
1037 match c.as_option() {
1038 Some(b'\t') => return Self::tSLASH_T,
1039 Some(Self::LF_CHAR) => return Self::tSLASH_F,
1040 Some(b'\r') => return Self::tSLASH_R,
1041 Some(Self::VTAB_CHAR) => return Self::tVTAB,
1042 Some(other) => unreachable!("unsupported space char {:?}", other),
1043 None => {}
1044 }
1045 }
1046 self.buffer.pushback(c);
1047 return Self::tBACKSLASH;
1048 }
1049
1050 Some(b'%') => {
1051 return self.parse_percent(space_seen, last_state);
1052 }
1053
1054 Some(b'$') => {
1055 return self.parse_gvar(last_state);
1056 }
1057
1058 Some(b'@') => {
1059 return self.parse_atmark(last_state);
1060 }
1061
1062 Some(b'_') => {
1063 if self.buffer.was_bol() && self.buffer.is_whole_match(b"__END__", 0) {
1064 self.buffer.eofp = true;
1065 return Self::END_OF_INPUT;
1066 }
1067 self.newtok();
1068 }
1069
1070 Some(c) => {
1071 if !self.is_identchar() {
1072 self.compile_error(
1073 DiagnosticMessage::InvalidChar { c },
1074 self.current_loc(),
1075 );
1076 self.token_flush();
1077 continue 'retrying;
1078 }
1079
1080 self.newtok();
1081 }
1082 }
1083
1084 break;
1085 }
1086
1087 self.parse_ident(c, cmd_state)
1088 }
1089
1090 fn normal_newline_leaf_label(&mut self) -> i32 {
1091 self.command_start = true;
1092 self.lex_state.set(EXPR_BEG);
1093 Self::tNL
1094 }
1095
1096 pub(crate) fn warn(&mut self, message: DiagnosticMessage, loc: Loc) {
1097 println_if_debug_lexer!("WARNING: {}", message.render());
1098 let diagnostic = Diagnostic {
1099 level: ErrorLevel::Warning,
1100 message,
1101 loc,
1102 };
1103 self.diagnostics.emit(diagnostic);
1104 }
1105
1106 pub(crate) fn warn_balanced(
1107 &mut self,
1108 token_type: i32,
1109 op: &'static str,
1110 syn: &'static str,
1111 c: MaybeByte,
1112 space_seen: bool,
1113 last_state: LexState,
1114 ) -> i32 {
1115 if !last_state.is_some(EXPR_CLASS | EXPR_DOT | EXPR_FNAME | EXPR_ENDFN)
1116 && space_seen & !c.is_space()
1117 {
1118 self.warn(
1119 DiagnosticMessage::AmbiguousOperator {
1120 operator: op.to_string(),
1121 interpreted_as: syn.to_string(),
1122 },
1123 self.current_loc(),
1124 );
1125 }
1126 token_type
1127 }
1128
1129 pub(crate) fn compile_error(&mut self, message: DiagnosticMessage, loc: Loc) {
1130 println_if_debug_lexer!("Compile error: {}", message.render());
1131 let diagnostic = Diagnostic {
1132 level: ErrorLevel::Error,
1133 message,
1134 loc,
1135 };
1136 self.diagnostics.emit(diagnostic);
1137 }
1138
1139 pub(crate) fn new_strterm(
1140 &self,
1141 func: usize,
1142 term: u8,
1143 paren: Option<u8>,
1144 heredoc_end: Option<HeredocEnd>,
1145 ) -> Option<Box<StrTerm>> {
1146 Some(Box::new(StrTerm::new_literal(StringLiteral::new(
1147 0,
1148 func,
1149 paren,
1150 term,
1151 heredoc_end,
1152 ))))
1153 }
1154
1155 pub(crate) fn loc(&self, begin_pos: usize, end_pos: usize) -> Loc {
1156 Loc {
1157 begin: begin_pos,
1158 end: end_pos,
1159 }
1160 }
1161
1162 pub(crate) fn current_loc(&self) -> Loc {
1163 self.loc(self.buffer.ptok, self.buffer.pcur)
1164 }
1165
1166 pub(crate) fn arg_ambiguous(&mut self, c: u8, loc: Loc) -> bool {
1167 if c == b'/' {
1168 self.warn(DiagnosticMessage::AmbiguousRegexp {}, loc);
1169 } else {
1170 self.warn(
1171 DiagnosticMessage::AmbiguousFirstArgument { operator: c },
1172 loc,
1173 );
1174 }
1175 true
1176 }
1177
    /// Length of the token buffer accumulated so far.
    pub(crate) fn toklen(&self) -> usize {
        self.tokenbuf.len()
    }
1181
    /// Intentionally a no-op.
    pub(crate) fn tokfix(&self) {
    }
1185
1186 pub(crate) fn yyerror0(&mut self, message: DiagnosticMessage) {
1187 self.yyerror1(message, self.current_loc());
1188 }
1189
1190 pub(crate) fn yyerror1(&mut self, message: DiagnosticMessage, loc: Loc) {
1191 println_if_debug_lexer!("yyerror0: {}", message.render());
1192 let diagnostic = Diagnostic {
1193 level: ErrorLevel::Error,
1194 message,
1195 loc,
1196 };
1197 self.diagnostics.emit(diagnostic);
1198 }
1199
    /// True when the current bracket nesting depth equals `lpar_beg`; an
    /// opening `{` is then lexed as `tLAMBEG`.
    pub(crate) fn is_lambda_beginning(&self) -> bool {
        self.lpar_beg == self.paren_nest
    }
1203
1204 pub(crate) fn tokadd_ident(&mut self, mut c: MaybeByte) -> bool {
1205 loop {
1206 if self.tokadd_mbchar(c).is_err() {
1207 return true;
1208 }
1209 c = self.nextc();
1210
1211 if !self.is_identchar() {
1212 break;
1213 }
1214 }
1215
1216 self.buffer.pushback(c);
1217 false
1218 }
1219
1220 pub(crate) fn newtok(&mut self) {
1221 self.buffer.tokidx = 0;
1222 self.buffer.tokline = self.buffer.ruby_sourceline;
1223 self.tokenbuf = TokenBuf::default();
1224 }
1225
    /// Moves the buffer's token start to `ptok`.
    pub(crate) fn literal_flush(&mut self, ptok: usize) {
        self.buffer.set_ptok(ptok);
    }
1229
1230 pub(crate) fn tokadd_mbchar(&mut self, c: MaybeByte) -> Result<(), ()> {
1231 let mut len = match self.multibyte_char_len(self.buffer.pcur - 1) {
1232 Some(len) => len,
1233 None => return Err(()),
1234 };
1235
1236 match c {
1237 MaybeByte::EndOfInput => return Err(()),
1238 _ => self.tokadd(c),
1239 }
1240
1241 len -= 1;
1242 self.buffer.pcur += len;
1243 self.tokcopy(len);
1244 Ok(())
1245 }
1246
1247 fn _multibyte_char_len(&self, ptr: usize) -> Option<usize> {
1248 let c1 = self.buffer.byte_at(ptr).as_option()?;
1249
1250 let len = if c1 & 0x80 == 0 {
1251 1
1252 } else if c1 & 0xE0 == 0xC0 {
1253 2
1254 } else if c1 & 0xF0 == 0xE0 {
1255 3
1256 } else if c1 & 0xF8 == 0xF0 {
1257 4
1258 } else {
1259 return None;
1261 };
1262
1263 let bytes = self.buffer.substr_at(ptr, ptr + len)?;
1264 std::str::from_utf8(bytes).ok()?;
1265 Some(len)
1266 }
1267
1268 pub(crate) fn multibyte_char_len(&mut self, ptr: usize) -> Option<usize> {
1269 let result = self._multibyte_char_len(ptr);
1270 if result.is_none() {
1271 self.yyerror0(DiagnosticMessage::InvalidMultibyteChar {});
1272 }
1273 result
1274 }
1275
1276 pub(crate) fn is_label_suffix(&self, n: usize) -> bool {
1277 self.buffer.peek_n(b':', n) && !self.buffer.peek_n(b':', n + 1)
1278 }
1279
    /// Whether `name` is declared in the static environment.
    pub(crate) fn is_lvar_defined(&self, name: &str) -> bool {
        self.static_env.is_declared(name)
    }
1283}