lib_ruby_parser/lexer/
main.rs

1use alloc_from_pool::{Factory as PoolFactory, PoolValue};
2
3use crate::lexer::*;
4use crate::maybe_byte::*;
5use crate::source::buffer::*;
6use crate::source::Comment;
7use crate::source::Decoder;
8use crate::source::MagicComment;
9use crate::str_term::{str_types::*, HeredocEnd, StrTerm, StringLiteral};
10use crate::Loc;
11use crate::SharedContext;
12use crate::StackState;
13use crate::StaticEnvironment;
14use crate::Token;
15use crate::TokenBuf;
16use crate::{error::Diagnostics, Bytes};
17use crate::{lex_states::*, LexState};
18use crate::{Diagnostic, DiagnosticMessage, ErrorLevel};
19
20/// A struct responsible for converting a given input
21/// into a sequence of tokens
22#[derive(Debug, Default)]
23pub struct Lexer {
24    pub(crate) buffer: Buffer,
25
26    pub(crate) lval: Option<Bytes>,
27    pub(crate) lval_start: Option<usize>,
28    pub(crate) lval_end: Option<usize>,
29
30    pub(crate) strterm: Option<Box<StrTerm>>,
31    /// Current state of the lexer, used internally for testing
32    pub lex_state: LexState,
33    pub(crate) paren_nest: i32,
34    pub(crate) lpar_beg: i32,
35    pub(crate) brace_nest: i32,
36
37    /// Internal field, used to differentiate kDO_COND vs kDO,
38    /// exposed for internal testing
39    pub cond: StackState,
40    /// Internal field, used to differentiate kDO_BLOCK vs kDO,
41    /// exposed for internal testing
42    pub cmdarg: StackState,
43
44    pub(crate) tokenbuf: TokenBuf,
45
46    // pub(crate) max_numparam: usize,
47    pub(crate) context: SharedContext,
48
49    pub(crate) command_start: bool,
50    pub(crate) token_seen: bool,
51
52    /// Stack of sets of variables in current scopes.
53    /// Each stack item represents locals in the scope.
54    ///
55    /// You can use it to pre-define some locals and parse
56    /// your input as if these locals exist.
57    ///
58    /// For example, you can parse the following code
59    ///
60    /// ```text
61    /// a = b + c
62    /// ```
63    ///
64    /// as
65    ///
66    /// ```text
67    /// Send(LocalVar(a), "+", LocalVar(b))
68    /// ```
69    ///
70    /// by declaring `a` and `b` as locals using
71    ///
72    /// ```text
73    /// parser.lexer.static_env.declare("a")
74    /// parser.lexer.static_env.declare("b")
75    /// parser.parse()
76    /// ```
77    pub static_env: StaticEnvironment,
78
79    pub(crate) diagnostics: Diagnostics,
80    pub(crate) comments: Vec<Comment>,
81    pub(crate) magic_comments: Vec<MagicComment>,
82
83    #[doc(hidden)]
84    pub tokens_factory: PoolFactory<Token>,
85}
86
87impl Lexer {
/// NUL byte; `parser_yylex` treats it as end of input.
pub(crate) const NULL_CHAR: u8 = b'\0';
/// Ctrl-D (ASCII EOT, 0x04); `parser_yylex` treats it as end of input.
pub(crate) const CTRL_D_CHAR: u8 = b'\x04';
/// Ctrl-Z (ASCII SUB, 0x1a); `parser_yylex` treats it as end of input.
pub(crate) const CTRL_Z_CHAR: u8 = b'\x1a';
/// Form feed (`\f`, 0x0c), skipped as whitespace.
///
/// NOTE: despite the `LF` in its name this is NOT line feed (`\n`, 0x0a).
pub(crate) const LF_CHAR: u8 = b'\x0c';
/// Vertical tab (`\v`, 0x0b), skipped as whitespace.
pub(crate) const VTAB_CHAR: u8 = b'\x0b';
93
94    /// Constructs an instance of Lexer
95    pub fn new<Bytes, Name>(bytes: Bytes, name: Name, decoder: Option<Decoder>) -> Self
96    where
97        Bytes: Into<Vec<u8>>,
98        Name: Into<String>,
99    {
100        Self {
101            cond: StackState::new("cond"),
102            cmdarg: StackState::new("cmdarg"),
103            lpar_beg: -1, /* make lambda_beginning_p() == FALSE at first */
104            buffer: Buffer::new(name.into(), bytes.into(), decoder),
105            ..Self::default()
106        }
107    }
108
109    /// Tokenizes given input until EOF
110    ///
111    /// Keep in mind that Lexer in Ruby is driven by Parser,
112    /// and so this method on its own can return a wrong sequence
113    /// of tokens. It's used internally to test simple inputs.
114    ///
115    /// If you need to get tokens better use `ParserResult::tokens` field
116    pub fn tokenize_until_eof(&mut self) -> Vec<Token> {
117        let mut tokens = vec![];
118
119        loop {
120            let token = self.yylex().take_value();
121            match token.token_type {
122                Self::END_OF_INPUT => break,
123                _ => tokens.push(token),
124            }
125        }
126
127        tokens
128    }
129
130    pub(crate) fn yylex(&mut self) -> PoolValue<Token> {
131        self.lval = None;
132
133        let token_type = self.parser_yylex();
134
135        let begin = std::mem::take(&mut self.lval_start).unwrap_or(self.buffer.ptok);
136        let mut end = std::mem::take(&mut self.lval_end).unwrap_or(self.buffer.pcur);
137
138        let mut token_value = self
139            .lval
140            .take()
141            .or_else(|| {
142                // take raw value if nothing was manually captured
143                self.buffer
144                    .substr_at(begin, end)
145                    .map(|s| Bytes::new(Vec::from(s)))
146            })
147            .unwrap_or_else(|| Bytes::new(vec![]));
148
149        if token_type == Self::tNL {
150            token_value = Bytes::new(vec![b'\n']);
151            end = begin + 1;
152        }
153
154        let token = self.tokens_factory.alloc(Token {
155            token_type,
156            token_value,
157            loc: Loc { begin, end },
158        });
159        println_if_debug_lexer!(
160            "yylex ({:?}, {:?}, {:?})",
161            token.token_name(),
162            token.token_value,
163            token.loc
164        );
165        token
166    }
167
168    pub(crate) fn nextc(&mut self) -> MaybeByte {
169        self.buffer.nextc()
170    }
171    pub(crate) fn char_at(&self, idx: usize) -> MaybeByte {
172        self.buffer.byte_at(idx)
173    }
174    pub(crate) fn token_flush(&mut self) {
175        self.buffer.token_flush()
176    }
177
178    pub(crate) fn parser_yylex(&mut self) -> i32 {
179        let mut c: MaybeByte;
180        let mut space_seen: bool = false;
181        let label: usize;
182        let mut last_state: LexState;
183        let token_seen = self.token_seen;
184
185        if let Some(strterm) = self.strterm.as_ref().map(|i| i.as_ref()) {
186            match strterm {
187                StrTerm::HeredocLiteral(_) => {
188                    return self.here_document();
189                }
190
191                StrTerm::StringLiteral(_) => {
192                    self.token_flush();
193                    return self.parse_string();
194                }
195            }
196        }
197
198        let cmd_state = self.command_start;
199        self.command_start = false;
200        self.token_seen = true;
201
202        'retrying: loop {
203            last_state = self.lex_state;
204            self.token_flush();
205
206            // handle EOF
207            c = self.nextc();
208
209            if c.is_eof() {
210                return Self::END_OF_INPUT;
211            }
212
213            match c.as_option() {
214                None
215                | Some(Self::NULL_CHAR)
216                | Some(Self::CTRL_D_CHAR)
217                | Some(Self::CTRL_Z_CHAR) => return Self::END_OF_INPUT,
218
219                // whitespaces
220                Some(b'\r') => {
221                    if !self.buffer.cr_seen {
222                        self.buffer.cr_seen = true;
223                        self.warn(
224                            DiagnosticMessage::SlashRAtMiddleOfLine {},
225                            self.current_loc(),
226                        );
227                    }
228                }
229
230                Some(b' ') | Some(b'\t') | Some(Self::LF_CHAR) | Some(Self::VTAB_CHAR) => {
231                    space_seen = true;
232                    continue 'retrying;
233                }
234
235                Some(b'#') | Some(b'\n') => {
236                    if c == b'#' {
237                        // it's a comment
238                        self.token_seen = token_seen;
239                        // no magic_comment in shebang line
240                        let magic_comment = self
241                            .magic_comment(self.buffer.pcur, self.buffer.pend - self.buffer.pcur);
242                        match magic_comment {
243                            Ok(magic_comment) => {
244                                if !magic_comment && self.comment_at_top() {
245                                    self.set_file_encoding(self.buffer.pcur, self.buffer.pend)
246                                }
247                            }
248                            Err(_) => return Self::END_OF_INPUT,
249                        }
250                        self.buffer.goto_eol();
251                        self.comments
252                            .push(Comment::new(self.current_loc(), &self.buffer.input.decoded))
253                    }
254                    self.token_seen = token_seen;
255                    let cc = self
256                        .lex_state
257                        .is_some(EXPR_BEG | EXPR_CLASS | EXPR_FNAME | EXPR_DOT)
258                        && !self.lex_state.is_some(EXPR_LABELED);
259                    if cc || self.lex_state.is_all(EXPR_ARG | EXPR_LABELED) {
260                        if !cc && self.context.in_kwarg() {
261                            return self.normal_newline_leaf_label();
262                        }
263                        continue 'retrying;
264                    }
265
266                    loop {
267                        // while(1)
268                        c = self.nextc();
269
270                        #[allow(clippy::never_loop)]
271                        // emulate ugly C switch with fall-through logic
272                        loop {
273                            if c == b' '
274                                || c == b'\t'
275                                || c == Self::LF_CHAR
276                                || c == b'\r'
277                                || c == Self::VTAB_CHAR
278                            {
279                                space_seen = true;
280                                break;
281                            }
282
283                            if c == b'#' {
284                                self.buffer.pushback(c);
285                                continue 'retrying;
286                            }
287
288                            if c == b'&' || c == b'.' {
289                                if self.buffer.peek(b'.') == (c == b'&') {
290                                    self.buffer.pushback(c);
291                                    continue 'retrying;
292                                }
293                            }
294
295                            if c.is_eof() {
296                                // EOF no decrement
297                                self.buffer.eof_no_decrement();
298                                return self.normal_newline_leaf_label();
299                            }
300
301                            // default:
302                            self.buffer.ruby_sourceline -= 1;
303                            self.buffer.nextline = self.buffer.lastline;
304                            // -1 branch fallthrough
305                            self.buffer.eof_no_decrement();
306                            return self.normal_newline_leaf_label();
307                        }
308                    }
309                }
310
311                Some(b'*') => {
312                    let result: i32;
313
314                    c = self.nextc();
315
316                    if c == b'*' {
317                        c = self.nextc();
318                        if c == b'=' {
319                            self.set_yylval_id("**=");
320                            self.lex_state.set(EXPR_BEG);
321                            return Self::tOP_ASGN;
322                        }
323                        self.buffer.pushback(c);
324                        if self.lex_state.is_spacearg(c, space_seen) {
325                            self.warn(
326                                DiagnosticMessage::DStarInterpretedAsArgPrefix {},
327                                self.current_loc(),
328                            );
329                            result = Self::tDSTAR;
330                        } else if self.lex_state.is_beg() {
331                            result = Self::tDSTAR;
332                        } else {
333                            result = self.warn_balanced(
334                                Self::tPOW,
335                                "**",
336                                "argument prefix",
337                                c,
338                                space_seen,
339                                last_state,
340                            );
341                        }
342                    } else {
343                        if c == b'=' {
344                            self.set_yylval_id("*=");
345                            self.lex_state.set(EXPR_BEG);
346                            return Self::tOP_ASGN;
347                        }
348                        self.buffer.pushback(c);
349                        if self.lex_state.is_spacearg(c, space_seen) {
350                            self.warn(
351                                DiagnosticMessage::StarInterpretedAsArgPrefix {},
352                                self.current_loc(),
353                            );
354                            result = Self::tSTAR;
355                        } else if self.lex_state.is_beg() {
356                            result = Self::tSTAR;
357                        } else {
358                            result = self.warn_balanced(
359                                Self::tSTAR2,
360                                "*",
361                                "argument prefix",
362                                c,
363                                space_seen,
364                                last_state,
365                            );
366                        }
367                    }
368
369                    self.lex_state.set(if self.lex_state.is_after_operator() {
370                        EXPR_ARG
371                    } else {
372                        EXPR_BEG
373                    });
374                    return result;
375                }
376
377                Some(b'!') => {
378                    c = self.nextc();
379                    if self.lex_state.is_after_operator() {
380                        self.lex_state.set(EXPR_ARG);
381                        if c == b'@' {
382                            return Self::tBANG;
383                        }
384                    } else {
385                        self.lex_state.set(EXPR_BEG);
386                    }
387                    if c == b'=' {
388                        return Self::tNEQ;
389                    }
390                    if c == b'~' {
391                        return Self::tNMATCH;
392                    }
393                    self.buffer.pushback(c);
394                    return Self::tBANG;
395                }
396
397                Some(b'=') => {
398                    if self.buffer.was_bol() {
399                        // skip embedded rd document
400                        if self.buffer.is_word_match("begin") {
401                            let begin_loc = self.loc(self.buffer.pcur - 1, self.buffer.pcur + 5);
402                            self.buffer.goto_eol();
403                            loop {
404                                self.buffer.goto_eol();
405                                c = self.nextc();
406                                if c.is_eof() {
407                                    self.compile_error(
408                                        DiagnosticMessage::EmbeddedDocumentMeetsEof {},
409                                        begin_loc,
410                                    );
411                                    return Self::END_OF_INPUT;
412                                }
413                                if c == b'=' && self.buffer.is_word_match("end") {
414                                    break;
415                                }
416                                self.buffer.pushback(c);
417                            }
418                            self.buffer.goto_eol();
419                            self.comments.push(Comment::new(
420                                begin_loc.with_end(self.buffer.pcur),
421                                &self.buffer.input.decoded,
422                            ));
423                            continue 'retrying;
424                        }
425                    }
426
427                    self.lex_state.set(if self.lex_state.is_after_operator() {
428                        EXPR_ARG
429                    } else {
430                        EXPR_BEG
431                    });
432                    c = self.nextc();
433                    if c == b'=' {
434                        c = self.nextc();
435                        if c == b'=' {
436                            return Self::tEQQ;
437                        }
438                        self.buffer.pushback(c);
439                        return Self::tEQ;
440                    }
441                    if c == b'~' {
442                        return Self::tMATCH;
443                    } else if c == b'>' {
444                        return Self::tASSOC;
445                    }
446                    self.buffer.pushback(c);
447                    return Self::tEQL;
448                }
449
450                Some(b'<') => {
451                    c = self.nextc();
452                    if c == b'<'
453                        && !self.lex_state.is_some(EXPR_DOT | EXPR_CLASS)
454                        && !self.lex_state.is_end()
455                        && (!self.lex_state.is_arg()
456                            || self.lex_state.is_some(EXPR_LABELED)
457                            || space_seen)
458                    {
459                        if let Some(token) = self.heredoc_identifier() {
460                            return token;
461                        }
462                    }
463                    if self.lex_state.is_after_operator() {
464                        self.lex_state.set(EXPR_ARG);
465                    } else {
466                        if self.lex_state.is_some(EXPR_CLASS) {
467                            self.command_start = true;
468                        }
469                        self.lex_state.set(EXPR_BEG);
470                    }
471                    if c == b'=' {
472                        c = self.nextc();
473                        if c == b'>' {
474                            return Self::tCMP;
475                        }
476                        self.buffer.pushback(c);
477                        return Self::tLEQ;
478                    }
479                    if c == b'<' {
480                        c = self.nextc();
481                        if c == b'=' {
482                            self.set_yylval_id("<<=");
483                            self.lex_state.set(EXPR_BEG);
484                            return Self::tOP_ASGN;
485                        }
486                        self.buffer.pushback(c);
487                        return self.warn_balanced(
488                            Self::tLSHFT,
489                            "<<",
490                            "here document",
491                            c,
492                            space_seen,
493                            last_state,
494                        );
495                    }
496                    self.buffer.pushback(c);
497                    return Self::tLT;
498                }
499
500                Some(b'>') => {
501                    self.lex_state.set(if self.lex_state.is_after_operator() {
502                        EXPR_ARG
503                    } else {
504                        EXPR_BEG
505                    });
506
507                    c = self.nextc();
508                    if c == b'=' {
509                        return Self::tGEQ;
510                    }
511
512                    if c == b'>' {
513                        c = self.nextc();
514                        if c == b'=' {
515                            self.set_yylval_id(">>=");
516                            self.lex_state.set(EXPR_BEG);
517                            return Self::tOP_ASGN;
518                        }
519                        self.buffer.pushback(c);
520                        return Self::tRSHFT;
521                    }
522                    self.buffer.pushback(c);
523                    return Self::tGT;
524                }
525
526                Some(b'"') => {
527                    label = if self.lex_state.is_label_possible(cmd_state) {
528                        str_label
529                    } else {
530                        0
531                    };
532                    self.strterm = self.new_strterm(str_dquote | label, b'"', None, None);
533                    self.buffer.set_ptok(self.buffer.pcur - 1);
534                    return Self::tSTRING_BEG;
535                }
536
537                Some(b'`') => {
538                    if self.lex_state.is_some(EXPR_FNAME) {
539                        self.lex_state.set(EXPR_ENDFN);
540                        return Self::tBACK_REF2;
541                    }
542                    if self.lex_state.is_some(EXPR_DOT) {
543                        if cmd_state {
544                            self.lex_state.set(EXPR_CMDARG);
545                        } else {
546                            self.lex_state.set(EXPR_ARG);
547                        }
548                        return Self::tBACK_REF2;
549                    }
550                    self.strterm = self.new_strterm(str_xquote, b'`', None, None);
551                    return Self::tXSTRING_BEG;
552                }
553
554                Some(b'\'') => {
555                    label = if self.lex_state.is_label_possible(cmd_state) {
556                        str_label
557                    } else {
558                        0
559                    };
560                    self.strterm = self.new_strterm(str_squote | label, b'\'', None, None);
561                    self.buffer.set_ptok(self.buffer.pcur - 1);
562                    return Self::tSTRING_BEG;
563                }
564
565                Some(b'?') => {
566                    return self.parse_qmark(space_seen).unwrap_or(-1);
567                }
568
569                Some(b'&') => {
570                    let result: i32;
571
572                    c = self.nextc();
573                    if c == b'&' {
574                        self.lex_state.set(EXPR_BEG);
575                        c = self.nextc();
576                        if c == b'=' {
577                            self.set_yylval_id("&&=");
578                            self.lex_state.set(EXPR_BEG);
579                            return Self::tOP_ASGN;
580                        }
581                        self.buffer.pushback(c);
582                        return Self::tANDOP;
583                    } else if c == b'=' {
584                        self.set_yylval_id("&=");
585                        self.lex_state.set(EXPR_BEG);
586                        return Self::tOP_ASGN;
587                    } else if c == b'.' {
588                        self.set_yylval_id("&.");
589                        self.lex_state.set(EXPR_DOT);
590                        return Self::tANDDOT;
591                    }
592                    self.buffer.pushback(c);
593                    if self.lex_state.is_spacearg(c, space_seen) {
594                        if c != b':'
595                            || {
596                                c = self.buffer.peekc_n(1);
597                                !c.is_eof()
598                            }
599                            || !(c == b'\''
600                                || c == b'"'
601                                || self
602                                    .buffer
603                                    .is_identchar(self.buffer.pcur + 1, self.buffer.pend))
604                        {
605                            self.warn(
606                                DiagnosticMessage::AmpersandInterpretedAsArgPrefix {},
607                                self.current_loc(),
608                            );
609                        }
610                        result = Self::tAMPER;
611                    } else if self.lex_state.is_beg() {
612                        result = Self::tAMPER;
613                    } else {
614                        result = self.warn_balanced(
615                            Self::tAMPER2,
616                            "&",
617                            "argument prefix",
618                            c,
619                            space_seen,
620                            last_state,
621                        );
622                    }
623                    self.lex_state.set(if self.lex_state.is_after_operator() {
624                        EXPR_ARG
625                    } else {
626                        EXPR_BEG
627                    });
628                    return result;
629                }
630
631                Some(b'|') => {
632                    c = self.nextc();
633                    if c == b'|' {
634                        self.lex_state.set(EXPR_BEG);
635                        c = self.nextc();
636                        if c == b'=' {
637                            self.set_yylval_id("||=");
638                            self.lex_state.set(EXPR_BEG);
639                            return Self::tOP_ASGN;
640                        }
641                        self.buffer.pushback(c);
642                        if last_state.is_some(EXPR_BEG) {
643                            self.buffer.pushback(b'|');
644                            return Self::tPIPE;
645                        }
646                        return Self::tOROP;
647                    }
648                    if c == b'=' {
649                        self.set_yylval_id("|=");
650                        self.lex_state.set(EXPR_BEG);
651                        return Self::tOP_ASGN;
652                    }
653                    self.lex_state.set(if self.lex_state.is_after_operator() {
654                        EXPR_ARG
655                    } else {
656                        EXPR_BEG | EXPR_LABEL
657                    });
658                    self.buffer.pushback(c);
659                    return Self::tPIPE;
660                }
661
662                Some(b'+') => {
663                    c = self.nextc();
664                    if self.lex_state.is_after_operator() {
665                        self.lex_state.set(EXPR_ARG);
666                        if c == b'@' {
667                            return Self::tUPLUS;
668                        }
669                        self.buffer.pushback(c);
670                        return Self::tPLUS;
671                    }
672                    if c == b'=' {
673                        self.set_yylval_id("+=");
674                        self.lex_state.set(EXPR_BEG);
675                        return Self::tOP_ASGN;
676                    }
677                    if self.lex_state.is_beg()
678                        || (self.lex_state.is_spacearg(c, space_seen)
679                            && self.arg_ambiguous(b'+', self.current_loc().adjust_end(-1)))
680                    {
681                        self.lex_state.set(EXPR_BEG);
682                        self.buffer.pushback(c);
683                        if !c.is_eof() && c.is_digit() {
684                            return self.parse_numeric(b'+');
685                        }
686                        return Self::tUPLUS;
687                    }
688                    self.lex_state.set(EXPR_BEG);
689                    self.buffer.pushback(c);
690                    return self.warn_balanced(
691                        Self::tPLUS,
692                        "+",
693                        "unary operator",
694                        c,
695                        space_seen,
696                        last_state,
697                    );
698                }
699
700                Some(b'-') => {
701                    c = self.nextc();
702                    if self.lex_state.is_after_operator() {
703                        self.lex_state.set(EXPR_ARG);
704                        if c == b'@' {
705                            return Self::tUMINUS;
706                        }
707                        self.buffer.pushback(c);
708                        return Self::tMINUS;
709                    }
710                    if c == b'=' {
711                        self.set_yylval_id("-=");
712                        self.lex_state.set(EXPR_BEG);
713                        return Self::tOP_ASGN;
714                    }
715                    if c == b'>' {
716                        self.lex_state.set(EXPR_ENDFN);
717                        return Self::tLAMBDA;
718                    }
719                    if self.lex_state.is_beg()
720                        || (self.lex_state.is_spacearg(c, space_seen)
721                            && self.arg_ambiguous(b'-', self.current_loc().adjust_end(-1)))
722                    {
723                        self.lex_state.set(EXPR_BEG);
724                        self.buffer.pushback(c);
725                        if !c.is_eof() && c.is_digit() {
726                            return Self::tUMINUS_NUM;
727                        }
728                        return Self::tUMINUS;
729                    }
730                    self.lex_state.set(EXPR_BEG);
731                    self.buffer.pushback(c);
732                    return self.warn_balanced(
733                        Self::tMINUS,
734                        "-",
735                        "unary operator",
736                        c,
737                        space_seen,
738                        last_state,
739                    );
740                }
741
742                Some(b'.') => {
743                    let is_beg = self.lex_state.is_beg();
744                    self.lex_state.set(EXPR_BEG);
745                    c = self.nextc();
746                    if c == b'.' {
747                        c = self.nextc();
748                        if c == b'.' {
749                            if self.context.in_argdef() {
750                                self.lex_state.set(EXPR_ENDARG);
751                                return Self::tBDOT3;
752                            }
753                            if self.paren_nest == 0 && self.buffer.is_looking_at_eol() {
754                                self.warn(DiagnosticMessage::TripleDotAtEol {}, self.current_loc());
755                            } else if self.lpar_beg >= 0
756                                && self.lpar_beg + 1 == self.paren_nest
757                                && last_state.is_some(EXPR_LABEL)
758                            {
759                                return Self::tDOT3;
760                            }
761                            return if is_beg { Self::tBDOT3 } else { Self::tDOT3 };
762                        }
763                        self.buffer.pushback(c);
764                        return if is_beg { Self::tBDOT2 } else { Self::tDOT2 };
765                    }
766                    self.buffer.pushback(c);
767                    if !c.is_eof() && c.is_digit() {
768                        let prev = if self.buffer.pcur - 1 > self.buffer.pbeg {
769                            self.buffer.byte_at(self.buffer.pcur - 2)
770                        } else {
771                            MaybeByte::EndOfInput
772                        };
773                        self.parse_numeric(b'.');
774                        if prev.is_digit() {
775                            self.yyerror0(DiagnosticMessage::FractionAfterNumeric {});
776                        } else {
777                            self.yyerror0(DiagnosticMessage::NoDigitsAfterDot {});
778                        }
779                        self.lex_state.set(EXPR_END);
780                        self.buffer.set_ptok(self.buffer.pcur);
781                        continue 'retrying;
782                    }
783                    self.set_yylval_id(".");
784                    self.lex_state.set(EXPR_DOT);
785                    return Self::tDOT;
786                }
787
788                Some(c) if c.is_ascii_digit() => {
789                    return self.parse_numeric(c);
790                }
791
792                Some(b')') => {
793                    self.cond.pop();
794                    self.cmdarg.pop();
795                    self.lex_state.set(EXPR_ENDFN);
796                    self.paren_nest -= 1;
797
798                    return Self::tRPAREN;
799                }
800
801                Some(b']') => {
802                    self.cond.pop();
803                    self.cmdarg.pop();
804                    self.lex_state.set(EXPR_END);
805                    self.paren_nest -= 1;
806
807                    return Self::tRBRACK;
808                }
809
810                Some(b'}') => {
811                    // tSTRING_DEND does COND.POP and CMDARG.POP in the yacc's rule (lalrpop here)
812                    if self.brace_nest == 0 {
813                        self.brace_nest -= 1;
814                        return Self::tSTRING_DEND;
815                    }
816                    self.brace_nest -= 1;
817                    self.cond.pop();
818                    self.cmdarg.pop();
819                    self.lex_state.set(EXPR_END);
820                    self.paren_nest -= 1;
821
822                    return Self::tRCURLY;
823                }
824
825                Some(b':') => {
826                    c = self.nextc();
827                    if c == b':' {
828                        if self.lex_state.is_beg()
829                            || self.lex_state.is_some(EXPR_CLASS)
830                            || self
831                                .lex_state
832                                .is_spacearg(MaybeByte::EndOfInput, space_seen)
833                        {
834                            self.lex_state.set(EXPR_BEG);
835                            return Self::tCOLON3;
836                        }
837                        self.set_yylval_id("::");
838                        self.lex_state.set(EXPR_DOT);
839                        return Self::tCOLON2;
840                    }
841                    if self.lex_state.is_end() || c.is_space() || c == Some(b'#') {
842                        self.buffer.pushback(c);
843                        let result = self.warn_balanced(
844                            Self::tCOLON,
845                            ":",
846                            "symbol literal",
847                            c,
848                            space_seen,
849                            last_state,
850                        );
851                        self.lex_state.set(EXPR_BEG);
852                        return result;
853                    }
854                    match c.as_option() {
855                        Some(c) if c == b'\'' => {
856                            self.strterm = self.new_strterm(str_ssym, c, None, None)
857                        }
858                        Some(c) if c == b'"' => {
859                            self.strterm = self.new_strterm(str_dsym, c, None, None)
860                        }
861                        _ => self.buffer.pushback(c),
862                    }
863                    self.lex_state.set(EXPR_FNAME);
864                    return Self::tSYMBEG;
865                }
866
867                Some(b'/') => {
868                    if self.lex_state.is_beg() {
869                        self.strterm = self.new_strterm(str_regexp, b'/', None, None);
870                        return Self::tREGEXP_BEG;
871                    }
872                    c = self.nextc();
873                    if c == b'=' {
874                        self.set_yylval_id("/=");
875                        self.lex_state.set(EXPR_BEG);
876                        return Self::tOP_ASGN;
877                    }
878                    self.buffer.pushback(c);
879                    if self.lex_state.is_spacearg(c, space_seen) {
880                        self.arg_ambiguous(b'/', self.current_loc());
881                        self.strterm = self.new_strterm(str_regexp, b'/', None, None);
882                        return Self::tREGEXP_BEG;
883                    }
884                    self.lex_state.set(if self.lex_state.is_after_operator() {
885                        EXPR_ARG
886                    } else {
887                        EXPR_BEG
888                    });
889                    return self.warn_balanced(
890                        Self::tDIVIDE,
891                        "/",
892                        "regexp literal",
893                        c,
894                        space_seen,
895                        last_state,
896                    );
897                }
898
899                Some(b'^') => {
900                    c = self.nextc();
901                    if c == b'=' {
902                        self.set_yylval_id("^=");
903                        self.lex_state.set(EXPR_BEG);
904                        return Self::tOP_ASGN;
905                    }
906                    self.lex_state.set(if self.lex_state.is_after_operator() {
907                        EXPR_ARG
908                    } else {
909                        EXPR_BEG
910                    });
911                    self.buffer.pushback(c);
912                    return Self::tCARET;
913                }
914
915                Some(b';') => {
916                    self.lex_state.set(EXPR_BEG);
917                    self.command_start = true;
918                    return Self::tSEMI;
919                }
920
921                Some(b',') => {
922                    self.lex_state.set(EXPR_BEG | EXPR_LABEL);
923                    return Self::tCOMMA;
924                }
925
926                Some(b'~') => {
927                    if self.lex_state.is_after_operator() {
928                        c = self.nextc();
929                        if c != b'@' {
930                            self.buffer.pushback(c);
931                        }
932                        self.lex_state.set(EXPR_ARG);
933                    } else {
934                        self.lex_state.set(EXPR_BEG);
935                    }
936
937                    return Self::tTILDE;
938                }
939
940                Some(b'(') => {
941                    let mut result: i32 = Self::tLPAREN2;
942
943                    if self.lex_state.is_beg() {
944                        result = Self::tLPAREN;
945                    } else if !space_seen {
946                        // foo( ... ) => method call, no ambiguity
947                    } else if self.lex_state.is_arg()
948                        || self.lex_state.is_all(EXPR_END | EXPR_LABEL)
949                    {
950                        result = Self::tLPAREN_ARG;
951                    } else if self.lex_state.is_some(EXPR_ENDFN) && !self.is_lambda_beginning() {
952                        self.warn(
953                            DiagnosticMessage::ParenthesesIterpretedAsArglist {},
954                            self.current_loc(),
955                        );
956                    }
957
958                    self.paren_nest += 1;
959                    self.cond.push(false);
960                    self.cmdarg.push(false);
961                    self.lex_state.set(EXPR_BEG | EXPR_LABEL);
962
963                    return result;
964                }
965
966                Some(b'[') => {
967                    let mut result: i32 = Self::tLBRACK2;
968
969                    self.paren_nest += 1;
970                    if self.lex_state.is_after_operator() {
971                        c = self.nextc();
972                        if c == b']' {
973                            self.paren_nest -= 1;
974                            self.lex_state.set(EXPR_ARG);
975                            c = self.nextc();
976                            if c == b'=' {
977                                return Self::tASET;
978                            }
979                            self.buffer.pushback(c);
980                            return Self::tAREF;
981                        }
982                        self.buffer.pushback(c);
983                        self.lex_state.set(EXPR_ARG | EXPR_LABEL);
984                        return Self::tLBRACK2;
985                    } else if self.lex_state.is_beg()
986                        || (self.lex_state.is_arg()
987                            && (space_seen || self.lex_state.is_some(EXPR_LABELED)))
988                    {
989                        result = Self::tLBRACK;
990                    }
991                    self.lex_state.set(EXPR_BEG | EXPR_LABEL);
992                    self.cond.push(false);
993                    self.cmdarg.push(false);
994                    return result;
995                }
996
997                Some(b'{') => {
998                    self.brace_nest += 1;
999
1000                    let result: i32;
1001
1002                    if self.is_lambda_beginning() {
1003                        result = Self::tLAMBEG;
1004                    } else if self.lex_state.is_some(EXPR_LABELED) {
1005                        result = Self::tLBRACE;
1006                    } else if self.lex_state.is_some(EXPR_ARG_ANY | EXPR_END | EXPR_ENDFN) {
1007                        result = Self::tLCURLY;
1008                    } else if self.lex_state.is_some(EXPR_ENDARG) {
1009                        result = Self::tLBRACE_ARG;
1010                    } else {
1011                        result = Self::tLBRACE;
1012                    }
1013
1014                    if result != Self::tLBRACE {
1015                        self.command_start = true;
1016                        self.lex_state.set(EXPR_BEG);
1017                    } else {
1018                        self.lex_state.set(EXPR_BEG | EXPR_LABEL);
1019                    }
1020
1021                    self.paren_nest += 1;
1022                    self.cond.push(false);
1023                    self.cmdarg.push(false);
1024                    return result;
1025                }
1026
1027                Some(b'\\') => {
1028                    c = self.nextc();
1029                    if c == b'\n' {
1030                        space_seen = true;
1031                        continue 'retrying; /* skip \\n */
1032                    }
1033                    if c == b' ' {
1034                        return Self::tSP;
1035                    }
1036                    if c.is_space() {
1037                        match c.as_option() {
1038                            Some(b'\t') => return Self::tSLASH_T,
1039                            Some(Self::LF_CHAR) => return Self::tSLASH_F,
1040                            Some(b'\r') => return Self::tSLASH_R,
1041                            Some(Self::VTAB_CHAR) => return Self::tVTAB,
1042                            Some(other) => unreachable!("unsupported space char {:?}", other),
1043                            None => {}
1044                        }
1045                    }
1046                    self.buffer.pushback(c);
1047                    return Self::tBACKSLASH;
1048                }
1049
1050                Some(b'%') => {
1051                    return self.parse_percent(space_seen, last_state);
1052                }
1053
1054                Some(b'$') => {
1055                    return self.parse_gvar(last_state);
1056                }
1057
1058                Some(b'@') => {
1059                    return self.parse_atmark(last_state);
1060                }
1061
1062                Some(b'_') => {
1063                    if self.buffer.was_bol() && self.buffer.is_whole_match(b"__END__", 0) {
1064                        self.buffer.eofp = true;
1065                        return Self::END_OF_INPUT;
1066                    }
1067                    self.newtok();
1068                }
1069
1070                Some(c) => {
1071                    if !self.is_identchar() {
1072                        self.compile_error(
1073                            DiagnosticMessage::InvalidChar { c },
1074                            self.current_loc(),
1075                        );
1076                        self.token_flush();
1077                        continue 'retrying;
1078                    }
1079
1080                    self.newtok();
1081                }
1082            }
1083
1084            break;
1085        }
1086
1087        self.parse_ident(c, cmd_state)
1088    }
1089
1090    fn normal_newline_leaf_label(&mut self) -> i32 {
1091        self.command_start = true;
1092        self.lex_state.set(EXPR_BEG);
1093        Self::tNL
1094    }
1095
1096    pub(crate) fn warn(&mut self, message: DiagnosticMessage, loc: Loc) {
1097        println_if_debug_lexer!("WARNING: {}", message.render());
1098        let diagnostic = Diagnostic {
1099            level: ErrorLevel::Warning,
1100            message,
1101            loc,
1102        };
1103        self.diagnostics.emit(diagnostic);
1104    }
1105
1106    pub(crate) fn warn_balanced(
1107        &mut self,
1108        token_type: i32,
1109        op: &'static str,
1110        syn: &'static str,
1111        c: MaybeByte,
1112        space_seen: bool,
1113        last_state: LexState,
1114    ) -> i32 {
1115        if !last_state.is_some(EXPR_CLASS | EXPR_DOT | EXPR_FNAME | EXPR_ENDFN)
1116            && space_seen & !c.is_space()
1117        {
1118            self.warn(
1119                DiagnosticMessage::AmbiguousOperator {
1120                    operator: op.to_string(),
1121                    interpreted_as: syn.to_string(),
1122                },
1123                self.current_loc(),
1124            );
1125        }
1126        token_type
1127    }
1128
1129    pub(crate) fn compile_error(&mut self, message: DiagnosticMessage, loc: Loc) {
1130        println_if_debug_lexer!("Compile error: {}", message.render());
1131        let diagnostic = Diagnostic {
1132            level: ErrorLevel::Error,
1133            message,
1134            loc,
1135        };
1136        self.diagnostics.emit(diagnostic);
1137    }
1138
1139    pub(crate) fn new_strterm(
1140        &self,
1141        func: usize,
1142        term: u8,
1143        paren: Option<u8>,
1144        heredoc_end: Option<HeredocEnd>,
1145    ) -> Option<Box<StrTerm>> {
1146        Some(Box::new(StrTerm::new_literal(StringLiteral::new(
1147            0,
1148            func,
1149            paren,
1150            term,
1151            heredoc_end,
1152        ))))
1153    }
1154
1155    pub(crate) fn loc(&self, begin_pos: usize, end_pos: usize) -> Loc {
1156        Loc {
1157            begin: begin_pos,
1158            end: end_pos,
1159        }
1160    }
1161
1162    pub(crate) fn current_loc(&self) -> Loc {
1163        self.loc(self.buffer.ptok, self.buffer.pcur)
1164    }
1165
1166    pub(crate) fn arg_ambiguous(&mut self, c: u8, loc: Loc) -> bool {
1167        if c == b'/' {
1168            self.warn(DiagnosticMessage::AmbiguousRegexp {}, loc);
1169        } else {
1170            self.warn(
1171                DiagnosticMessage::AmbiguousFirstArgument { operator: c },
1172                loc,
1173            );
1174        }
1175        true
1176    }
1177
1178    pub(crate) fn toklen(&self) -> usize {
1179        self.tokenbuf.len()
1180    }
1181
1182    pub(crate) fn tokfix(&self) {
1183        // nop
1184    }
1185
1186    pub(crate) fn yyerror0(&mut self, message: DiagnosticMessage) {
1187        self.yyerror1(message, self.current_loc());
1188    }
1189
1190    pub(crate) fn yyerror1(&mut self, message: DiagnosticMessage, loc: Loc) {
1191        println_if_debug_lexer!("yyerror0: {}", message.render());
1192        let diagnostic = Diagnostic {
1193            level: ErrorLevel::Error,
1194            message,
1195            loc,
1196        };
1197        self.diagnostics.emit(diagnostic);
1198    }
1199
1200    pub(crate) fn is_lambda_beginning(&self) -> bool {
1201        self.lpar_beg == self.paren_nest
1202    }
1203
1204    pub(crate) fn tokadd_ident(&mut self, mut c: MaybeByte) -> bool {
1205        loop {
1206            if self.tokadd_mbchar(c).is_err() {
1207                return true;
1208            }
1209            c = self.nextc();
1210
1211            if !self.is_identchar() {
1212                break;
1213            }
1214        }
1215
1216        self.buffer.pushback(c);
1217        false
1218    }
1219
1220    pub(crate) fn newtok(&mut self) {
1221        self.buffer.tokidx = 0;
1222        self.buffer.tokline = self.buffer.ruby_sourceline;
1223        self.tokenbuf = TokenBuf::default();
1224    }
1225
1226    pub(crate) fn literal_flush(&mut self, ptok: usize) {
1227        self.buffer.set_ptok(ptok);
1228    }
1229
1230    pub(crate) fn tokadd_mbchar(&mut self, c: MaybeByte) -> Result<(), ()> {
1231        let mut len = match self.multibyte_char_len(self.buffer.pcur - 1) {
1232            Some(len) => len,
1233            None => return Err(()),
1234        };
1235
1236        match c {
1237            MaybeByte::EndOfInput => return Err(()),
1238            _ => self.tokadd(c),
1239        }
1240
1241        len -= 1;
1242        self.buffer.pcur += len;
1243        self.tokcopy(len);
1244        Ok(())
1245    }
1246
1247    fn _multibyte_char_len(&self, ptr: usize) -> Option<usize> {
1248        let c1 = self.buffer.byte_at(ptr).as_option()?;
1249
1250        let len = if c1 & 0x80 == 0 {
1251            1
1252        } else if c1 & 0xE0 == 0xC0 {
1253            2
1254        } else if c1 & 0xF0 == 0xE0 {
1255            3
1256        } else if c1 & 0xF8 == 0xF0 {
1257            4
1258        } else {
1259            // malformed
1260            return None;
1261        };
1262
1263        let bytes = self.buffer.substr_at(ptr, ptr + len)?;
1264        std::str::from_utf8(bytes).ok()?;
1265        Some(len)
1266    }
1267
1268    pub(crate) fn multibyte_char_len(&mut self, ptr: usize) -> Option<usize> {
1269        let result = self._multibyte_char_len(ptr);
1270        if result.is_none() {
1271            self.yyerror0(DiagnosticMessage::InvalidMultibyteChar {});
1272        }
1273        result
1274    }
1275
1276    pub(crate) fn is_label_suffix(&self, n: usize) -> bool {
1277        self.buffer.peek_n(b':', n) && !self.buffer.peek_n(b':', n + 1)
1278    }
1279
1280    pub(crate) fn is_lvar_defined(&self, name: &str) -> bool {
1281        self.static_env.is_declared(name)
1282    }
1283}