php_parser_rs/lexer/
mod.rs

1use crate::ident;
2use crate::ident_start;
3use crate::lexer::byte_string::ByteString;
4use crate::lexer::error::SyntaxError;
5use crate::lexer::error::SyntaxResult;
6use crate::lexer::state::source::Source;
7use crate::lexer::state::StackFrame;
8use crate::lexer::state::State;
9use crate::lexer::token::DocStringIndentationKind;
10use crate::lexer::token::DocStringKind;
11use crate::lexer::token::OpenTagKind;
12use crate::lexer::token::Token;
13use crate::lexer::token::TokenKind;
14
15pub mod byte_string;
16pub mod error;
17pub mod stream;
18pub mod token;
19
20mod state;
21
22mod macros;
23
/// Stateless entry point of the lexer: a unit struct whose methods turn source bytes
/// into tokens.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct Lexer;
26
27impl Lexer {
28    pub const fn new() -> Self {
29        Self {}
30    }
31
32    pub fn tokenize<B: ?Sized + AsRef<[u8]>>(&self, input: &B) -> SyntaxResult<Vec<Token>> {
33        let mut state = State::new(Source::new(input.as_ref()));
34        let mut tokens = Vec::new();
35
36        while !state.source.eof() {
37            match state.frame()? {
38                // The "Initial" state is used to parse inline HTML. It is essentially a catch-all
39                // state that will build up a single token buffer until it encounters an open tag
40                // of some description.
41                StackFrame::Initial => self.initial(&mut state, &mut tokens)?,
42                // The scripting state is entered when an open tag is encountered in the source code.
43                // This tells the lexer to start analysing characters at PHP tokens instead of inline HTML.
44                StackFrame::Scripting => {
45                    self.skip_whitespace(&mut state);
46
47                    // If we have consumed whitespace and then reached the end of the file, we should break.
48                    if state.source.eof() {
49                        break;
50                    }
51
52                    tokens.push(self.scripting(&mut state)?);
53                }
54                // The "Halted" state is entered when the `__halt_compiler` token is encountered.
55                // In this state, all the text that follows is no longer parsed as PHP as is collected
56                // into a single "InlineHtml" token (kind of cheating, oh well).
57                StackFrame::Halted => {
58                    tokens.push(Token {
59                        kind: TokenKind::InlineHtml,
60                        span: state.source.span(),
61                        value: state.source.read_remaining().into(),
62                    });
63                    break;
64                }
65                // The double quote state is entered when inside a double-quoted string that
66                // contains variables.
67                StackFrame::DoubleQuote => self.double_quote(&mut state, &mut tokens)?,
68                // The shell exec state is entered when inside of a execution string (`).
69                StackFrame::ShellExec => self.shell_exec(&mut state, &mut tokens)?,
70                // The doc string state is entered when tokenizing heredocs and nowdocs.
71                StackFrame::DocString(kind, label, ..) => {
72                    let label = label.clone();
73
74                    match kind {
75                        DocStringKind::Heredoc => self.heredoc(&mut state, &mut tokens, label)?,
76                        DocStringKind::Nowdoc => self.nowdoc(&mut state, &mut tokens, label)?,
77                    }
78                }
79                // LookingForProperty is entered inside double quotes,
80                // backticks, or a heredoc, expecting a variable name.
81                // If one isn't found, it switches to scripting.
82                StackFrame::LookingForVarname => {
83                    if let Some(token) = self.looking_for_varname(&mut state)? {
84                        tokens.push(token);
85                    }
86                }
87                // LookingForProperty is entered inside double quotes,
88                // backticks, or a heredoc, expecting an arrow followed by a
89                // property name.
90                StackFrame::LookingForProperty => {
91                    tokens.push(self.looking_for_property(&mut state)?);
92                }
93                StackFrame::VarOffset => {
94                    if state.source.eof() {
95                        break;
96                    }
97
98                    tokens.push(self.var_offset(&mut state)?);
99                }
100            }
101        }
102
103        tokens.push(Token {
104            kind: TokenKind::Eof,
105            span: state.source.span(),
106            value: ByteString::default(),
107        });
108
109        Ok(tokens)
110    }
111
112    fn skip_whitespace(&self, state: &mut State) {
113        while let Some(true) = state.source.current().map(|u: &u8| u.is_ascii_whitespace()) {
114            state.source.next();
115        }
116    }
117
118    fn read_and_skip_whitespace(&self, state: &mut State) -> Vec<u8> {
119        let mut buffer = Vec::new();
120        while let Some(true) = state.source.current().map(|u: &u8| u.is_ascii_whitespace()) {
121            buffer.push(*state.source.current().unwrap());
122            state.source.next();
123        }
124        buffer
125    }
126
127    fn initial(&self, state: &mut State, tokens: &mut Vec<Token>) -> SyntaxResult<()> {
128        let inline_span = state.source.span();
129        let mut buffer = Vec::new();
130        while let Some(char) = state.source.current() {
131            if state.source.at_case_insensitive(b"<?php", 5) {
132                let tag_span = state.source.span();
133
134                let tag = state.source.read_and_skip(5);
135                state.replace(StackFrame::Scripting);
136
137                if !buffer.is_empty() {
138                    tokens.push(Token {
139                        kind: TokenKind::InlineHtml,
140                        span: inline_span,
141                        value: buffer.into(),
142                    });
143                }
144
145                tokens.push(Token {
146                    kind: TokenKind::OpenTag(OpenTagKind::Full),
147                    span: tag_span,
148                    value: tag.into(),
149                });
150
151                return Ok(());
152            } else if state.source.at_case_insensitive(b"<?=", 3) {
153                let tag_span = state.source.span();
154
155                state.source.skip(3);
156                state.replace(StackFrame::Scripting);
157
158                if !buffer.is_empty() {
159                    tokens.push(Token {
160                        kind: TokenKind::InlineHtml,
161                        span: inline_span,
162                        value: buffer.into(),
163                    });
164                }
165
166                tokens.push(Token {
167                    kind: TokenKind::OpenTag(OpenTagKind::Echo),
168                    span: tag_span,
169                    value: b"<?=".into(),
170                });
171
172                return Ok(());
173            } else if state.source.at_case_insensitive(b"<?", 2) {
174                let tag_span = state.source.span();
175
176                state.source.skip(2);
177                state.replace(StackFrame::Scripting);
178
179                if !buffer.is_empty() {
180                    tokens.push(Token {
181                        kind: TokenKind::InlineHtml,
182                        span: inline_span,
183                        value: buffer.into(),
184                    });
185                }
186
187                tokens.push(Token {
188                    kind: TokenKind::OpenTag(OpenTagKind::Short),
189                    span: tag_span,
190                    value: b"<?".into(),
191                });
192
193                return Ok(());
194            }
195
196            state.source.next();
197            buffer.push(*char);
198        }
199
200        tokens.push(Token {
201            kind: TokenKind::InlineHtml,
202            span: inline_span,
203            value: buffer.into(),
204        });
205
206        Ok(())
207    }
208
209    fn scripting(&self, state: &mut State) -> SyntaxResult<Token> {
210        let span = state.source.span();
211        let (kind, value): (TokenKind, ByteString) = match state.source.read(3) {
212            [b'!', b'=', b'='] => {
213                state.source.skip(3);
214
215                (TokenKind::BangDoubleEquals, b"!==".into())
216            }
217            [b'?', b'?', b'='] => {
218                state.source.skip(3);
219                (TokenKind::DoubleQuestionEquals, b"??=".into())
220            }
221            [b'?', b'-', b'>'] => {
222                state.source.skip(3);
223                (TokenKind::QuestionArrow, b"?->".into())
224            }
225            [b'=', b'=', b'='] => {
226                state.source.skip(3);
227                (TokenKind::TripleEquals, b"===".into())
228            }
229            [b'.', b'.', b'.'] => {
230                state.source.skip(3);
231                (TokenKind::Ellipsis, b"...".into())
232            }
233            [b'`', ..] => {
234                state.source.next();
235                state.replace(StackFrame::ShellExec);
236                (TokenKind::Backtick, b"`".into())
237            }
238            [b'@', ..] => {
239                state.source.next();
240                (TokenKind::At, b"@".into())
241            }
242            [b'!', b'=', ..] => {
243                state.source.skip(2);
244                (TokenKind::BangEquals, b"!=".into())
245            }
246            [b'!', ..] => {
247                state.source.next();
248                (TokenKind::Bang, b"!".into())
249            }
250            [b'&', b'&', ..] => {
251                state.source.skip(2);
252                (TokenKind::BooleanAnd, b"&&".into())
253            }
254            [b'&', b'=', ..] => {
255                state.source.skip(2);
256                (TokenKind::AmpersandEquals, b"&=".into())
257            }
258            [b'&', ..] => {
259                state.source.next();
260                (TokenKind::Ampersand, b"&".into())
261            }
262            [b'?', b'>', ..] => {
263                // This is a close tag, we can enter "Initial" mode again.
264                state.source.skip(2);
265
266                state.replace(StackFrame::Initial);
267
268                (TokenKind::CloseTag, b"?>".into())
269            }
270            [b'?', b'?', ..] => {
271                state.source.skip(2);
272                (TokenKind::DoubleQuestion, b"??".into())
273            }
274            [b'?', b':', ..] => {
275                state.source.skip(2);
276                (TokenKind::QuestionColon, b"?:".into())
277            }
278            [b'?', ..] => {
279                state.source.next();
280                (TokenKind::Question, b"?".into())
281            }
282            [b'=', b'>', ..] => {
283                state.source.skip(2);
284                (TokenKind::DoubleArrow, b"=>".into())
285            }
286            [b'=', b'=', ..] => {
287                state.source.skip(2);
288                (TokenKind::DoubleEquals, b"==".into())
289            }
290            [b'=', ..] => {
291                state.source.next();
292                (TokenKind::Equals, b"=".into())
293            }
294            // Single quoted string.
295            [b'\'', ..] => {
296                state.source.skip(1);
297                self.tokenize_single_quote_string(state)?
298            }
299            [b'b' | b'B', b'\'', ..] => {
300                state.source.skip(2);
301                self.tokenize_single_quote_string(state)?
302            }
303            [b'"', ..] => {
304                state.source.skip(1);
305                self.tokenize_double_quote_string(state)?
306            }
307            [b'b' | b'B', b'"', ..] => {
308                state.source.skip(2);
309                self.tokenize_double_quote_string(state)?
310            }
311            [b'$', ident_start!(), ..] => self.tokenize_variable(state),
312            [b'$', ..] => {
313                state.source.next();
314                (TokenKind::Dollar, b"$".into())
315            }
316            [b'.', b'=', ..] => {
317                state.source.skip(2);
318                (TokenKind::DotEquals, b".=".into())
319            }
320            [b'0'..=b'9', ..] => self.tokenize_number(state)?,
321            [b'.', b'0'..=b'9', ..] => self.tokenize_number(state)?,
322            [b'.', ..] => {
323                state.source.next();
324                (TokenKind::Dot, b".".into())
325            }
326            [b'\\', ident_start!(), ..] => {
327                state.source.next();
328
329                match self.scripting(state)? {
330                    Token {
331                        kind: TokenKind::Identifier | TokenKind::QualifiedIdentifier,
332                        value,
333                        ..
334                    } => {
335                        let mut bytes = value;
336                        bytes.insert(0, b'\\');
337
338                        (TokenKind::FullyQualifiedIdentifier, bytes)
339                    }
340                    Token {
341                        kind: TokenKind::True,
342                        ..
343                    } => (TokenKind::FullyQualifiedIdentifier, b"\\true".into()),
344                    Token {
345                        kind: TokenKind::False,
346                        ..
347                    } => (TokenKind::FullyQualifiedIdentifier, b"\\false".into()),
348                    Token {
349                        kind: TokenKind::Null,
350                        ..
351                    } => (TokenKind::FullyQualifiedIdentifier, b"\\null".into()),
352                    s => unreachable!("{:?}", s),
353                }
354            }
355            [b'\\', ..] => {
356                state.source.next();
357                (TokenKind::NamespaceSeparator, b"\\".into())
358            }
359            [b'/', b'*', ..] => {
360                state.source.next();
361                let mut buffer = vec![b'/'];
362
363                loop {
364                    match state.source.read(2) {
365                        [b'*', b'/'] => {
366                            state.source.skip(2);
367                            buffer.extend_from_slice(b"*/");
368                            break;
369                        }
370                        &[t, ..] => {
371                            state.source.next();
372                            buffer.push(t);
373                        }
374                        _ => {
375                            break;
376                        }
377                    }
378                }
379
380                if buffer.starts_with(b"/**") {
381                    (TokenKind::DocumentComment, buffer.into())
382                } else {
383                    (TokenKind::MultiLineComment, buffer.into())
384                }
385            }
386            [b'#', b'[', ..] => {
387                state.source.skip(2);
388                (TokenKind::Attribute, b"#[".into())
389            }
390            [ch @ b'/', b'/', ..] | [ch @ b'#', ..] => {
391                let mut buffer = if *ch == b'/' {
392                    state.source.skip(2);
393                    b"//".to_vec()
394                } else {
395                    state.source.next();
396                    b"#".to_vec()
397                };
398
399                while let Some(c) = state.source.current() {
400                    if *c == b'\n' {
401                        state.source.next();
402                        break;
403                    }
404
405                    if state.source.read(2) == [b'?', b'>'] {
406                        break;
407                    }
408
409                    buffer.push(*c);
410                    state.source.next();
411                }
412
413                if buffer.starts_with(b"#") {
414                    (TokenKind::HashMarkComment, buffer.into())
415                } else {
416                    (TokenKind::SingleLineComment, buffer.into())
417                }
418            }
419            [b'/', b'=', ..] => {
420                state.source.skip(2);
421                (TokenKind::SlashEquals, b"/=".into())
422            }
423            [b'/', ..] => {
424                state.source.next();
425                (TokenKind::Slash, b"/".into())
426            }
427            [b'*', b'*', b'=', ..] => {
428                state.source.skip(3);
429                (TokenKind::PowEquals, b"**=".into())
430            }
431            [b'<', b'<', b'='] => {
432                state.source.skip(3);
433
434                (TokenKind::LeftShiftEquals, b"<<=".into())
435            }
436            [b'<', b'=', b'>'] => {
437                state.source.skip(3);
438                (TokenKind::Spaceship, b"<=>".into())
439            }
440            [b'>', b'>', b'='] => {
441                state.source.skip(3);
442                (TokenKind::RightShiftEquals, b">>=".into())
443            }
444            [b'<', b'<', b'<'] => {
445                state.source.skip(3);
446                let mut buffer = b"<<<".to_vec();
447                buffer.extend(self.read_and_skip_whitespace(state));
448
449                let doc_string_kind = match state.source.read(1) {
450                    [b'\''] => {
451                        buffer.push(b'\'');
452                        state.source.next();
453                        DocStringKind::Nowdoc
454                    }
455                    [b'"'] => {
456                        buffer.push(b'"');
457                        state.source.next();
458                        DocStringKind::Heredoc
459                    }
460                    [_, ..] => DocStringKind::Heredoc,
461                    [] => {
462                        return Err(SyntaxError::UnexpectedEndOfFile(state.source.span()));
463                    }
464                };
465
466                let label: ByteString = match self.peek_identifier(state) {
467                    Some(_) => self.consume_identifier(state).into(),
468                    None => {
469                        return match state.source.current() {
470                            Some(c) => {
471                                Err(SyntaxError::UnexpectedCharacter(*c, state.source.span()))
472                            }
473                            None => Err(SyntaxError::UnexpectedEndOfFile(state.source.span())),
474                        }
475                    }
476                };
477
478                buffer.extend_from_slice(&label);
479
480                if doc_string_kind == DocStringKind::Nowdoc {
481                    match state.source.current() {
482                        Some(b'\'') => {
483                            buffer.push(b'\'');
484                            state.source.next();
485                        }
486                        _ => {
487                            // TODO(azjezz) this is most likely a bug, what if current is none?
488                            return Err(SyntaxError::UnexpectedCharacter(
489                                *state.source.current().unwrap(),
490                                state.source.span(),
491                            ));
492                        }
493                    };
494                } else if let Some(b'"') = state.source.current() {
495                    buffer.push(b'"');
496                    state.source.next();
497                }
498
499                if !matches!(state.source.current(), Some(b'\n')) {
500                    return Err(SyntaxError::UnexpectedCharacter(
501                        *state.source.current().unwrap(),
502                        state.source.span(),
503                    ));
504                }
505
506                state.source.next();
507                state.replace(StackFrame::DocString(
508                    doc_string_kind.clone(),
509                    label.clone(),
510                    DocStringIndentationKind::None,
511                    0,
512                ));
513
514                (TokenKind::StartDocString(doc_string_kind), buffer.into())
515            }
516            [b'*', b'*', ..] => {
517                state.source.skip(2);
518                (TokenKind::Pow, b"**".into())
519            }
520            [b'*', b'=', ..] => {
521                state.source.skip(2);
522                (TokenKind::AsteriskEquals, b"*=".into())
523            }
524            [b'*', ..] => {
525                state.source.next();
526                (TokenKind::Asterisk, b"*".into())
527            }
528            [b'|', b'|', ..] => {
529                state.source.skip(2);
530                (TokenKind::BooleanOr, b"||".into())
531            }
532            [b'|', b'=', ..] => {
533                state.source.skip(2);
534                (TokenKind::PipeEquals, b"|=".into())
535            }
536            [b'|', ..] => {
537                state.source.next();
538                (TokenKind::Pipe, b"|".into())
539            }
540            [b'^', b'=', ..] => {
541                state.source.skip(2);
542                (TokenKind::CaretEquals, b"^=".into())
543            }
544            [b'^', ..] => {
545                state.source.next();
546                (TokenKind::Caret, b"^".into())
547            }
548            [b'{', ..] => {
549                state.source.next();
550                state.enter(StackFrame::Scripting);
551                (TokenKind::LeftBrace, b"{".into())
552            }
553            [b'}', ..] => {
554                state.source.next();
555                state.exit();
556                (TokenKind::RightBrace, b"}".into())
557            }
558            [b'(', ..] => {
559                state.source.next();
560                let mut buffer = b"(".to_vec();
561
562                // Inlined so we can add whitespace to the buffer.
563                while let Some(true) = state.source.current().map(|u: &u8| u.is_ascii_whitespace())
564                {
565                    buffer.push(*state.source.current().unwrap());
566                    state.source.next();
567                }
568
569                if state.source.at_case_insensitive(b"int", 3) {
570                    if state.source.at_case_insensitive(b"integer", 7)
571                        && state.source.peek_ignoring_whitespace(7, 1) == [b')']
572                    {
573                        buffer.extend(state.source.read_and_skip(7));
574                        buffer.extend(self.read_and_skip_whitespace(state));
575                        buffer.extend(state.source.read_and_skip(1));
576
577                        (TokenKind::IntegerCast, buffer.into())
578                    } else if state.source.peek_ignoring_whitespace(3, 1) == [b')'] {
579                        buffer.extend(state.source.read_and_skip(3));
580                        buffer.extend(self.read_and_skip_whitespace(state));
581                        buffer.extend(state.source.read_and_skip(1));
582
583                        (TokenKind::IntCast, buffer.into())
584                    } else {
585                        (TokenKind::LeftParen, buffer.into())
586                    }
587                } else if state.source.at_case_insensitive(b"bool", 4) {
588                    if state.source.at_case_insensitive(b"boolean", 7)
589                        && state.source.peek_ignoring_whitespace(7, 1) == [b')']
590                    {
591                        buffer.extend(state.source.read_and_skip(7));
592                        buffer.extend(self.read_and_skip_whitespace(state));
593                        buffer.extend(state.source.read_and_skip(1));
594
595                        (TokenKind::BooleanCast, buffer.into())
596                    } else if state.source.peek_ignoring_whitespace(4, 1) == [b')'] {
597                        buffer.extend(state.source.read_and_skip(4));
598                        buffer.extend(self.read_and_skip_whitespace(state));
599                        buffer.extend(state.source.read_and_skip(1));
600
601                        (TokenKind::BoolCast, buffer.into())
602                    } else {
603                        (TokenKind::LeftParen, buffer.into())
604                    }
605                } else if state.source.at_case_insensitive(b"float", 5) {
606                    if state.source.peek_ignoring_whitespace(5, 1) == [b')'] {
607                        buffer.extend(state.source.read_and_skip(5));
608                        buffer.extend(self.read_and_skip_whitespace(state));
609                        buffer.extend(state.source.read_and_skip(1));
610
611                        (TokenKind::FloatCast, buffer.into())
612                    } else {
613                        (TokenKind::LeftParen, buffer.into())
614                    }
615                } else if state.source.at_case_insensitive(b"double", 6) {
616                    if state.source.peek_ignoring_whitespace(6, 1) == [b')'] {
617                        buffer.extend(state.source.read_and_skip(6));
618                        buffer.extend(self.read_and_skip_whitespace(state));
619                        buffer.extend(state.source.read_and_skip(1));
620
621                        (TokenKind::DoubleCast, buffer.into())
622                    } else {
623                        (TokenKind::LeftParen, buffer.into())
624                    }
625                } else if state.source.at_case_insensitive(b"real", 4) {
626                    if state.source.peek_ignoring_whitespace(4, 1) == [b')'] {
627                        buffer.extend(state.source.read_and_skip(4));
628                        buffer.extend(self.read_and_skip_whitespace(state));
629                        buffer.extend(state.source.read_and_skip(1));
630
631                        (TokenKind::RealCast, buffer.into())
632                    } else {
633                        (TokenKind::LeftParen, buffer.into())
634                    }
635                } else if state.source.at_case_insensitive(b"string", 6) {
636                    if state.source.peek_ignoring_whitespace(6, 1) == [b')'] {
637                        buffer.extend(state.source.read_and_skip(6));
638                        buffer.extend(self.read_and_skip_whitespace(state));
639                        buffer.extend(state.source.read_and_skip(1));
640
641                        (TokenKind::StringCast, buffer.into())
642                    } else {
643                        (TokenKind::LeftParen, buffer.into())
644                    }
645                } else if state.source.at_case_insensitive(b"binary", 6) {
646                    if state.source.peek_ignoring_whitespace(6, 1) == [b')'] {
647                        buffer.extend(state.source.read_and_skip(6));
648                        buffer.extend(self.read_and_skip_whitespace(state));
649                        buffer.extend(state.source.read_and_skip(1));
650
651                        (TokenKind::BinaryCast, buffer.into())
652                    } else {
653                        (TokenKind::LeftParen, buffer.into())
654                    }
655                } else if state.source.at_case_insensitive(b"array", 5) {
656                    if state.source.peek_ignoring_whitespace(5, 1) == [b')'] {
657                        buffer.extend(state.source.read_and_skip(5));
658                        buffer.extend(self.read_and_skip_whitespace(state));
659                        buffer.extend(state.source.read_and_skip(1));
660
661                        (TokenKind::ArrayCast, buffer.into())
662                    } else {
663                        (TokenKind::LeftParen, buffer.into())
664                    }
665                } else if state.source.at_case_insensitive(b"object", 6) {
666                    if state.source.peek_ignoring_whitespace(6, 1) == [b')'] {
667                        buffer.extend(state.source.read_and_skip(6));
668                        buffer.extend(self.read_and_skip_whitespace(state));
669                        buffer.extend(state.source.read_and_skip(1));
670
671                        (TokenKind::ObjectCast, buffer.into())
672                    } else {
673                        (TokenKind::LeftParen, buffer.into())
674                    }
675                } else if state.source.at_case_insensitive(b"unset", 5) {
676                    if state.source.peek_ignoring_whitespace(5, 1) == [b')'] {
677                        buffer.extend(state.source.read_and_skip(5));
678                        buffer.extend(self.read_and_skip_whitespace(state));
679                        buffer.extend(state.source.read_and_skip(1));
680
681                        (TokenKind::UnsetCast, buffer.into())
682                    } else {
683                        (TokenKind::LeftParen, buffer.into())
684                    }
685                } else {
686                    (TokenKind::LeftParen, buffer.into())
687                }
688            }
689            [b')', ..] => {
690                state.source.next();
691                (TokenKind::RightParen, b")".into())
692            }
693            [b';', ..] => {
694                state.source.next();
695                (TokenKind::SemiColon, b";".into())
696            }
697            [b'+', b'+', ..] => {
698                state.source.skip(2);
699                (TokenKind::Increment, b"++".into())
700            }
701            [b'+', b'=', ..] => {
702                state.source.skip(2);
703                (TokenKind::PlusEquals, b"+=".into())
704            }
705            [b'+', ..] => {
706                state.source.next();
707                (TokenKind::Plus, b"+".into())
708            }
709            [b'%', b'=', ..] => {
710                state.source.skip(2);
711                (TokenKind::PercentEquals, b"%=".into())
712            }
713            [b'%', ..] => {
714                state.source.next();
715                (TokenKind::Percent, b"%".into())
716            }
717            [b'-', b'-', ..] => {
718                state.source.skip(2);
719                (TokenKind::Decrement, b"--".into())
720            }
721            [b'-', b'>', ..] => {
722                state.source.skip(2);
723                (TokenKind::Arrow, b"->".into())
724            }
725            [b'-', b'=', ..] => {
726                state.source.skip(2);
727                (TokenKind::MinusEquals, b"-=".into())
728            }
729            [b'-', ..] => {
730                state.source.next();
731                (TokenKind::Minus, b"-".into())
732            }
733            [b'<', b'<', ..] => {
734                state.source.skip(2);
735                (TokenKind::LeftShift, b"<<".into())
736            }
737            [b'<', b'=', ..] => {
738                state.source.skip(2);
739                (TokenKind::LessThanEquals, b"<=".into())
740            }
741            [b'<', b'>', ..] => {
742                state.source.skip(2);
743                (TokenKind::AngledLeftRight, b"<>".into())
744            }
745            [b'<', ..] => {
746                state.source.next();
747                (TokenKind::LessThan, b"<".into())
748            }
749            [b'>', b'>', ..] => {
750                state.source.skip(2);
751                (TokenKind::RightShift, b">>".into())
752            }
753            [b'>', b'=', ..] => {
754                state.source.skip(2);
755                (TokenKind::GreaterThanEquals, b">=".into())
756            }
757            [b'>', ..] => {
758                state.source.next();
759                (TokenKind::GreaterThan, b">".into())
760            }
761            [b',', ..] => {
762                state.source.next();
763                (TokenKind::Comma, b",".into())
764            }
765            [b'[', ..] => {
766                state.source.next();
767                (TokenKind::LeftBracket, b"[".into())
768            }
769            [b']', ..] => {
770                state.source.next();
771                (TokenKind::RightBracket, b"]".into())
772            }
773            [b':', b':', ..] => {
774                state.source.skip(2);
775                (TokenKind::DoubleColon, b"::".into())
776            }
777            [b':', ..] => {
778                state.source.next();
779                (TokenKind::Colon, b":".into())
780            }
781            [b'~', ..] => {
782                state.source.next();
783                (TokenKind::BitwiseNot, b"~".into())
784            }
785            [b @ ident_start!(), ..] => {
786                state.source.next();
787                let mut qualified = false;
788                let mut last_was_slash = false;
789
790                let mut buffer = vec![*b];
791                while let Some(next @ ident!() | next @ b'\\') = state.source.current() {
792                    if matches!(next, ident!()) {
793                        buffer.push(*next);
794                        state.source.next();
795                        last_was_slash = false;
796                        continue;
797                    }
798
799                    if *next == b'\\' && !last_was_slash {
800                        qualified = true;
801                        last_was_slash = true;
802                        buffer.push(*next);
803                        state.source.next();
804                        continue;
805                    }
806
807                    break;
808                }
809
810                if qualified {
811                    (TokenKind::QualifiedIdentifier, buffer.into())
812                } else {
813                    let kind = identifier_to_keyword(&buffer).unwrap_or(TokenKind::Identifier);
814
815                    if kind == TokenKind::HaltCompiler {
816                        match state.source.read(3) {
817                            [b'(', b')', b';'] => {
818                                state.source.skip(3);
819                                state.replace(StackFrame::Halted);
820                            }
821                            _ => return Err(SyntaxError::InvalidHaltCompiler(state.source.span())),
822                        }
823                    }
824
825                    (kind, buffer.into())
826                }
827            }
828            [b, ..] => unimplemented!(
829                "<scripting> char: {}, line: {}, col: {}",
830                *b as char,
831                state.source.span().line,
832                state.source.span().column
833            ),
834            // We should never reach this point since we have the empty checks surrounding
835            // the call to this function, but it's better to be safe than sorry.
836            [] => return Err(SyntaxError::UnexpectedEndOfFile(state.source.span())),
837        };
838
839        Ok(Token { kind, span, value })
840    }
841
842    fn double_quote(&self, state: &mut State, tokens: &mut Vec<Token>) -> SyntaxResult<()> {
843        let span = state.source.span();
844        let mut buffer = Vec::new();
845        let (kind, value) = loop {
846            match state.source.read(3) {
847                [b'$', b'{', ..] => {
848                    state.source.skip(2);
849                    state.enter(StackFrame::LookingForVarname);
850                    break (TokenKind::DollarLeftBrace, b"${".into());
851                }
852                [b'{', b'$', ..] => {
853                    // Intentionally only consume the left brace.
854                    state.source.next();
855                    state.enter(StackFrame::Scripting);
856                    break (TokenKind::LeftBrace, b"{".into());
857                }
858                [b'"', ..] => {
859                    state.source.next();
860                    state.replace(StackFrame::Scripting);
861                    break (TokenKind::DoubleQuote, b'"'.into());
862                }
863                &[b'\\', b @ (b'"' | b'\\' | b'$'), ..] => {
864                    state.source.skip(2);
865                    buffer.push(b);
866                }
867                &[b'\\', b'n', ..] => {
868                    state.source.skip(2);
869                    buffer.push(b'\n');
870                }
871                &[b'\\', b'r', ..] => {
872                    state.source.skip(2);
873                    buffer.push(b'\r');
874                }
875                &[b'\\', b't', ..] => {
876                    state.source.skip(2);
877                    buffer.push(b'\t');
878                }
879                &[b'\\', b'v', ..] => {
880                    state.source.skip(2);
881                    buffer.push(b'\x0b');
882                }
883                &[b'\\', b'e', ..] => {
884                    state.source.skip(2);
885                    buffer.push(b'\x1b');
886                }
887                &[b'\\', b'f', ..] => {
888                    state.source.skip(2);
889                    buffer.push(b'\x0c');
890                }
891                &[b'\\', b'x', b @ (b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')] => {
892                    state.source.skip(3);
893
894                    let mut hex = String::from(b as char);
895                    if let Some(b @ (b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')) =
896                        state.source.current()
897                    {
898                        state.source.next();
899                        hex.push(*b as char);
900                    }
901
902                    let b = u8::from_str_radix(&hex, 16).unwrap();
903                    buffer.push(b);
904                }
905                &[b'\\', b'u', b'{'] => {
906                    state.source.skip(3);
907
908                    let mut code_point = String::new();
909                    while let Some(b @ (b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')) =
910                        state.source.current()
911                    {
912                        state.source.next();
913                        code_point.push(*b as char);
914                    }
915
916                    if code_point.is_empty() || state.source.current() != Some(&b'}') {
917                        return Err(SyntaxError::InvalidUnicodeEscape(state.source.span()));
918                    }
919                    state.source.next();
920
921                    let c = if let Ok(c) = u32::from_str_radix(&code_point, 16) {
922                        c
923                    } else {
924                        return Err(SyntaxError::InvalidUnicodeEscape(state.source.span()));
925                    };
926
927                    if let Some(c) = char::from_u32(c) {
928                        let mut tmp = [0; 4];
929                        let bytes = c.encode_utf8(&mut tmp);
930                        buffer.extend(bytes.as_bytes());
931                    } else {
932                        return Err(SyntaxError::InvalidUnicodeEscape(state.source.span()));
933                    }
934                }
935                &[b'\\', b @ b'0'..=b'7', ..] => {
936                    state.source.skip(2);
937
938                    let mut octal = String::from(b as char);
939                    if let Some(b @ b'0'..=b'7') = state.source.current() {
940                        state.source.next();
941                        octal.push(*b as char);
942                    }
943                    if let Some(b @ b'0'..=b'7') = state.source.current() {
944                        state.source.next();
945                        octal.push(*b as char);
946                    }
947
948                    if let Ok(b) = u8::from_str_radix(&octal, 8) {
949                        buffer.push(b);
950                    } else {
951                        return Err(SyntaxError::InvalidOctalEscape(state.source.span()));
952                    }
953                }
954                [b'$', ident_start!(), ..] => {
955                    let mut var = state.source.read_and_skip(1).to_vec();
956                    var.extend(self.consume_identifier(state));
957
958                    match state.source.read(4) {
959                        [b'[', ..] => state.enter(StackFrame::VarOffset),
960                        [b'-', b'>', ident_start!(), ..] | [b'?', b'-', b'>', ident_start!()] => {
961                            state.enter(StackFrame::LookingForProperty)
962                        }
963                        _ => {}
964                    }
965
966                    break (TokenKind::Variable, var.into());
967                }
968                &[b, ..] => {
969                    state.source.next();
970                    buffer.push(b);
971                }
972                [] => return Err(SyntaxError::UnexpectedEndOfFile(state.source.span())),
973            }
974        };
975
976        if !buffer.is_empty() {
977            tokens.push(Token {
978                kind: TokenKind::StringPart,
979                span,
980                value: buffer.into(),
981            })
982        }
983
984        tokens.push(Token { kind, span, value });
985        Ok(())
986    }
987
988    fn shell_exec(&self, state: &mut State, tokens: &mut Vec<Token>) -> SyntaxResult<()> {
989        let span = state.source.span();
990        let mut buffer = Vec::new();
991        let (kind, value) = loop {
992            match state.source.read(2) {
993                [b'$', b'{'] => {
994                    state.source.skip(2);
995                    state.enter(StackFrame::LookingForVarname);
996                    break (TokenKind::DollarLeftBrace, b"${".into());
997                }
998                [b'{', b'$'] => {
999                    // Intentionally only consume the left brace.
1000                    state.source.next();
1001                    state.enter(StackFrame::Scripting);
1002                    break (TokenKind::LeftBrace, b"{".into());
1003                }
1004                [b'`', ..] => {
1005                    state.source.next();
1006                    state.replace(StackFrame::Scripting);
1007                    break (TokenKind::Backtick, b"`".into());
1008                }
1009                [b'$', ident_start!()] => {
1010                    let mut var = state.source.read_and_skip(1).to_vec();
1011                    var.extend(self.consume_identifier(state));
1012
1013                    match state.source.read(4) {
1014                        [b'[', ..] => state.enter(StackFrame::VarOffset),
1015                        [b'-', b'>', ident_start!(), ..] | [b'?', b'-', b'>', ident_start!()] => {
1016                            state.enter(StackFrame::LookingForProperty)
1017                        }
1018                        _ => {}
1019                    }
1020
1021                    break (TokenKind::Variable, var.into());
1022                }
1023                &[b, ..] => {
1024                    state.source.next();
1025                    buffer.push(b);
1026                }
1027                [] => return Err(SyntaxError::UnexpectedEndOfFile(state.source.span())),
1028            }
1029        };
1030
1031        if !buffer.is_empty() {
1032            tokens.push(Token {
1033                kind: TokenKind::StringPart,
1034                span,
1035                value: buffer.into(),
1036            })
1037        }
1038
1039        tokens.push(Token { kind, span, value });
1040
1041        Ok(())
1042    }
1043
1044    fn heredoc(
1045        &self,
1046        state: &mut State,
1047        tokens: &mut Vec<Token>,
1048        label: ByteString,
1049    ) -> SyntaxResult<()> {
1050        let span = state.source.span();
1051        let mut buffer: Vec<u8> = Vec::new();
1052
1053        let (kind, value) = loop {
1054            match state.source.read(3) {
1055                [b'$', b'{', ..] => {
1056                    state.source.skip(2);
1057                    state.enter(StackFrame::LookingForVarname);
1058                    break (TokenKind::DollarLeftBrace, b"${".into());
1059                }
1060                [b'{', b'$', ..] => {
1061                    // Intentionally only consume the left brace.
1062                    state.source.next();
1063                    state.enter(StackFrame::Scripting);
1064                    break (TokenKind::LeftBrace, b"{".into());
1065                }
1066                &[b'\\', b @ (b'"' | b'\\' | b'$'), ..] => {
1067                    state.source.skip(2);
1068                    buffer.push(b);
1069                }
1070                &[b'\\', b'n', ..] => {
1071                    state.source.skip(2);
1072                    buffer.push(b'\n');
1073                }
1074                &[b'\\', b'r', ..] => {
1075                    state.source.skip(2);
1076                    buffer.push(b'\r');
1077                }
1078                &[b'\\', b't', ..] => {
1079                    state.source.skip(2);
1080                    buffer.push(b'\t');
1081                }
1082                &[b'\\', b'v', ..] => {
1083                    state.source.skip(2);
1084                    buffer.push(b'\x0b');
1085                }
1086                &[b'\\', b'e', ..] => {
1087                    state.source.skip(2);
1088                    buffer.push(b'\x1b');
1089                }
1090                &[b'\\', b'f', ..] => {
1091                    state.source.skip(2);
1092                    buffer.push(b'\x0c');
1093                }
1094                &[b'\\', b'x', b @ (b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')] => {
1095                    state.source.skip(3);
1096
1097                    let mut hex = String::from(b as char);
1098                    if let Some(b @ (b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')) =
1099                        state.source.current()
1100                    {
1101                        state.source.next();
1102                        hex.push(*b as char);
1103                    }
1104
1105                    let b = u8::from_str_radix(&hex, 16).unwrap();
1106                    buffer.push(b);
1107                }
1108                &[b'\\', b'u', b'{'] => {
1109                    state.source.skip(3);
1110
1111                    let mut code_point = String::new();
1112                    while let Some(b @ (b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')) =
1113                        state.source.current()
1114                    {
1115                        state.source.next();
1116                        code_point.push(*b as char);
1117                    }
1118
1119                    if code_point.is_empty() || state.source.current() != Some(&b'}') {
1120                        return Err(SyntaxError::InvalidUnicodeEscape(state.source.span()));
1121                    }
1122                    state.source.next();
1123
1124                    let c = if let Ok(c) = u32::from_str_radix(&code_point, 16) {
1125                        c
1126                    } else {
1127                        return Err(SyntaxError::InvalidUnicodeEscape(state.source.span()));
1128                    };
1129
1130                    if let Some(c) = char::from_u32(c) {
1131                        let mut tmp = [0; 4];
1132                        let bytes = c.encode_utf8(&mut tmp);
1133                        buffer.extend(bytes.as_bytes());
1134                    } else {
1135                        return Err(SyntaxError::InvalidUnicodeEscape(state.source.span()));
1136                    }
1137                }
1138                &[b'\\', b @ b'0'..=b'7', ..] => {
1139                    state.source.skip(2);
1140
1141                    let mut octal = String::from(b as char);
1142                    if let Some(b @ b'0'..=b'7') = state.source.current() {
1143                        state.source.next();
1144                        octal.push(*b as char);
1145                    }
1146                    if let Some(b @ b'0'..=b'7') = state.source.current() {
1147                        state.source.next();
1148                        octal.push(*b as char);
1149                    }
1150
1151                    if let Ok(b) = u8::from_str_radix(&octal, 8) {
1152                        buffer.push(b);
1153                    } else {
1154                        return Err(SyntaxError::InvalidOctalEscape(state.source.span()));
1155                    }
1156                }
1157                [b'$', ident_start!(), ..] => {
1158                    let mut var = state.source.read_and_skip(1).to_vec();
1159                    var.extend(self.consume_identifier(state));
1160
1161                    match state.source.read(4) {
1162                        [b'[', ..] => state.enter(StackFrame::VarOffset),
1163                        [b'-', b'>', ident_start!(), ..] | [b'?', b'-', b'>', ident_start!()] => {
1164                            state.enter(StackFrame::LookingForProperty)
1165                        }
1166                        _ => {}
1167                    }
1168
1169                    break (TokenKind::Variable, var.into());
1170                }
1171                // If we find a new-line, we can start to check if we can see the EndHeredoc token.
1172                [b'\n', ..] => {
1173                    buffer.push(b'\n');
1174                    state.source.next();
1175
1176                    // Check if we can see the closing label right here.
1177                    if state.source.at(&label, label.len()) {
1178                        state.source.skip(label.len());
1179                        state.replace(StackFrame::Scripting);
1180                        break (
1181                            TokenKind::EndDocString(DocStringIndentationKind::None, 0),
1182                            label,
1183                        );
1184                    }
1185
1186                    // Check if there's any whitespace first.
1187                    let (whitespace_kind, whitespace_amount) = match state.source.read(1) {
1188                        [b' '] => {
1189                            let mut amount = 0;
1190                            while state.source.read(1) == [b' '] {
1191                                amount += 1;
1192                                state.source.next();
1193                            }
1194                            (DocStringIndentationKind::Space, amount)
1195                        }
1196                        [b'\t'] => {
1197                            let mut amount = 0;
1198                            while state.source.read(1) == [b'\t'] {
1199                                amount += 1;
1200                                state.source.next();
1201                            }
1202                            (DocStringIndentationKind::Tab, amount)
1203                        }
1204                        _ => (DocStringIndentationKind::None, 0),
1205                    };
1206
1207                    // We've figured out what type of whitespace was being used
1208                    // at the start of the line.
1209                    // We should now check for any extra whitespace, of any kind.
1210                    let mut extra_whitespace_buffer = Vec::new();
1211                    while let [b @ b' ' | b @ b'\t'] = state.source.read(1) {
1212                        extra_whitespace_buffer.push(b);
1213                        state.source.next();
1214                    }
1215
1216                    // We've consumed all leading whitespace on this line now,
1217                    // so let's try to read the label again.
1218                    if state.source.at(&label, label.len()) {
1219                        // We've found the label, finally! We need to do 1 last
1220                        // check to make sure there wasn't a mixture of indentation types.
1221                        if whitespace_kind != DocStringIndentationKind::None
1222                            && !extra_whitespace_buffer.is_empty()
1223                        {
1224                            return Err(SyntaxError::InvalidDocIndentation(state.source.span()));
1225                        }
1226
1227                        // If we get here, only 1 type of indentation was found. We can move
1228                        // the process along by reading over the label and breaking out
1229                        // with the EndHeredoc token, storing the kind and amount of whitespace.
1230                        state.source.skip(label.len());
1231                        state.replace(StackFrame::Scripting);
1232                        break (
1233                            TokenKind::EndDocString(whitespace_kind, whitespace_amount),
1234                            label,
1235                        );
1236                    } else {
1237                        // We didn't find the label. The buffer still needs to know about
1238                        // the whitespace, so let's extend the buffer with the whitespace
1239                        // and let the loop run again to handle the rest of the line.
1240                        if whitespace_kind != DocStringIndentationKind::None {
1241                            let whitespace_char: u8 = whitespace_kind.into();
1242                            for _ in 0..whitespace_amount {
1243                                buffer.push(whitespace_char);
1244                            }
1245                        }
1246
1247                        buffer.extend(extra_whitespace_buffer);
1248                    }
1249                }
1250                &[b, ..] => {
1251                    state.source.next();
1252                    buffer.push(b);
1253                }
1254                [] => return Err(SyntaxError::UnexpectedEndOfFile(state.source.span())),
1255            }
1256        };
1257
1258        // Any trailing line breaks should be removed from the final heredoc.
1259        if buffer.last() == Some(&b'\n') {
1260            buffer.pop();
1261        }
1262
1263        if !buffer.is_empty() {
1264            tokens.push(Token {
1265                kind: TokenKind::StringPart,
1266                span,
1267                value: buffer.into(),
1268            })
1269        }
1270
1271        tokens.push(Token { kind, span, value });
1272
1273        Ok(())
1274    }
1275
1276    fn nowdoc(
1277        &self,
1278        state: &mut State,
1279        tokens: &mut Vec<Token>,
1280        label: ByteString,
1281    ) -> SyntaxResult<()> {
1282        let span = state.source.span();
1283        let mut buffer: Vec<u8> = Vec::new();
1284
1285        let (kind, value) = loop {
1286            match state.source.read(3) {
1287                // If we find a new-line, we can start to check if we can see the EndHeredoc token.
1288                [b'\n', ..] => {
1289                    buffer.push(b'\n');
1290                    state.source.next();
1291
1292                    // Check if we can see the closing label right here.
1293                    if state.source.at(&label, label.len()) {
1294                        state.source.skip(label.len());
1295                        state.replace(StackFrame::Scripting);
1296                        break (
1297                            TokenKind::EndDocString(DocStringIndentationKind::None, 0),
1298                            label,
1299                        );
1300                    }
1301
1302                    // Check if there's any whitespace first.
1303                    let (whitespace_kind, whitespace_amount) = match state.source.read(1) {
1304                        [b' '] => {
1305                            let mut amount = 0;
1306                            while state.source.read(1) == [b' '] {
1307                                amount += 1;
1308                                state.source.next();
1309                            }
1310                            (DocStringIndentationKind::Space, amount)
1311                        }
1312                        [b'\t'] => {
1313                            let mut amount = 0;
1314                            while state.source.read(1) == [b'\t'] {
1315                                amount += 1;
1316                                state.source.next();
1317                            }
1318                            (DocStringIndentationKind::Tab, amount)
1319                        }
1320                        _ => (DocStringIndentationKind::None, 0),
1321                    };
1322
1323                    // We've figured out what type of whitespace was being used
1324                    // at the start of the line.
1325                    // We should now check for any extra whitespace, of any kind.
1326                    let mut extra_whitespace_buffer = Vec::new();
1327                    while let [b @ b' ' | b @ b'\t'] = state.source.read(1) {
1328                        extra_whitespace_buffer.push(b);
1329                        state.source.next();
1330                    }
1331
1332                    // We've consumed all leading whitespace on this line now,
1333                    // so let's try to read the label again.
1334                    if state.source.at(&label, label.len()) {
1335                        // We've found the label, finally! We need to do 1 last
1336                        // check to make sure there wasn't a mixture of indentation types.
1337                        if whitespace_kind != DocStringIndentationKind::None
1338                            && !extra_whitespace_buffer.is_empty()
1339                        {
1340                            return Err(SyntaxError::InvalidDocIndentation(state.source.span()));
1341                        }
1342
1343                        // If we get here, only 1 type of indentation was found. We can move
1344                        // the process along by reading over the label and breaking out
1345                        // with the EndHeredoc token, storing the kind and amount of whitespace.
1346                        state.source.skip(label.len());
1347                        state.replace(StackFrame::Scripting);
1348                        break (
1349                            TokenKind::EndDocString(whitespace_kind, whitespace_amount),
1350                            label,
1351                        );
1352                    } else {
1353                        // We didn't find the label. The buffer still needs to know about
1354                        // the whitespace, so let's extend the buffer with the whitespace
1355                        // and let the loop run again to handle the rest of the line.
1356                        if whitespace_kind != DocStringIndentationKind::None {
1357                            let whitespace_char: u8 = whitespace_kind.into();
1358                            for _ in 0..whitespace_amount {
1359                                buffer.push(whitespace_char);
1360                            }
1361                        }
1362
1363                        buffer.extend(extra_whitespace_buffer);
1364                    }
1365                }
1366                &[b, ..] => {
1367                    state.source.next();
1368                    buffer.push(b);
1369                }
1370                [] => return Err(SyntaxError::UnexpectedEndOfFile(state.source.span())),
1371            }
1372        };
1373
1374        // Any trailing line breaks should be removed from the final heredoc.
1375        if buffer.last() == Some(&b'\n') {
1376            buffer.pop();
1377        }
1378
1379        if !buffer.is_empty() {
1380            tokens.push(Token {
1381                kind: TokenKind::StringPart,
1382                span,
1383                value: buffer.into(),
1384            })
1385        }
1386
1387        tokens.push(Token { kind, span, value });
1388
1389        Ok(())
1390    }
1391
1392    fn looking_for_varname(&self, state: &mut State) -> SyntaxResult<Option<Token>> {
1393        let identifier = self.peek_identifier(state);
1394
1395        if let Some(ident) = identifier {
1396            if let [b'[' | b'}'] = state.source.peek(ident.len(), 1) {
1397                let ident = ident.to_vec();
1398                let span = state.source.span();
1399                state.source.skip(ident.len());
1400                state.replace(StackFrame::Scripting);
1401                return Ok(Some(Token {
1402                    kind: TokenKind::Identifier,
1403                    span,
1404                    value: ident.into(),
1405                }));
1406            }
1407        }
1408
1409        state.replace(StackFrame::Scripting);
1410
1411        Ok(None)
1412    }
1413
1414    fn looking_for_property(&self, state: &mut State) -> SyntaxResult<Token> {
1415        let span = state.source.span();
1416        let (kind, value) = match state.source.read(3) {
1417            [b'?', b'-', b'>'] => {
1418                state.source.skip(3);
1419                (TokenKind::QuestionArrow, b"?->".into())
1420            }
1421            [b'-', b'>', ..] => {
1422                state.source.skip(2);
1423                (TokenKind::Arrow, b"->".into())
1424            }
1425            &[ident_start!(), ..] => {
1426                let buffer = self.consume_identifier(state);
1427                state.exit();
1428                (TokenKind::Identifier, buffer.into())
1429            }
1430            // Should be impossible as we already looked ahead this far inside double_quote.
1431            _ => unreachable!(),
1432        };
1433
1434        Ok(Token { kind, span, value })
1435    }
1436
1437    fn var_offset(&self, state: &mut State) -> SyntaxResult<Token> {
1438        let span = state.source.span();
1439        let (kind, value) = match state.source.read(2) {
1440            [b'$', ident_start!()] => self.tokenize_variable(state),
1441            [b'0'..=b'9', ..] => {
1442                // TODO: all integer literals are allowed, but only decimal integers with no underscores
1443                // are actually treated as numbers. Others are treated as strings.
1444                // Float literals are not allowed, but that could be handled in the parser.
1445                self.tokenize_number(state)?
1446            }
1447            [b'[', ..] => {
1448                state.source.next();
1449                (TokenKind::LeftBracket, b"[".into())
1450            }
1451            [b'-', ..] => {
1452                state.source.next();
1453                (TokenKind::Minus, b"-".into())
1454            }
1455            [b']', ..] => {
1456                state.source.next();
1457                state.exit();
1458                (TokenKind::RightBracket, b"]".into())
1459            }
1460            &[ident_start!(), ..] => {
1461                let label = self.consume_identifier(state);
1462                (TokenKind::Identifier, label.into())
1463            }
1464            &[b, ..] => return Err(SyntaxError::UnrecognisedToken(b, state.source.span())),
1465            [] => return Err(SyntaxError::UnexpectedEndOfFile(state.source.span())),
1466        };
1467        Ok(Token { kind, span, value })
1468    }
1469
1470    fn tokenize_single_quote_string(
1471        &self,
1472        state: &mut State,
1473    ) -> SyntaxResult<(TokenKind, ByteString)> {
1474        let mut buffer = vec![];
1475
1476        loop {
1477            match state.source.read(2) {
1478                [b'\'', ..] => {
1479                    state.source.next();
1480                    break;
1481                }
1482                &[b'\\', b @ b'\'' | b @ b'\\'] => {
1483                    state.source.skip(2);
1484                    buffer.push(b);
1485                }
1486                &[b, ..] => {
1487                    state.source.next();
1488                    buffer.push(b);
1489                }
1490                [] => return Err(SyntaxError::UnexpectedEndOfFile(state.source.span())),
1491            }
1492        }
1493
1494        Ok((TokenKind::LiteralSingleQuotedString, buffer.into()))
1495    }
1496
1497    fn tokenize_double_quote_string(
1498        &self,
1499        state: &mut State,
1500    ) -> SyntaxResult<(TokenKind, ByteString)> {
1501        let mut buffer = vec![];
1502
1503        let constant = loop {
1504            match state.source.read(3) {
1505                [b'"', ..] => {
1506                    state.source.next();
1507                    break true;
1508                }
1509                &[b'\\', b @ (b'"' | b'\\' | b'$'), ..] => {
1510                    state.source.skip(2);
1511                    buffer.push(b);
1512                }
1513                &[b'\\', b'n', ..] => {
1514                    state.source.skip(2);
1515                    buffer.push(b'\n');
1516                }
1517                &[b'\\', b'r', ..] => {
1518                    state.source.skip(2);
1519                    buffer.push(b'\r');
1520                }
1521                &[b'\\', b't', ..] => {
1522                    state.source.skip(2);
1523                    buffer.push(b'\t');
1524                }
1525                &[b'\\', b'v', ..] => {
1526                    state.source.skip(2);
1527                    buffer.push(b'\x0b');
1528                }
1529                &[b'\\', b'e', ..] => {
1530                    state.source.skip(2);
1531                    buffer.push(b'\x1b');
1532                }
1533                &[b'\\', b'f', ..] => {
1534                    state.source.skip(2);
1535                    buffer.push(b'\x0c');
1536                }
1537                &[b'\\', b'x', b @ (b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')] => {
1538                    state.source.skip(3);
1539
1540                    let mut hex = String::from(b as char);
1541                    if let Some(b @ (b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')) =
1542                        state.source.current()
1543                    {
1544                        state.source.next();
1545                        hex.push(*b as char);
1546                    }
1547
1548                    let b = u8::from_str_radix(&hex, 16).unwrap();
1549                    buffer.push(b);
1550                }
1551                &[b'\\', b'u', b'{'] => {
1552                    state.source.skip(3);
1553
1554                    let mut code_point = String::new();
1555                    while let Some(b @ (b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')) =
1556                        state.source.current()
1557                    {
1558                        state.source.next();
1559                        code_point.push(*b as char);
1560                    }
1561
1562                    if code_point.is_empty() || state.source.current() != Some(&b'}') {
1563                        return Err(SyntaxError::InvalidUnicodeEscape(state.source.span()));
1564                    }
1565                    state.source.next();
1566
1567                    let c = if let Ok(c) = u32::from_str_radix(&code_point, 16) {
1568                        c
1569                    } else {
1570                        return Err(SyntaxError::InvalidUnicodeEscape(state.source.span()));
1571                    };
1572
1573                    if let Some(c) = char::from_u32(c) {
1574                        let mut tmp = [0; 4];
1575                        let bytes = c.encode_utf8(&mut tmp);
1576                        buffer.extend(bytes.as_bytes());
1577                    } else {
1578                        return Err(SyntaxError::InvalidUnicodeEscape(state.source.span()));
1579                    }
1580                }
1581                &[b'\\', b @ b'0'..=b'7', ..] => {
1582                    state.source.skip(2);
1583
1584                    let mut octal = String::from(b as char);
1585                    if let Some(b @ b'0'..=b'7') = state.source.current() {
1586                        state.source.next();
1587                        octal.push(*b as char);
1588                    }
1589
1590                    if let Some(b @ b'0'..=b'7') = state.source.current() {
1591                        state.source.next();
1592                        octal.push(*b as char);
1593                    }
1594
1595                    if let Ok(b) = u8::from_str_radix(&octal, 8) {
1596                        buffer.push(b);
1597                    } else {
1598                        return Err(SyntaxError::InvalidOctalEscape(state.source.span()));
1599                    }
1600                }
1601                [b'$', ident_start!(), ..] | [b'{', b'$', ..] | [b'$', b'{', ..] => {
1602                    break false;
1603                }
1604                &[b, ..] => {
1605                    state.source.next();
1606                    buffer.push(b);
1607                }
1608                [] => return Err(SyntaxError::UnexpectedEndOfFile(state.source.span())),
1609            }
1610        };
1611
1612        Ok(if constant {
1613            (TokenKind::LiteralDoubleQuotedString, buffer.into())
1614        } else {
1615            state.replace(StackFrame::DoubleQuote);
1616            (TokenKind::StringPart, buffer.into())
1617        })
1618    }
1619
1620    fn peek_identifier<'a>(&'a self, state: &'a State) -> Option<&'a [u8]> {
1621        let mut size = 0;
1622
1623        if let [ident_start!()] = state.source.read(1) {
1624            size += 1;
1625            while let [ident!()] = state.source.peek(size, 1) {
1626                size += 1;
1627            }
1628
1629            Some(state.source.read(size))
1630        } else {
1631            None
1632        }
1633    }
1634
1635    fn consume_identifier(&self, state: &mut State) -> Vec<u8> {
1636        let ident = self.peek_identifier(state).unwrap().to_vec();
1637        state.source.skip(ident.len());
1638
1639        ident
1640    }
1641
1642    fn tokenize_variable(&self, state: &mut State) -> (TokenKind, ByteString) {
1643        let mut var = state.source.read_and_skip(1).to_vec();
1644        var.extend(self.consume_identifier(state));
1645        (TokenKind::Variable, var.into())
1646    }
1647
1648    fn tokenize_number(&self, state: &mut State) -> SyntaxResult<(TokenKind, ByteString)> {
1649        let mut buffer = Vec::new();
1650
1651        let (base, kind) = match state.source.read(2) {
1652            [a @ b'0', b @ b'B' | b @ b'b'] => {
1653                buffer.push(*a);
1654                buffer.push(*b);
1655                state.source.skip(2);
1656                (2, NumberKind::Int)
1657            }
1658            [a @ b'0', b @ b'O' | b @ b'o'] => {
1659                buffer.push(*a);
1660                buffer.push(*b);
1661                state.source.skip(2);
1662                (8, NumberKind::Int)
1663            }
1664            [a @ b'0', b @ b'X' | b @ b'x'] => {
1665                buffer.push(*a);
1666                buffer.push(*b);
1667                state.source.skip(2);
1668                (16, NumberKind::Int)
1669            }
1670            [b'0', ..] => (10, NumberKind::OctalOrFloat),
1671            [b'.', ..] => (10, NumberKind::Float),
1672            _ => (10, NumberKind::IntOrFloat),
1673        };
1674
1675        if kind != NumberKind::Float {
1676            self.read_digits(state, &mut buffer, base);
1677            if kind == NumberKind::Int {
1678                return parse_int(&buffer);
1679            }
1680        }
1681
1682        // Remaining cases: decimal integer, legacy octal integer, or float.
1683        let is_float = matches!(
1684            state.source.read(3),
1685            [b'.', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9'] | [b'e' | b'E', b'0'..=b'9', ..]
1686        );
1687
1688        if !is_float {
1689            return parse_int(&buffer);
1690        }
1691
1692        if let Some(b'.') = state.source.current() {
1693            buffer.push(b'.');
1694            state.source.next();
1695            self.read_digits(state, &mut buffer, 10);
1696        }
1697
1698        if let Some(b'e' | b'E') = state.source.current() {
1699            buffer.push(b'e');
1700            state.source.next();
1701            if let Some(b @ (b'-' | b'+')) = state.source.current() {
1702                buffer.push(*b);
1703                state.source.next();
1704            }
1705            self.read_digits(state, &mut buffer, 10);
1706        }
1707
1708        Ok((TokenKind::LiteralFloat, buffer.into()))
1709    }
1710
1711    fn read_digits(&self, state: &mut State, buffer: &mut Vec<u8>, base: usize) {
1712        if base == 16 {
1713            self.read_digits_fn(state, buffer, u8::is_ascii_hexdigit);
1714        } else {
1715            let max = b'0' + base as u8;
1716            self.read_digits_fn(state, buffer, |b| (b'0'..max).contains(b));
1717        };
1718    }
1719
1720    fn read_digits_fn<F: Fn(&u8) -> bool>(
1721        &self,
1722        state: &mut State,
1723        buffer: &mut Vec<u8>,
1724        is_digit: F,
1725    ) {
1726        if let Some(b) = state.source.current() {
1727            if is_digit(b) {
1728                state.source.next();
1729                buffer.push(*b);
1730            } else {
1731                return;
1732            }
1733        }
1734
1735        loop {
1736            match state.source.read(2) {
1737                [b, ..] if is_digit(b) => {
1738                    state.source.next();
1739                    buffer.push(*b);
1740                }
1741                [b'_', b] if is_digit(b) => {
1742                    state.source.next();
1743                    state.source.next();
1744                    buffer.push(*b);
1745                }
1746                _ => {
1747                    break;
1748                }
1749            }
1750        }
1751    }
1752}
1753
1754// Parses an integer literal in the given base and converts errors to SyntaxError.
1755// It returns a float token instead on overflow.
1756fn parse_int(buffer: &[u8]) -> SyntaxResult<(TokenKind, ByteString)> {
1757    Ok((TokenKind::LiteralInteger, buffer.into()))
1758}
1759
1760#[inline(always)]
1761fn identifier_to_keyword(ident: &[u8]) -> Option<TokenKind> {
1762    Some(match ident.to_ascii_lowercase().as_slice() {
1763        b"eval" => TokenKind::Eval,
1764        b"die" => TokenKind::Die,
1765        b"empty" => TokenKind::Empty,
1766        b"isset" => TokenKind::Isset,
1767        b"unset" => TokenKind::Unset,
1768        b"exit" => TokenKind::Exit,
1769        b"enddeclare" => TokenKind::EndDeclare,
1770        b"endswitch" => TokenKind::EndSwitch,
1771        b"endfor" => TokenKind::EndFor,
1772        b"endwhile" => TokenKind::EndWhile,
1773        b"endforeach" => TokenKind::EndForeach,
1774        b"endif" => TokenKind::EndIf,
1775        b"from" => TokenKind::From,
1776        b"and" => TokenKind::LogicalAnd,
1777        b"or" => TokenKind::LogicalOr,
1778        b"xor" => TokenKind::LogicalXor,
1779        b"print" => TokenKind::Print,
1780        b"__halt_compiler" => TokenKind::HaltCompiler,
1781        b"readonly" => TokenKind::Readonly,
1782        b"global" => TokenKind::Global,
1783        b"match" => TokenKind::Match,
1784        b"abstract" => TokenKind::Abstract,
1785        b"array" => TokenKind::Array,
1786        b"as" => TokenKind::As,
1787        b"break" => TokenKind::Break,
1788        b"case" => TokenKind::Case,
1789        b"catch" => TokenKind::Catch,
1790        b"class" => TokenKind::Class,
1791        b"clone" => TokenKind::Clone,
1792        b"continue" => TokenKind::Continue,
1793        b"const" => TokenKind::Const,
1794        b"declare" => TokenKind::Declare,
1795        b"default" => TokenKind::Default,
1796        b"do" => TokenKind::Do,
1797        b"echo" => TokenKind::Echo,
1798        b"else" => TokenKind::Else,
1799        b"elseif" => TokenKind::ElseIf,
1800        b"enum" => TokenKind::Enum,
1801        b"extends" => TokenKind::Extends,
1802        b"false" => TokenKind::False,
1803        b"final" => TokenKind::Final,
1804        b"finally" => TokenKind::Finally,
1805        b"fn" => TokenKind::Fn,
1806        b"for" => TokenKind::For,
1807        b"foreach" => TokenKind::Foreach,
1808        b"function" => TokenKind::Function,
1809        b"goto" => TokenKind::Goto,
1810        b"if" => TokenKind::If,
1811        b"include" => TokenKind::Include,
1812        b"include_once" => TokenKind::IncludeOnce,
1813        b"implements" => TokenKind::Implements,
1814        b"interface" => TokenKind::Interface,
1815        b"instanceof" => TokenKind::Instanceof,
1816        b"namespace" => TokenKind::Namespace,
1817        b"new" => TokenKind::New,
1818        b"null" => TokenKind::Null,
1819        b"private" => TokenKind::Private,
1820        b"protected" => TokenKind::Protected,
1821        b"public" => TokenKind::Public,
1822        b"require" => TokenKind::Require,
1823        b"require_once" => TokenKind::RequireOnce,
1824        b"return" => TokenKind::Return,
1825        b"static" => TokenKind::Static,
1826        b"switch" => TokenKind::Switch,
1827        b"throw" => TokenKind::Throw,
1828        b"trait" => TokenKind::Trait,
1829        b"true" => TokenKind::True,
1830        b"try" => TokenKind::Try,
1831        b"use" => TokenKind::Use,
1832        b"var" => TokenKind::Var,
1833        b"yield" => TokenKind::Yield,
1834        b"__dir__" => TokenKind::DirConstant,
1835        b"__file__" => TokenKind::FileConstant,
1836        b"__line__" => TokenKind::LineConstant,
1837        b"__function__" => TokenKind::FunctionConstant,
1838        b"__class__" => TokenKind::ClassConstant,
1839        b"__method__" => TokenKind::MethodConstant,
1840        b"__trait__" => TokenKind::TraitConstant,
1841        b"__namespace__" => TokenKind::NamespaceConstant,
1842        b"__compiler_halt_offset__" => TokenKind::CompilerHaltOffsetConstant,
1843        b"while" => TokenKind::While,
1844        b"insteadof" => TokenKind::Insteadof,
1845        b"list" => TokenKind::List,
1846        b"self" => TokenKind::Self_,
1847        b"parent" => TokenKind::Parent,
1848        _ => return None,
1849    })
1850}
1851
// Classification of a numeric literal, decided from its first one or two bytes
// before any digits are read.
#[derive(Debug, PartialEq, Eq)]
enum NumberKind {
    // Carries an explicit base prefix (`0b`, `0o` or `0x`); always an integer.
    Int,
    // Begins with `.`; there is no integer part to read.
    Float,
    // Begins with anything else; an integer unless a fraction or exponent follows.
    IntOrFloat,
    // Begins with `0` but has no base prefix; a legacy octal integer or a float.
    OctalOrFloat,
}
php_parser_rs/lexer/mod.rs

php_parser_rs/lexer/
mod.rs