mago_syntax/lexer/
mod.rs

1#![allow(clippy::unreachable)]
2
3use core::hint::unreachable_unchecked;
4
5use std::collections::VecDeque;
6use std::fmt::Debug;
7
8use memchr::memchr2;
9use memchr::memmem;
10
11/// Lookup table for single-character tokens that are ALWAYS single-char
12/// (i.e., they can never be part of a multi-character token).
13/// Maps byte -> Option<TokenKind>
14const SIMPLE_TOKEN_TABLE: [Option<TokenKind>; 256] = {
15    let mut table: [Option<TokenKind>; 256] = [None; 256];
16    table[b';' as usize] = Some(TokenKind::Semicolon);
17    table[b',' as usize] = Some(TokenKind::Comma);
18    table[b')' as usize] = Some(TokenKind::RightParenthesis);
19    table[b'[' as usize] = Some(TokenKind::LeftBracket);
20    table[b']' as usize] = Some(TokenKind::RightBracket);
21    table[b'{' as usize] = Some(TokenKind::LeftBrace);
22    table[b'}' as usize] = Some(TokenKind::RightBrace);
23    table[b'~' as usize] = Some(TokenKind::Tilde);
24    table[b'@' as usize] = Some(TokenKind::At);
25    table
26};
27
/// Byte-indexed lookup table marking which bytes may start an identifier.
///
/// An entry is `true` for ASCII letters (`a-z`, `A-Z`), the underscore, and every byte in
/// the range `0x80..=0xFF`; all other entries are `false`.
const IDENT_START_TABLE: [bool; 256] = {
    let mut table = [false; 256];

    // Walk every possible byte value; the exit check sits before the increment so that
    // `byte` never overflows past `u8::MAX`.
    let mut byte = u8::MIN;
    loop {
        table[byte as usize] = byte.is_ascii_alphabetic() || byte == b'_' || byte >= 0x80;

        if byte == u8::MAX {
            break;
        }

        byte += 1;
    }

    table
};
39
40use mago_database::file::FileId;
41use mago_database::file::HasFileId;
42use mago_span::Position;
43use mago_syntax_core::float_exponent;
44use mago_syntax_core::float_separator;
45use mago_syntax_core::input::Input;
46use mago_syntax_core::number_sign;
47use mago_syntax_core::start_of_binary_number;
48use mago_syntax_core::start_of_float_number;
49use mago_syntax_core::start_of_hexadecimal_number;
50use mago_syntax_core::start_of_identifier;
51use mago_syntax_core::start_of_number;
52use mago_syntax_core::start_of_octal_number;
53use mago_syntax_core::start_of_octal_or_float_number;
54use mago_syntax_core::utils::is_part_of_identifier;
55use mago_syntax_core::utils::is_start_of_identifier;
56use mago_syntax_core::utils::read_digits_of_base;
57
58use crate::error::SyntaxError;
59use crate::lexer::internal::mode::HaltStage;
60use crate::lexer::internal::mode::Interpolation;
61use crate::lexer::internal::mode::LexerMode;
62use crate::lexer::internal::utils::NumberKind;
63use crate::settings::LexerSettings;
64use crate::token::DocumentKind;
65use crate::token::Token;
66use crate::token::TokenKind;
67
68mod internal;
69
70/// The `Lexer` struct is responsible for tokenizing input source code into discrete tokens
71/// based on PHP language syntax. It is designed to work with PHP code from version 7.0 up to 8.4.
72///
73/// The lexer reads through the provided input and processes it accordingly.
74///
75/// It identifies PHP-specific tokens, including operators, keywords, comments, strings, and other syntax elements,
76/// and produces a sequence of [`Token`] objects that are used in further stages of compilation or interpretation.
77///
78/// The lexer is designed to be used in a streaming fashion, where it reads the input source code in chunks
79/// and produces tokens incrementally. This allows for efficient processing of large source files and
80/// minimizes memory usage.
81#[derive(Debug)]
82pub struct Lexer<'input> {
83    input: Input<'input>,
84    settings: LexerSettings,
85    mode: LexerMode<'input>,
86    interpolating: bool,
87    brace_interpolating: bool,
88    /// Buffer for tokens during string interpolation.
89    buffer: VecDeque<Token<'input>>,
90}
91
92impl<'input> Lexer<'input> {
93    /// Initial capacity for the token buffer used during string interpolation.
94    /// Pre-allocating avoids reallocation during interpolation processing.
95    const BUFFER_INITIAL_CAPACITY: usize = 8;
96
97    /// Creates a new `Lexer` instance.
98    ///
99    /// # Parameters
100    ///
101    /// - `input`: The input source code to tokenize.
102    /// - `settings`: The lexer settings.
103    ///
104    /// # Returns
105    ///
106    /// A new `Lexer` instance that reads from the provided byte slice.
107    #[must_use]
108    pub fn new(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
109        Lexer {
110            input,
111            settings,
112            mode: LexerMode::Inline,
113            interpolating: false,
114            brace_interpolating: false,
115            buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
116        }
117    }
118
119    /// Creates a new `Lexer` instance for parsing a script block.
120    ///
121    /// # Parameters
122    ///
123    /// - `input`: The input source code to tokenize.
124    /// - `settings`: The lexer settings.
125    ///
126    /// # Returns
127    ///
128    /// A new `Lexer` instance that reads from the provided byte slice.
129    #[must_use]
130    pub fn scripting(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
131        Lexer {
132            input,
133            settings,
134            mode: LexerMode::Script,
135            interpolating: false,
136            brace_interpolating: false,
137            buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
138        }
139    }
140
141    /// Check if the lexer has reached the end of the input.
142    ///
143    /// If this method returns `true`, the lexer will not produce any more tokens.
144    #[must_use]
145    pub fn has_reached_eof(&self) -> bool {
146        self.input.has_reached_eof()
147    }
148
149    /// Get the current position of the lexer in the input source code.
150    #[inline]
151    #[must_use]
152    pub const fn current_position(&self) -> Position {
153        self.input.current_position()
154    }
155
156    /// Tokenizes the next input from the source code.
157    ///
158    /// This method reads from the input and produces the next [`Token`] based on the current [`LexerMode`].
159    /// It handles various lexical elements such as inline text, script code, strings with interpolation,
160    /// comments, and different PHP-specific constructs.
161    ///
162    /// # Returns
163    ///
164    /// - `Some(Ok(Token))` if a token was successfully parsed.
165    /// - `Some(Err(SyntaxError))` if a syntax error occurred while parsing the next token.
166    /// - `None` if the end of the input has been reached.
167    ///
168    /// # Notes
169    ///
170    /// - It efficiently handles tokenization by consuming input based on patterns specific to PHP syntax.
171    /// - The lexer supports complex features like string interpolation and different numeric formats.
172    ///
173    /// # Errors
174    ///
175    /// Returns `Some(Err(SyntaxError))` in cases such as:
176    ///
177    /// - Unrecognized tokens that do not match any known PHP syntax.
178    /// - Unexpected tokens in a given context, such as an unexpected end of string.
179    ///
180    /// # Panics
181    ///
182    /// This method should not panic under normal operation. If it does, it indicates a bug in the lexer implementation.
183    ///
184    /// # See Also
185    ///
186    /// - [`Token`]: Represents a lexical token with its kind, value, and span.
187    /// - [`SyntaxError`]: Represents errors that can occur during lexing.
188    #[inline]
189    pub fn advance(&mut self) -> Option<Result<Token<'input>, SyntaxError>> {
190        // Check if there are buffered tokens from string interpolation.
191        if !self.interpolating
192            && let Some(token) = self.buffer.pop_front()
193        {
194            return Some(Ok(token));
195        }
196
197        if self.input.has_reached_eof() {
198            return None;
199        }
200
201        match self.mode {
202            LexerMode::Inline => {
203                let start = self.input.current_position();
204                let offset = self.input.current_offset();
205
206                // Shebang is only valid at the absolute start of the file (offset 0).
207                if offset == 0
208                    && self.input.len() >= 2
209                    // SAFETY: `self.input.len() >= 2` was just checked, so indices 0 and 1 are in bounds.
210                    && unsafe { *self.input.read_at_unchecked(0) } == b'#'
211                    // SAFETY: same as above.
212                    && unsafe { *self.input.read_at_unchecked(1) } == b'!'
213                {
214                    let buffer = self.input.consume_through(b'\n');
215                    let end = self.input.current_position();
216
217                    return Some(Ok(self.token(TokenKind::InlineShebang, buffer, start, end)));
218                }
219
220                // Get the remaining bytes to scan.
221                let bytes = self.input.read_remaining();
222
223                if self.settings.enable_short_tags {
224                    if let Some(pos) = memchr::memmem::find(bytes, b"<?") {
225                        if pos > 0 {
226                            let buffer = self.input.consume(pos);
227                            let end = self.input.current_position();
228
229                            return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
230                        }
231
232                        if self.input.is_at(b"<?php", true) {
233                            let buffer = self.input.consume(5);
234                            self.mode = LexerMode::Script;
235                            return Some(Ok(self.token(
236                                TokenKind::OpenTag,
237                                buffer,
238                                start,
239                                self.input.current_position(),
240                            )));
241                        }
242
243                        if self.input.is_at(b"<?=", false) {
244                            let buffer = self.input.consume(3);
245                            self.mode = LexerMode::Script;
246                            return Some(Ok(self.token(
247                                TokenKind::EchoTag,
248                                buffer,
249                                start,
250                                self.input.current_position(),
251                            )));
252                        }
253
254                        let buffer = self.input.consume(2);
255                        self.mode = LexerMode::Script;
256                        return Some(Ok(self.token(
257                            TokenKind::ShortOpenTag,
258                            buffer,
259                            start,
260                            self.input.current_position(),
261                        )));
262                    }
263                } else {
264                    let iter = memchr::memmem::find_iter(bytes, b"<?");
265
266                    for pos in iter {
267                        // SAFETY: `pos` is guaranteed to be within `bytes` by `find_iter`.
268                        let candidate = unsafe { bytes.get_unchecked(pos..) };
269
270                        if candidate.len() >= 5
271                            // SAFETY: `candidate.len() >= 5` was just checked, so indices 2, 3, 4 are in bounds.
272                            && (unsafe { *candidate.get_unchecked(2) } | 0x20) == b'p'
273                            // SAFETY: same as above.
274                            && (unsafe { *candidate.get_unchecked(3) } | 0x20) == b'h'
275                            // SAFETY: same as above.
276                            && (unsafe { *candidate.get_unchecked(4) } | 0x20) == b'p'
277                        {
278                            if pos > 0 {
279                                let buffer = self.input.consume(pos);
280                                let end = self.input.current_position();
281                                return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
282                            }
283
284                            let buffer = self.input.consume(5);
285                            self.mode = LexerMode::Script;
286                            return Some(Ok(self.token(
287                                TokenKind::OpenTag,
288                                buffer,
289                                start,
290                                self.input.current_position(),
291                            )));
292                        }
293
294                        // SAFETY: index 2 is in bounds because the right-hand side is only evaluated when
295                        // `candidate.len() >= 3` holds.
296                        if candidate.len() >= 3 && unsafe { *candidate.get_unchecked(2) } == b'=' {
297                            if pos > 0 {
298                                let buffer = self.input.consume(pos);
299                                let end = self.input.current_position();
300                                return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
301                            }
302
303                            let buffer = self.input.consume(3);
304                            self.mode = LexerMode::Script;
305                            return Some(Ok(self.token(
306                                TokenKind::EchoTag,
307                                buffer,
308                                start,
309                                self.input.current_position(),
310                            )));
311                        }
312                    }
313                }
314
315                if self.input.has_reached_eof() {
316                    return None;
317                }
318
319                let buffer = self.input.consume_remaining();
320                let end = self.input.current_position();
321                Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)))
322            }
323            LexerMode::Script => {
324                let start = self.input.current_position();
325                let whitespaces = self.input.consume_whitespaces();
326                if !whitespaces.is_empty() {
327                    return Some(Ok(self.token(
328                        TokenKind::Whitespace,
329                        whitespaces,
330                        start,
331                        self.input.current_position(),
332                    )));
333                }
334
335                let Some(&first_byte) = self.input.read(1).first() else {
336                    // SAFETY: we check for EOF before entering scripting section,
337                    unsafe { unreachable_unchecked() }
338                };
339
340                if let Some(kind) = SIMPLE_TOKEN_TABLE[first_byte as usize] {
341                    let buffer = self.input.consume(1);
342                    let end = self.input.current_position();
343                    return Some(Ok(self.token(kind, buffer, start, end)));
344                }
345
346                if IDENT_START_TABLE[first_byte as usize] {
347                    let is_binary_string_prefix = !self.interpolating
348                        && matches!(first_byte, b'b' | b'B')
349                        && matches!(self.input.read(4), [_, b'\'' | b'"', ..] | [_, b'<', b'<', b'<']);
350
351                    if !is_binary_string_prefix {
352                        let (token_kind, len) = self.scan_identifier_or_keyword_info();
353
354                        if token_kind == TokenKind::HaltCompiler {
355                            self.mode = LexerMode::Halt(HaltStage::LookingForLeftParenthesis);
356                        }
357
358                        let buffer = self.input.consume(len);
359                        let end = self.input.current_position();
360                        return Some(Ok(self.token(token_kind, buffer, start, end)));
361                    }
362
363                    // Fall through to handle b-prefix strings in the match block below
364                }
365
366                if first_byte == b'$'
367                    && let Some(&next) = self.input.read(2).get(1)
368                    && IDENT_START_TABLE[next as usize]
369                {
370                    let (ident_len, _) = self.input.scan_identifier(1);
371                    let buffer = self.input.consume(1 + ident_len);
372                    let end = self.input.current_position();
373                    return Some(Ok(self.token(TokenKind::Variable, buffer, start, end)));
374                }
375
376                let mut document_label: &[u8] = &[];
377
378                let (token_kind, len) = match self.input.read(3) {
379                    [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
380                    [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
381                    [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
382                    [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
383                    [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
384                    [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
385                    [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
386                    [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
387                    [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
388                    [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input, 0) => {
389                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false, 0);
390
391                        document_label = self.input.peek(3 + whitespaces, label_length);
392
393                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
394                    }
395                    [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input, 0) => {
396                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true, 0);
397
398                        document_label = self.input.peek(4 + whitespaces, label_length);
399
400                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
401                    }
402                    [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input, 0) => {
403                        let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input, 0);
404
405                        document_label = self.input.peek(4 + whitespaces, label_length);
406
407                        (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
408                    }
409                    [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
410                    [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
411                    [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
412                    [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
413                    [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
414                    [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
415                    [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
416                    [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
417                    [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
418                    [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
419                    [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
420                    [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
421                    [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
422                    [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
423                    [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
424                    [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
425                    [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
426                    [b'>', b'>', ..] => (TokenKind::RightShift, 2),
427                    [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
428                    [b':', b':', ..] => (TokenKind::ColonColon, 2),
429                    [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
430                    [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
431                    [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
432                    [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
433                    [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
434                    [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
435                    [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
436                    [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
437                    [b'/', b'/', ..] => {
438                        let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
439                        let comment_len = scan_single_line_comment(remaining);
440                        (TokenKind::SingleLineComment, 2 + comment_len)
441                    }
442                    [b'/', b'*', asterisk] => {
443                        let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
444                        match scan_multi_line_comment(remaining) {
445                            Some(len) => {
446                                let is_docblock = asterisk == &b'*' && len > 2;
447                                if is_docblock {
448                                    (TokenKind::DocBlockComment, len + 2)
449                                } else {
450                                    (TokenKind::MultiLineComment, len + 2)
451                                }
452                            }
453                            None => {
454                                self.input.consume(remaining.len() + 2);
455                                return Some(Err(SyntaxError::UnexpectedEndOfFile(
456                                    self.file_id(),
457                                    self.input.current_position(),
458                                )));
459                            }
460                        }
461                    }
462                    [b'\\', start_of_identifier!(), ..] => {
463                        let mut length = 1;
464                        loop {
465                            let (ident_len, ends_with_ns) = self.input.scan_identifier(length);
466                            length += ident_len;
467                            if ends_with_ns {
468                                length += 1; // Include the backslash
469                            } else {
470                                break;
471                            }
472                        }
473
474                        (TokenKind::FullyQualifiedIdentifier, length)
475                    }
476                    [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
477                    [b'$', ..] => (TokenKind::Dollar, 1),
478                    [b'!', ..] => (TokenKind::Bang, 1),
479                    [b'&', ..] => (TokenKind::Ampersand, 1),
480                    [b'?', ..] => (TokenKind::Question, 1),
481                    [b'=', ..] => (TokenKind::Equal, 1),
482                    [b'`', ..] => (TokenKind::Backtick, 1),
483                    [b'+', ..] => (TokenKind::Plus, 1),
484                    [b'%', ..] => (TokenKind::Percent, 1),
485                    [b'-', ..] => (TokenKind::Minus, 1),
486                    [b'<', ..] => (TokenKind::LessThan, 1),
487                    [b'>', ..] => (TokenKind::GreaterThan, 1),
488                    [b':', ..] => (TokenKind::Colon, 1),
489                    [b'|', ..] => (TokenKind::Pipe, 1),
490                    [b'^', ..] => (TokenKind::Caret, 1),
491                    [b'*', ..] => (TokenKind::Asterisk, 1),
492                    [b'/', ..] => (TokenKind::Slash, 1),
493                    [b'b' | b'B', b'\'', ..] => read_literal_string(&self.input, b'\'', 1),
494                    [b'b' | b'B', b'"', ..] if matches_literal_double_quote_string(&self.input, 1) => {
495                        read_literal_string(&self.input, b'"', 1)
496                    }
497                    [b'b' | b'B', b'"', ..] => (TokenKind::DoubleQuote, 2),
498                    [b'b' | b'B', b'<', b'<']
499                        if self.input.read(4).len() == 4
500                            && self.input.read(4)[3] == b'<'
501                            && matches_start_of_heredoc_document(&self.input, 1) =>
502                    {
503                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false, 1);
504
505                        document_label = self.input.peek(4 + whitespaces, label_length);
506
507                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
508                    }
509                    [b'b' | b'B', b'<', b'<']
510                        if self.input.read(4).len() == 4
511                            && self.input.read(4)[3] == b'<'
512                            && matches_start_of_double_quote_heredoc_document(&self.input, 1) =>
513                    {
514                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true, 1);
515
516                        document_label = self.input.peek(5 + whitespaces, label_length);
517
518                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
519                    }
520                    [b'b' | b'B', b'<', b'<']
521                        if self.input.read(4).len() == 4
522                            && self.input.read(4)[3] == b'<'
523                            && matches_start_of_nowdoc_document(&self.input, 1) =>
524                    {
525                        let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input, 1);
526
527                        document_label = self.input.peek(5 + whitespaces, label_length);
528
529                        (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
530                    }
531                    // Regular string literals
532                    [quote @ b'\'', ..] => read_literal_string(&self.input, *quote, 0),
533                    [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input, 0) => {
534                        read_literal_string(&self.input, *quote, 0)
535                    }
536                    [b'"', ..] => (TokenKind::DoubleQuote, 1),
537                    [b'(', ..] => 'parenthesis: {
538                        let mut peek_offset = 1;
539                        while let Some(&b) = self.input.read(peek_offset + 1).get(peek_offset) {
540                            if b.is_ascii_whitespace() {
541                                peek_offset += 1;
542                            } else {
543                                // Check if this byte could start a cast type (case-insensitive)
544                                let lower = b | 0x20; // ASCII lowercase
545                                if !matches!(lower, b'i' | b'b' | b'f' | b'd' | b'r' | b's' | b'a' | b'o' | b'u' | b'v')
546                                {
547                                    break 'parenthesis (TokenKind::LeftParenthesis, 1);
548                                }
549                                break;
550                            }
551                        }
552
553                        for (value, kind) in internal::consts::CAST_TYPES {
554                            if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
555                                break 'parenthesis (kind, length);
556                            }
557                        }
558
559                        (TokenKind::LeftParenthesis, 1)
560                    }
561                    [b'#', ..] => {
562                        let remaining = self.input.peek(1, self.input.len() - self.input.current_offset());
563                        let comment_len = scan_single_line_comment(remaining);
564                        (TokenKind::HashComment, 1 + comment_len)
565                    }
566                    [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
567                    [b'.', start_of_number!(), ..] => {
568                        let mut length = read_digits_of_base(&self.input, 2, 10);
569                        if let float_exponent!() = self.input.peek(length, 1) {
570                            let mut exp_length = length + 1;
571                            if let number_sign!() = self.input.peek(exp_length, 1) {
572                                exp_length += 1;
573                            }
574
575                            let after_exp = read_digits_of_base(&self.input, exp_length, 10);
576                            if after_exp > exp_length {
577                                length = after_exp;
578                            }
579                        }
580
581                        (TokenKind::LiteralFloat, length)
582                    }
583                    [start_of_number!(), ..] => 'number: {
584                        let mut length = 1;
585
586                        let (base, kind): (u8, NumberKind) = match self.input.read(3) {
587                            start_of_binary_number!() => {
588                                length += 1;
589
590                                (2, NumberKind::Integer)
591                            }
592                            start_of_octal_number!() => {
593                                length += 1;
594
595                                (8, NumberKind::Integer)
596                            }
597                            start_of_hexadecimal_number!() => {
598                                length += 1;
599
600                                (16, NumberKind::Integer)
601                            }
602                            start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
603                            start_of_float_number!() => (10, NumberKind::Float),
604                            _ => (10, NumberKind::IntegerOrFloat),
605                        };
606
607                        if kind != NumberKind::Float {
608                            length = read_digits_of_base(&self.input, length, base);
609
610                            if kind == NumberKind::Integer {
611                                break 'number (TokenKind::LiteralInteger, length);
612                            }
613                        }
614
615                        let is_float = matches!(self.input.peek(length, 3), float_separator!());
616
617                        if !is_float {
618                            if kind == NumberKind::OctalOrFloat
619                                && let Some(invalid_idx) =
620                                    (1..length).find(|&i| matches!(self.input.peek(i, 1), [b'8' | b'9']))
621                            {
622                                let invalid_byte = self.input.peek(invalid_idx, 1)[0];
623                                let start = self.input.current_position();
624                                let invalid_position = Position { offset: start.offset + invalid_idx as u32 };
625                                self.input.consume(length);
626                                return Some(Err(SyntaxError::UnexpectedToken(
627                                    self.file_id(),
628                                    invalid_byte,
629                                    invalid_position,
630                                )));
631                            }
632                            break 'number (TokenKind::LiteralInteger, length);
633                        }
634
635                        if let [b'.'] = self.input.peek(length, 1) {
636                            length += 1;
637                            length = read_digits_of_base(&self.input, length, 10);
638                        }
639
640                        if let float_exponent!() = self.input.peek(length, 1) {
641                            // Only include exponent if there are digits after it
642                            let mut exp_length = length + 1;
643                            if let number_sign!() = self.input.peek(exp_length, 1) {
644                                exp_length += 1;
645                            }
646                            let after_exp = read_digits_of_base(&self.input, exp_length, 10);
647                            if after_exp > exp_length {
648                                // There are digits after the exponent marker
649                                length = after_exp;
650                            }
651                        }
652
653                        (TokenKind::LiteralFloat, length)
654                    }
655                    [b'.', ..] => (TokenKind::Dot, 1),
656                    [unknown_byte, ..] => {
657                        let position = self.input.current_position();
658                        self.input.consume(1);
659
660                        return Some(Err(SyntaxError::UnrecognizedToken(self.file_id(), *unknown_byte, position)));
661                    }
662                    [] => {
663                        // We check for EOF before entering scripting section, so this should be
664                        // unreachable. If we ever land here it means an upstream invariant broke,
665                        // so signal EOF gracefully rather than panicking inside the lexer.
666                        return None;
667                    }
668                };
669
670                self.mode = match token_kind {
671                    TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
672                    TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
673                    TokenKind::CloseTag => LexerMode::Inline,
674                    TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
675                    TokenKind::DocumentStart(document_kind) => {
676                        LexerMode::DocumentString(document_kind, document_label, Interpolation::None)
677                    }
678                    _ => LexerMode::Script,
679                };
680
681                let buffer = self.input.consume(len);
682                let end = self.input.current_position();
683
684                Some(Ok(self.token(token_kind, buffer, start, end)))
685            }
686            LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
687                Interpolation::None => {
688                    let start = self.input.current_position();
689
690                    let mut length = 0;
691                    let mut last_was_slash = false;
692                    let mut token_kind = TokenKind::StringPart;
693                    loop {
694                        match self.input.peek(length, 2) {
695                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
696                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
697
698                                self.mode =
699                                    LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
700
701                                break;
702                            }
703                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
704                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
705
706                                self.mode = LexerMode::DoubleQuoteString(Interpolation::BraceUntil(
707                                    start.offset + until_offset,
708                                ));
709
710                                break;
711                            }
712                            [b'\\', ..] => {
713                                length += 1;
714
715                                last_was_slash = !last_was_slash;
716                            }
717                            [b'"', ..] if !last_was_slash => {
718                                if length == 0 {
719                                    length += 1;
720                                    token_kind = TokenKind::DoubleQuote;
721
722                                    break;
723                                }
724
725                                break;
726                            }
727                            [_, ..] => {
728                                length += 1;
729                                last_was_slash = false;
730                            }
731                            [] => {
732                                break;
733                            }
734                        }
735                    }
736
737                    let buffer = self.input.consume(length);
738                    let end = self.input.current_position();
739
740                    if TokenKind::DoubleQuote == token_kind {
741                        self.mode = LexerMode::Script;
742                    }
743
744                    Some(Ok(self.token(token_kind, buffer, start, end)))
745                }
746                Interpolation::Until(offset) => {
747                    self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None), false)
748                }
749                Interpolation::BraceUntil(offset) => {
750                    self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None), true)
751                }
752            },
753            LexerMode::ShellExecuteString(interpolation) => match &interpolation {
754                Interpolation::None => {
755                    let start = self.input.current_position();
756
757                    let mut length = 0;
758                    let mut last_was_slash = false;
759                    let mut token_kind = TokenKind::StringPart;
760                    loop {
761                        match self.input.peek(length, 2) {
762                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
763                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
764
765                                self.mode =
766                                    LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
767
768                                break;
769                            }
770                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
771                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
772
773                                self.mode = LexerMode::ShellExecuteString(Interpolation::BraceUntil(
774                                    start.offset + until_offset,
775                                ));
776
777                                break;
778                            }
779                            [b'\\', ..] => {
780                                length += 1;
781                                last_was_slash = !last_was_slash;
782                            }
783                            [b'`', ..] if !last_was_slash => {
784                                if length == 0 {
785                                    length += 1;
786                                    token_kind = TokenKind::Backtick;
787
788                                    break;
789                                }
790
791                                break;
792                            }
793                            [_, ..] => {
794                                length += 1;
795                                last_was_slash = false;
796                            }
797                            [] => {
798                                break;
799                            }
800                        }
801                    }
802
803                    let buffer = self.input.consume(length);
804                    let end = self.input.current_position();
805
806                    if TokenKind::Backtick == token_kind {
807                        self.mode = LexerMode::Script;
808                    }
809
810                    Some(Ok(self.token(token_kind, buffer, start, end)))
811                }
812                Interpolation::Until(offset) => {
813                    self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None), false)
814                }
815                Interpolation::BraceUntil(offset) => {
816                    self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None), true)
817                }
818            },
819            LexerMode::DocumentString(kind, label, interpolation) => match &kind {
820                DocumentKind::Heredoc => match &interpolation {
821                    Interpolation::None => {
822                        let start = self.input.current_position();
823
824                        let mut length = 0;
825                        let mut last_was_slash = false;
826                        let mut only_whitespaces = true;
827                        let mut token_kind = TokenKind::StringPart;
828                        loop {
829                            match self.input.peek(length, 2) {
830                                [b'\r', b'\n'] => {
831                                    length += 2;
832
833                                    break;
834                                }
835                                [b'\n' | b'\r', ..] => {
836                                    length += 1;
837
838                                    break;
839                                }
840                                [byte, ..] if byte.is_ascii_whitespace() => {
841                                    length += 1;
842                                }
843                                [b'$', start_of_identifier!(), ..] if !last_was_slash => {
844                                    let until_offset =
845                                        read_until_end_of_variable_interpolation(&self.input, length + 2);
846
847                                    self.mode = LexerMode::DocumentString(
848                                        kind,
849                                        label,
850                                        Interpolation::Until(start.offset + until_offset),
851                                    );
852
853                                    break;
854                                }
855                                [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
856                                    let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
857
858                                    self.mode = LexerMode::DocumentString(
859                                        kind,
860                                        label,
861                                        Interpolation::BraceUntil(start.offset + until_offset),
862                                    );
863
864                                    break;
865                                }
866                                [b'\\', ..] => {
867                                    length += 1;
868                                    last_was_slash = !last_was_slash;
869                                    only_whitespaces = false;
870                                }
871                                [_, ..] => {
872                                    if only_whitespaces
873                                        && self.input.peek(length, label.len()) == label
874                                        && self
875                                            .input
876                                            .peek(length + label.len(), 1)
877                                            .first()
878                                            .is_none_or(|c| !is_part_of_identifier(c))
879                                    {
880                                        length += label.len();
881                                        token_kind = TokenKind::DocumentEnd;
882
883                                        break;
884                                    }
885
886                                    length += 1;
887                                    last_was_slash = false;
888                                    only_whitespaces = false;
889                                }
890                                [] => {
891                                    break;
892                                }
893                            }
894                        }
895
896                        let buffer = self.input.consume(length);
897                        let end = self.input.current_position();
898
899                        if TokenKind::DocumentEnd == token_kind {
900                            self.mode = LexerMode::Script;
901                        }
902
903                        Some(Ok(self.token(token_kind, buffer, start, end)))
904                    }
905                    Interpolation::Until(offset) => {
906                        self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None), false)
907                    }
908                    Interpolation::BraceUntil(offset) => {
909                        self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None), true)
910                    }
911                },
912                DocumentKind::Nowdoc => {
913                    let start = self.input.current_position();
914
915                    let mut length = 0;
916                    let mut terminated = false;
917                    let mut only_whitespaces = true;
918
919                    loop {
920                        match self.input.peek(length, 2) {
921                            [b'\r', b'\n'] => {
922                                length += 2;
923
924                                break;
925                            }
926                            [b'\n' | b'\r', ..] => {
927                                length += 1;
928
929                                break;
930                            }
931                            [byte, ..] if byte.is_ascii_whitespace() => {
932                                length += 1;
933                            }
934                            [_, ..] => {
935                                if only_whitespaces
936                                    && self.input.peek(length, label.len()) == label
937                                    && self
938                                        .input
939                                        .peek(length + label.len(), 1)
940                                        .first()
941                                        .is_none_or(|c| !is_part_of_identifier(c))
942                                {
943                                    length += label.len();
944                                    terminated = true;
945
946                                    break;
947                                }
948
949                                only_whitespaces = false;
950                                length += 1;
951                            }
952                            [] => {
953                                break;
954                            }
955                        }
956                    }
957
958                    let buffer = self.input.consume(length);
959                    let end = self.input.current_position();
960
961                    if terminated {
962                        self.mode = LexerMode::Script;
963
964                        return Some(Ok(self.token(TokenKind::DocumentEnd, buffer, start, end)));
965                    }
966
967                    Some(Ok(self.token(TokenKind::StringPart, buffer, start, end)))
968                }
969            },
970            LexerMode::Halt(stage) => 'halt: {
971                let start = self.input.current_position();
972                if let HaltStage::End = stage {
973                    let buffer = self.input.consume_remaining();
974                    let end = self.input.current_position();
975
976                    break 'halt Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
977                }
978
979                let whitespaces = self.input.consume_whitespaces();
980                if !whitespaces.is_empty() {
981                    let end = self.input.current_position();
982
983                    break 'halt Some(Ok(self.token(TokenKind::Whitespace, whitespaces, start, end)));
984                }
985
986                match &stage {
987                    HaltStage::LookingForLeftParenthesis => {
988                        if self.input.is_at(b"(", false) {
989                            let buffer = self.input.consume(1);
990                            let end = self.input.current_position();
991
992                            self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
993
994                            Some(Ok(self.token(TokenKind::LeftParenthesis, buffer, start, end)))
995                        } else {
996                            let byte = self.input.read(1)[0];
997                            let position = self.input.current_position();
998                            // Consume the unexpected byte to avoid infinite loops
999                            self.input.consume(1);
1000                            Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
1001                        }
1002                    }
1003                    HaltStage::LookingForRightParenthesis => {
1004                        if self.input.is_at(b")", false) {
1005                            let buffer = self.input.consume(1);
1006                            let end = self.input.current_position();
1007
1008                            self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
1009
1010                            Some(Ok(self.token(TokenKind::RightParenthesis, buffer, start, end)))
1011                        } else {
1012                            let byte = self.input.read(1)[0];
1013                            let position = self.input.current_position();
1014                            self.input.consume(1);
1015                            Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
1016                        }
1017                    }
1018                    HaltStage::LookingForTerminator => {
1019                        if self.input.is_at(b";", false) {
1020                            let buffer = self.input.consume(1);
1021                            let end = self.input.current_position();
1022
1023                            self.mode = LexerMode::Halt(HaltStage::End);
1024
1025                            Some(Ok(self.token(TokenKind::Semicolon, buffer, start, end)))
1026                        } else if self.input.is_at(b"?>", false) {
1027                            let buffer = self.input.consume(2);
1028                            let end = self.input.current_position();
1029
1030                            self.mode = LexerMode::Halt(HaltStage::End);
1031
1032                            Some(Ok(self.token(TokenKind::CloseTag, buffer, start, end)))
1033                        } else {
1034                            let byte = self.input.read(1)[0];
1035                            let position = self.input.current_position();
1036                            self.input.consume(1);
1037                            Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
1038                        }
1039                    }
1040                    HaltStage::End => {
                        // `HaltStage::End` is already handled by the early `break 'halt` at the top of
                        // this branch, so this arm is not expected to run; yielding `None` keeps the
                        // match exhaustive without relying on `unreachable!`.
1044                        None
1045                    }
1046                }
1047            }
1048        }
1049    }
1050
1051    /// Fast path for scanning identifiers and keywords.
1052    /// Called when we know the first byte is an identifier start character.
1053    /// Returns (TokenKind, length) to allow proper mode switching.
1054    #[inline]
1055    fn scan_identifier_or_keyword_info(&self) -> (TokenKind, usize) {
1056        let (mut length, ended_with_slash) = self.input.scan_identifier(0);
1057
1058        if !ended_with_slash {
1059            match length {
1060                6 if self.input.is_at(b"public(set)", true) => {
1061                    return (TokenKind::PublicSet, 11);
1062                }
1063                7 if self.input.is_at(b"private(set)", true) => {
1064                    return (TokenKind::PrivateSet, 12);
1065                }
1066                9 if self.input.is_at(b"protected(set)", true) => {
1067                    return (TokenKind::ProtectedSet, 14);
1068                }
1069                _ => {}
1070            }
1071        }
1072
1073        if !ended_with_slash && let Some(kind) = internal::keyword::lookup_keyword(self.input.read(length)) {
1074            return (kind, length);
1075        }
1076
1077        let mut slashes = 0;
1078        let mut last_was_slash = false;
1079        loop {
1080            match self.input.peek(length, 1) {
1081                [b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF] if last_was_slash => {
1082                    length += 1;
1083                    last_was_slash = false;
1084                }
1085                [b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | 0x80..=0xFF] if !last_was_slash => {
1086                    length += 1;
1087                }
1088                [b'\\'] if !self.interpolating || self.brace_interpolating => {
1089                    if last_was_slash {
1090                        length -= 1;
1091                        slashes -= 1;
1092                        last_was_slash = false;
1093                        break;
1094                    }
1095
1096                    length += 1;
1097                    slashes += 1;
1098                    last_was_slash = true;
1099                }
1100                _ => {
1101                    break;
1102                }
1103            }
1104        }
1105
1106        if last_was_slash {
1107            length -= 1;
1108            slashes -= 1;
1109        }
1110
1111        let kind = if slashes > 0 { TokenKind::QualifiedIdentifier } else { TokenKind::Identifier };
1112
1113        (kind, length)
1114    }
1115
1116    #[inline]
1117    fn token(&self, kind: TokenKind, value: &'input [u8], start: Position, _end: Position) -> Token<'input> {
1118        Token { kind, start, value }
1119    }
1120
1121    #[inline]
1122    fn interpolation(
1123        &mut self,
1124        end_offset: u32,
1125        post_interpolation_mode: LexerMode<'input>,
1126        brace: bool,
1127    ) -> Option<Result<Token<'input>, SyntaxError>> {
1128        self.mode = LexerMode::Script;
1129
1130        let was_interpolating = self.interpolating;
1131        self.interpolating = true;
1132        let was_brace_interpolating = self.brace_interpolating;
1133        // For brace interpolation ({$...}), allow qualified identifiers with backslashes.
1134        self.brace_interpolating = brace;
1135
1136        let pending_error = loop {
1137            match self.advance() {
1138                Some(Ok(token)) => {
1139                    let token_start = token.start.offset;
1140                    let token_end = token_start + token.value.len() as u32;
1141                    let is_final_token = token_start <= end_offset && end_offset <= token_end;
1142
1143                    self.buffer.push_back(token);
1144
1145                    if is_final_token {
1146                        break None;
1147                    }
1148                }
1149                Some(Err(error)) => break Some(error),
1150                None => break None,
1151            }
1152        };
1153
1154        self.mode = post_interpolation_mode;
1155        self.interpolating = was_interpolating;
1156        self.brace_interpolating = was_brace_interpolating;
1157
1158        if let Some(error) = pending_error {
1159            return Some(Err(error));
1160        }
1161
1162        self.advance()
1163    }
1164}
1165
/// Lets a lexer report the file it is reading by delegating to its input.
impl HasFileId for Lexer<'_> {
    /// Returns the file id reported by the lexer's underlying input.
    #[inline]
    fn file_id(&self) -> FileId {
        self.input.file_id()
    }
}
1172
1173#[inline]
1174fn matches_start_of_heredoc_document(input: &Input, prefix_len: usize) -> bool {
1175    let total = input.len();
1176    let base = input.current_offset();
1177
1178    // Start after the prefix (if any) and the fixed opener (3 bytes).
1179    let mut length = 3 + prefix_len;
1180    // Consume any following whitespace.
1181    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1182        length += 1;
1183    }
1184
1185    // The next byte must be a valid start-of-identifier.
1186    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1187        return false;
1188    }
1189    length += 1; // Include that identifier start.
1190
1191    // Now continue reading identifier characters until a newline is found.
1192    loop {
1193        let pos = base + length;
1194        if pos >= total {
1195            return false; // Unexpected EOF
1196        }
1197
1198        let byte = *input.read_at(pos);
1199        if byte == b'\n' {
1200            return true; // Newline found: valid heredoc opener.
1201        } else if byte == b'\r' {
1202            // Handle CRLF: treat '\r' followed by '\n' as a newline as well.
1203            return pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1204        } else if is_part_of_identifier(input.read_at(pos)) {
1205            length += 1;
1206        } else {
1207            return false; // Unexpected character.
1208        }
1209    }
1210}
1211
1212#[inline]
1213fn matches_start_of_double_quote_heredoc_document(input: &Input, prefix_len: usize) -> bool {
1214    let total = input.len();
1215    let base = input.current_offset();
1216
1217    // Start after the prefix (if any) and the fixed opener (3 bytes), then skip any whitespace.
1218    let mut length = 3 + prefix_len;
1219    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1220        length += 1;
1221    }
1222
1223    // Next, expect an opening double quote.
1224    if base + length >= total || *input.read_at(base + length) != b'"' {
1225        return false;
1226    }
1227    length += 1;
1228
1229    // The following byte must be a valid start-of-identifier.
1230    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1231        return false;
1232    }
1233    length += 1;
1234
1235    // Now scan the label. For double‑quoted heredoc, a terminating double quote is required.
1236    let mut terminated = false;
1237    loop {
1238        let pos = base + length;
1239        if pos >= total {
1240            return false;
1241        }
1242        let byte = input.read_at(pos);
1243        if *byte == b'\n' {
1244            // End of line: valid only if a closing double quote was encountered.
1245            return terminated;
1246        } else if *byte == b'\r' {
1247            // Handle CRLF sequences.
1248            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1249        } else if !terminated && is_part_of_identifier(byte) {
1250            length += 1;
1251        } else if !terminated && *byte == b'"' {
1252            terminated = true;
1253            length += 1;
1254        } else {
1255            return false;
1256        }
1257    }
1258}
1259
1260#[inline]
1261fn matches_start_of_nowdoc_document(input: &Input, prefix_len: usize) -> bool {
1262    let total = input.len();
1263    let base = input.current_offset();
1264
1265    // Start after the prefix (if any) and the fixed opener (3 bytes) and skip whitespace.
1266    let mut length = 3 + prefix_len;
1267    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1268        length += 1;
1269    }
1270
1271    // Now, the next byte must be a single quote.
1272    if base + length >= total || *input.read_at(base + length) != b'\'' {
1273        return false;
1274    }
1275    length += 1;
1276
1277    // The following byte must be a valid start-of-identifier.
1278    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1279        return false;
1280    }
1281    length += 1;
1282
1283    // Read the label until a newline. A terminating single quote is required.
1284    let mut terminated = false;
1285    loop {
1286        let pos = base + length;
1287        if pos >= total {
1288            return false;
1289        }
1290        let byte = *input.read_at(pos);
1291        if byte == b'\n' {
1292            return terminated;
1293        } else if byte == b'\r' {
1294            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1295        } else if !terminated && is_part_of_identifier(&byte) {
1296            length += 1;
1297        } else if !terminated && byte == b'\'' {
1298            terminated = true;
1299            length += 1;
1300        } else {
1301            return false;
1302        }
1303    }
1304}
1305
1306#[inline]
1307fn matches_literal_double_quote_string(input: &Input, prefix_len: usize) -> bool {
1308    let total = input.len();
1309    let base = input.current_offset();
1310
1311    // Start after the prefix (if any) and the initial double-quote.
1312    let mut pos = base + 1 + prefix_len;
1313    loop {
1314        if pos >= total {
1315            // Reached EOF: assume literal is complete.
1316            return true;
1317        }
1318        let byte = *input.read_at(pos);
1319        if byte == b'"' {
1320            // Encounter a closing double quote.
1321            return true;
1322        }
1323        if byte == b'\\' {
1324            // Skip an escape sequence: assume that the backslash and the escaped character form a pair.
1325            pos += 2;
1326            continue;
1327        }
1328
1329        // Check for variable interpolation or complex expression start:
1330        // If two-byte sequences match either "$" followed by a start-of-identifier or "{" and "$", then return false.
1331        if pos + 1 < total {
1332            let next = *input.read_at(pos + 1);
1333            if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1334                return false;
1335            }
1336        }
1337        pos += 1;
1338    }
1339}
1340
1341#[inline]
1342fn read_start_of_heredoc_document(input: &Input, double_quoted: bool, prefix_len: usize) -> (usize, usize, usize) {
1343    let total = input.len();
1344    let base = input.current_offset();
1345
1346    // Start reading after the prefix (if any) and the fixed opener (3 bytes).
1347    let mut pos = base + 3 + prefix_len;
1348    let mut whitespaces = 0;
1349    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1350        whitespaces += 1;
1351        pos += 1;
1352    }
1353
1354    // The label (or delimiter) starts after:
1355    //   prefix + 3 bytes + whitespace bytes + an extra offset:
1356    //      if double-quoted: 2 bytes (opening and closing quotes around the label)
1357    //      else: 1 byte.
1358    let mut length = 3 + prefix_len + whitespaces + if double_quoted { 2 } else { 1 };
1359
1360    let mut label_length = 1; // Start with at least one byte for the label.
1361    let mut terminated = false; // For double-quoted heredoc, to track the closing quote.
1362    loop {
1363        let pos = base + length;
1364        // Bail out gracefully if we run past the input or hit a byte that the caller's
1365        // earlier validation should have rejected: returning the accumulated `(length,
1366        // whitespaces, label_length)` lets the lexer produce a malformed-heredoc token
1367        // instead of panicking.
1368        if pos >= total {
1369            return (length, whitespaces, label_length);
1370        }
1371
1372        let byte = *input.read_at(pos);
1373        if byte == b'\n' {
1374            // Newline ends the label.
1375            length += 1;
1376            return (length, whitespaces, label_length);
1377        } else if byte == b'\r' {
1378            // Handle CRLF sequences
1379            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1380                length += 2;
1381            } else {
1382                length += 1;
1383            }
1384            return (length, whitespaces, label_length);
1385        } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1386            // For both unquoted and double-quoted (before the closing quote) heredoc,
1387            // a valid identifier character is part of the label.
1388            length += 1;
1389            label_length += 1;
1390        } else if double_quoted && !terminated && byte == b'"' {
1391            // In a double-quoted heredoc, a double quote terminates the label.
1392            length += 1;
1393            terminated = true;
1394        } else {
1395            // Malformed heredoc label: stop scanning and let the caller surface a parse error.
1396            return (length, whitespaces, label_length);
1397        }
1398    }
1399}
1400
1401#[inline]
1402fn read_start_of_nowdoc_document(input: &Input, prefix_len: usize) -> (usize, usize, usize) {
1403    let total = input.len();
1404    let base = input.current_offset();
1405
1406    let mut pos = base + 3 + prefix_len;
1407    let mut whitespaces = 0;
1408    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1409        whitespaces += 1;
1410        pos += 1;
1411    }
1412
1413    // For nowdoc, the fixed extra offset is always 2.
1414    let mut length = 3 + prefix_len + whitespaces + 2;
1415
1416    let mut label_length = 1;
1417    let mut terminated = false;
1418    loop {
1419        let pos = base + length;
1420        if pos >= total {
1421            // Bail out gracefully on truncated input; surfacing the accumulated state lets the
1422            // lexer report a parse error instead of panicking.
1423            return (length, whitespaces, label_length);
1424        }
1425        let byte = *input.read_at(pos);
1426
1427        if byte == b'\n' {
1428            // A newline indicates the end of the label.
1429            length += 1;
1430            return (length, whitespaces, label_length);
1431        } else if byte == b'\r' {
1432            // Handle CRLF sequences
1433            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1434                length += 2;
1435            } else {
1436                length += 1;
1437            }
1438            return (length, whitespaces, label_length);
1439        } else if is_part_of_identifier(&byte) && !terminated {
1440            // For nowdoc, identifier characters contribute to the label until terminated.
1441            length += 1;
1442            label_length += 1;
1443        } else if !terminated && byte == b'\'' {
1444            // A single quote terminates the nowdoc label.
1445            length += 1;
1446            terminated = true;
1447        } else {
1448            // Malformed nowdoc label: stop scanning and let the caller surface a parse error.
1449            return (length, whitespaces, label_length);
1450        }
1451    }
1452}
1453
1454#[inline]
1455fn read_literal_string(input: &Input, quote: u8, prefix_len: usize) -> (TokenKind, usize) {
1456    let total = input.len();
1457    let start = input.current_offset();
1458    let skip = prefix_len + 1; // prefix + opening quote
1459    let mut length = skip;
1460
1461    let bytes = input.peek(skip, total - start - skip);
1462    loop {
1463        let scan_start = length - skip;
1464        match memchr2(quote, b'\\', &bytes[scan_start..]) {
1465            Some(pos) => {
1466                let abs_pos = scan_start + pos;
1467                let byte = bytes[abs_pos];
1468
1469                if byte == b'\\' {
1470                    length = skip + abs_pos + 2;
1471                    if length > total - start {
1472                        return (TokenKind::PartialLiteralString, total - start);
1473                    }
1474                } else {
1475                    length = skip + abs_pos + 1; // +1 for the closing quote
1476                    return (TokenKind::LiteralString, length);
1477                }
1478            }
1479            None => {
1480                // No quote or backslash found - EOF
1481                return (TokenKind::PartialLiteralString, total - start);
1482            }
1483        }
1484    }
1485}
1486
1487#[inline]
1488fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> u32 {
1489    let total = input.len();
1490    let base = input.current_offset();
1491    // `offset` is relative to the current position.
1492    let mut offset = from;
1493
1494    loop {
1495        let abs = base + offset;
1496        if abs >= total {
1497            // End of input.
1498            break;
1499        }
1500
1501        // Pattern 1: If the current byte is part of an identifier, simply advance.
1502        if is_part_of_identifier(input.read_at(abs)) {
1503            offset += 1;
1504            continue;
1505        }
1506
1507        // Pattern 2: If the current byte is a '[' then we enter a bracketed interpolation.
1508        if *input.read_at(abs) == b'[' {
1509            offset += 1;
1510            let mut nesting = 0;
1511            loop {
1512                let abs_inner = base + offset;
1513                if abs_inner >= total {
1514                    break;
1515                }
1516                let b = input.read_at(abs_inner);
1517                if *b == b']' {
1518                    offset += 1;
1519                    if nesting == 0 {
1520                        break;
1521                    }
1522
1523                    nesting -= 1;
1524                } else if *b == b'[' {
1525                    offset += 1;
1526                    nesting += 1;
1527                } else if b.is_ascii_whitespace() {
1528                    // Do not include whitespace.
1529                    break;
1530                } else {
1531                    offset += 1;
1532                }
1533            }
1534            // When bracketed interpolation is processed, exit the loop.
1535            break;
1536        }
1537
1538        // Pattern 3: Check for "->" followed by a valid identifier start.
1539        if base + offset + 2 < total
1540            && *input.read_at(abs) == b'-'
1541            && *input.read_at(base + offset + 1) == b'>'
1542            && is_start_of_identifier(input.read_at(base + offset + 2))
1543        {
1544            offset += 3;
1545            // Consume any following identifier characters.
1546            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1547                offset += 1;
1548            }
1549            break;
1550        }
1551
1552        // Pattern 4: Check for "?->" followed by a valid identifier start.
1553        if base + offset + 3 < total
1554            && *input.read_at(abs) == b'?'
1555            && *input.read_at(base + offset + 1) == b'-'
1556            && *input.read_at(base + offset + 2) == b'>'
1557            && is_start_of_identifier(input.read_at(base + offset + 3))
1558        {
1559            offset += 4;
1560            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1561                offset += 1;
1562            }
1563            break;
1564        }
1565
1566        // None of the expected patterns matched: exit the loop.
1567        break;
1568    }
1569
1570    offset as u32
1571}
1572
1573/// Scan forward from a `{$`/`${` brace interpolation to the offset just past
1574/// its matching `}` (tracking `{`/`}` nesting), or to end-of-input if the
1575/// interpolation is never closed.
1576///
1577/// The expression inside `{$...}` is real PHP and may contain nested string
1578/// literals - e.g. `"{$a["key"]}"` - whose own `{`/`}` bytes must not be
1579/// counted as interpolation braces (consider `"{$a["}"]}"`). Nested `'`, `"`,
1580/// and `` ` `` strings are therefore skipped wholesale, honouring `\` escapes,
1581/// so a `}` inside a nested string neither closes the interpolation early nor
1582/// is miscounted.
1583#[inline]
1584fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> u32 {
1585    let total = input.len();
1586    let base = input.current_offset();
1587    let mut offset = from;
1588    let mut nesting = 0;
1589
1590    loop {
1591        let abs = base + offset;
1592        if abs >= total {
1593            break;
1594        }
1595        match *input.read_at(abs) {
1596            b'}' => {
1597                offset += 1;
1598                if nesting == 0 {
1599                    break;
1600                }
1601
1602                nesting -= 1;
1603            }
1604            b'{' => {
1605                offset += 1;
1606                nesting += 1;
1607            }
1608            quote @ (b'\'' | b'"' | b'`') => {
1609                offset += 1;
1610                loop {
1611                    let abs = base + offset;
1612                    if abs >= total {
1613                        break;
1614                    }
1615                    match *input.read_at(abs) {
1616                        b'\\' => offset += 2,
1617                        b if b == quote => {
1618                            offset += 1;
1619                            break;
1620                        }
1621                        _ => offset += 1,
1622                    }
1623                }
1624            }
1625            _ => {
1626                offset += 1;
1627            }
1628        }
1629    }
1630
1631    offset as u32
1632}
1633
1634/// Scan a multi-line comment using SIMD-accelerated search.
1635/// Returns Some(length) including the closing */, or None if unterminated.
1636#[inline]
1637fn scan_multi_line_comment(bytes: &[u8]) -> Option<usize> {
1638    // Use SIMD to find */ quickly
1639    memmem::find(bytes, b"*/").map(|pos| pos + 2)
1640}
1641
1642/// Scan a single-line comment using SIMD-accelerated search.
1643/// Returns the length of the comment body (not including the //).
1644/// Stops at newline or ?>.
1645#[inline]
1646fn scan_single_line_comment(bytes: &[u8]) -> usize {
1647    let mut pos = 0;
1648    while pos < bytes.len() {
1649        match memchr::memchr3(b'\n', b'\r', b'?', &bytes[pos..]) {
1650            Some(offset) => {
1651                let found_pos = pos + offset;
1652                match bytes[found_pos] {
1653                    b'\n' | b'\r' => return found_pos,
1654                    b'?' => {
1655                        // Check if it's ?>
1656                        if found_pos + 1 < bytes.len() && bytes[found_pos + 1] == b'>' {
1657                            // Also check for whitespace before ?>
1658                            if found_pos > 0 && bytes[found_pos - 1].is_ascii_whitespace() {
1659                                return found_pos - 1;
1660                            }
1661                            return found_pos;
1662                        }
1663                        // Not ?>, continue searching
1664                        pos = found_pos + 1;
1665                    }
1666                    // `memchr3` only matches the three bytes we asked for; any other value here would
1667                    // indicate a memchr bug. Treat it as end-of-comment so the lexer keeps making progress.
1668                    _ => return found_pos,
1669                }
1670            }
1671            None => return bytes.len(),
1672        }
1673    }
1674
1675    bytes.len()
1676}
mago_syntax/lexer/mod.rs

mago_syntax/lexer/
mod.rs