mago_syntax/lexer/
mod.rs

1use std::collections::VecDeque;
2use std::fmt::Debug;
3use std::hint::unreachable_unchecked;
4
5use memchr::memchr2;
6use memchr::memmem;
7
8/// Lookup table for single-character tokens that are ALWAYS single-char
9/// (i.e., they can never be part of a multi-character token).
10/// Maps byte -> Option<TokenKind>
11const SIMPLE_TOKEN_TABLE: [Option<TokenKind>; 256] = {
12    let mut table: [Option<TokenKind>; 256] = [None; 256];
13    table[b';' as usize] = Some(TokenKind::Semicolon);
14    table[b',' as usize] = Some(TokenKind::Comma);
15    table[b')' as usize] = Some(TokenKind::RightParenthesis);
16    table[b'[' as usize] = Some(TokenKind::LeftBracket);
17    table[b']' as usize] = Some(TokenKind::RightBracket);
18    table[b'{' as usize] = Some(TokenKind::LeftBrace);
19    table[b'}' as usize] = Some(TokenKind::RightBrace);
20    table[b'~' as usize] = Some(TokenKind::Tilde);
21    table[b'@' as usize] = Some(TokenKind::At);
22    table
23};
24
/// Byte-indexed flags telling whether a byte may begin an identifier.
///
/// An entry is `true` for ASCII letters (`a-z`, `A-Z`), the underscore, and every
/// byte in `0x80..=0xFF`; all other entries are `false`.
const IDENT_START_TABLE: [bool; 256] = {
    let mut table = [false; 256];
    let mut index = 0usize;
    while index < table.len() {
        let byte = index as u8;
        table[index] = byte.is_ascii_alphabetic() || byte == b'_' || byte >= 0x80;
        index += 1;
    }

    table
};
36
37use mago_database::file::FileId;
38use mago_database::file::HasFileId;
39use mago_span::Position;
40use mago_syntax_core::float_exponent;
41use mago_syntax_core::float_separator;
42use mago_syntax_core::input::Input;
43use mago_syntax_core::number_sign;
44use mago_syntax_core::start_of_binary_number;
45use mago_syntax_core::start_of_float_number;
46use mago_syntax_core::start_of_hexadecimal_number;
47use mago_syntax_core::start_of_identifier;
48use mago_syntax_core::start_of_number;
49use mago_syntax_core::start_of_octal_number;
50use mago_syntax_core::start_of_octal_or_float_number;
51use mago_syntax_core::utils::is_part_of_identifier;
52use mago_syntax_core::utils::is_start_of_identifier;
53use mago_syntax_core::utils::read_digits_of_base;
54
55use crate::error::SyntaxError;
56use crate::lexer::internal::mode::HaltStage;
57use crate::lexer::internal::mode::Interpolation;
58use crate::lexer::internal::mode::LexerMode;
59use crate::lexer::internal::utils::NumberKind;
60use crate::settings::LexerSettings;
61use crate::token::DocumentKind;
62use crate::token::Token;
63use crate::token::TokenKind;
64
65mod internal;
66
/// The `Lexer` struct is responsible for tokenizing input source code into discrete tokens
/// based on PHP language syntax. It is designed to work with PHP code from version 7.0 up to 8.4.
///
/// The lexer reads through the provided input and processes it accordingly.
///
/// It identifies PHP-specific tokens, including operators, keywords, comments, strings, and other syntax elements,
/// and produces a sequence of [`Token`] objects that are used in further stages of compilation or interpretation.
///
/// The lexer is designed to be used in a streaming fashion, where it reads the input source code in chunks
/// and produces tokens incrementally. This allows for efficient processing of large source files and
/// minimizes memory usage.
#[derive(Debug)]
pub struct Lexer<'input> {
    /// The source being tokenized; every read and every consumption goes through it.
    input: Input<'input>,
    /// Lexer configuration, e.g. whether short open tags (`<?`) are recognised.
    settings: LexerSettings,
    /// Current lexing mode; `advance` dispatches on it to decide how the next token is read.
    mode: LexerMode<'input>,
    /// While set, `advance` does not hand out buffered tokens, and `b`/`B` string
    /// prefixes are not recognised in script mode.
    interpolating: bool,
    // NOTE(review): presumably tracks brace-style (`{$...}`) interpolation; it is only
    // initialised in the visible part of this file — confirm against the string modes.
    brace_interpolating: bool,
    /// Buffer for tokens during string interpolation.
    buffer: VecDeque<Token<'input>>,
}
88
89impl<'input> Lexer<'input> {
    /// Initial capacity for the token buffer used during string interpolation.
    ///
    /// Both constructors pre-allocate the buffer with this many slots, so the buffer
    /// does not need to grow until more tokens than this are queued at once.
    const BUFFER_INITIAL_CAPACITY: usize = 8;
93
94    /// Creates a new `Lexer` instance.
95    ///
96    /// # Parameters
97    ///
98    /// - `input`: The input source code to tokenize.
99    /// - `settings`: The lexer settings.
100    ///
101    /// # Returns
102    ///
103    /// A new `Lexer` instance that reads from the provided byte slice.
104    pub fn new(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
105        Lexer {
106            input,
107            settings,
108            mode: LexerMode::Inline,
109            interpolating: false,
110            brace_interpolating: false,
111            buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
112        }
113    }
114
115    /// Creates a new `Lexer` instance for parsing a script block.
116    ///
117    /// # Parameters
118    ///
119    /// - `input`: The input source code to tokenize.
120    /// - `settings`: The lexer settings.
121    ///
122    /// # Returns
123    ///
124    /// A new `Lexer` instance that reads from the provided byte slice.
125    pub fn scripting(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
126        Lexer {
127            input,
128            settings,
129            mode: LexerMode::Script,
130            interpolating: false,
131            brace_interpolating: false,
132            buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
133        }
134    }
135
136    /// Check if the lexer has reached the end of the input.
137    ///
138    /// If this method returns `true`, the lexer will not produce any more tokens.
139    #[must_use]
140    pub fn has_reached_eof(&self) -> bool {
141        self.input.has_reached_eof()
142    }
143
144    /// Get the current position of the lexer in the input source code.
145    #[inline]
146    pub const fn current_position(&self) -> Position {
147        self.input.current_position()
148    }
149
150    /// Tokenizes the next input from the source code.
151    ///
152    /// This method reads from the input and produces the next [`Token`] based on the current [`LexerMode`].
153    /// It handles various lexical elements such as inline text, script code, strings with interpolation,
154    /// comments, and different PHP-specific constructs.
155    ///
156    /// # Returns
157    ///
158    /// - `Some(Ok(Token))` if a token was successfully parsed.
159    /// - `Some(Err(SyntaxError))` if a syntax error occurred while parsing the next token.
160    /// - `None` if the end of the input has been reached.
161    ///
162    /// # Notes
163    ///
164    /// - It efficiently handles tokenization by consuming input based on patterns specific to PHP syntax.
165    /// - The lexer supports complex features like string interpolation and different numeric formats.
166    ///
167    /// # Errors
168    ///
169    /// Returns `Some(Err(SyntaxError))` in cases such as:
170    ///
171    /// - Unrecognized tokens that do not match any known PHP syntax.
172    /// - Unexpected tokens in a given context, such as an unexpected end of string.
173    ///
174    /// # Panics
175    ///
176    /// This method should not panic under normal operation. If it does, it indicates a bug in the lexer implementation.
177    ///
178    /// # See Also
179    ///
180    /// - [`Token`]: Represents a lexical token with its kind, value, and span.
181    /// - [`SyntaxError`]: Represents errors that can occur during lexing.
182    #[inline]
183    pub fn advance(&mut self) -> Option<Result<Token<'input>, SyntaxError>> {
184        // Check if there are buffered tokens from string interpolation.
185        if !self.interpolating
186            && let Some(token) = self.buffer.pop_front()
187        {
188            return Some(Ok(token));
189        }
190
191        if self.input.has_reached_eof() {
192            return None;
193        }
194
195        match self.mode {
196            LexerMode::Inline => {
197                let start = self.input.current_position();
198                let offset = self.input.current_offset();
199
200                // Shebang is only valid at the absolute start of the file (offset 0).
201                if offset == 0
202                    && self.input.len() >= 2
203                    && unsafe { *self.input.read_at_unchecked(0) } == b'#'
204                    && unsafe { *self.input.read_at_unchecked(1) } == b'!'
205                {
206                    let buffer = self.input.consume_through(b'\n');
207                    let end = self.input.current_position();
208
209                    return Some(Ok(self.token(TokenKind::InlineShebang, buffer, start, end)));
210                }
211
212                // Get the remaining bytes to scan.
213                let bytes = self.input.read_remaining();
214
215                if self.settings.enable_short_tags {
216                    if let Some(pos) = memchr::memmem::find(bytes, b"<?") {
217                        if pos > 0 {
218                            let buffer = self.input.consume(pos);
219                            let end = self.input.current_position();
220
221                            return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
222                        }
223
224                        if self.input.is_at(b"<?php", true) {
225                            let buffer = self.input.consume(5);
226                            self.mode = LexerMode::Script;
227                            return Some(Ok(self.token(
228                                TokenKind::OpenTag,
229                                buffer,
230                                start,
231                                self.input.current_position(),
232                            )));
233                        }
234
235                        if self.input.is_at(b"<?=", false) {
236                            let buffer = self.input.consume(3);
237                            self.mode = LexerMode::Script;
238                            return Some(Ok(self.token(
239                                TokenKind::EchoTag,
240                                buffer,
241                                start,
242                                self.input.current_position(),
243                            )));
244                        }
245
246                        let buffer = self.input.consume(2);
247                        self.mode = LexerMode::Script;
248                        return Some(Ok(self.token(
249                            TokenKind::ShortOpenTag,
250                            buffer,
251                            start,
252                            self.input.current_position(),
253                        )));
254                    }
255                } else {
256                    let iter = memchr::memmem::find_iter(bytes, b"<?");
257
258                    for pos in iter {
259                        // SAFETY: `pos` is guaranteed to be within `bytes` by `find_iter`.
260                        let candidate = unsafe { bytes.get_unchecked(pos..) };
261
262                        if candidate.len() >= 5
263                            && (unsafe { *candidate.get_unchecked(2) } | 0x20) == b'p'
264                            && (unsafe { *candidate.get_unchecked(3) } | 0x20) == b'h'
265                            && (unsafe { *candidate.get_unchecked(4) } | 0x20) == b'p'
266                        {
267                            if pos > 0 {
268                                let buffer = self.input.consume(pos);
269                                let end = self.input.current_position();
270                                return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
271                            }
272
273                            let buffer = self.input.consume(5);
274                            self.mode = LexerMode::Script;
275                            return Some(Ok(self.token(
276                                TokenKind::OpenTag,
277                                buffer,
278                                start,
279                                self.input.current_position(),
280                            )));
281                        }
282
283                        if candidate.len() >= 3 && unsafe { *candidate.get_unchecked(2) } == b'=' {
284                            if pos > 0 {
285                                let buffer = self.input.consume(pos);
286                                let end = self.input.current_position();
287                                return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
288                            }
289
290                            let buffer = self.input.consume(3);
291                            self.mode = LexerMode::Script;
292                            return Some(Ok(self.token(
293                                TokenKind::EchoTag,
294                                buffer,
295                                start,
296                                self.input.current_position(),
297                            )));
298                        }
299                    }
300                }
301
302                if self.input.has_reached_eof() {
303                    return None;
304                }
305
306                let buffer = self.input.consume_remaining();
307                let end = self.input.current_position();
308                Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)))
309            }
310            LexerMode::Script => {
311                let start = self.input.current_position();
312                let whitespaces = self.input.consume_whitespaces();
313                if !whitespaces.is_empty() {
314                    return Some(Ok(self.token(
315                        TokenKind::Whitespace,
316                        whitespaces,
317                        start,
318                        self.input.current_position(),
319                    )));
320                }
321
322                let first_byte = match self.input.read(1).first() {
323                    Some(&b) => b,
324                    None => {
325                        // SAFETY: we check for EOF before entering scripting section,
326                        unsafe { unreachable_unchecked() }
327                    }
328                };
329
330                if let Some(kind) = SIMPLE_TOKEN_TABLE[first_byte as usize] {
331                    let buffer = self.input.consume(1);
332                    let end = self.input.current_position();
333                    return Some(Ok(self.token(kind, buffer, start, end)));
334                }
335
336                if IDENT_START_TABLE[first_byte as usize] {
337                    let is_binary_string_prefix = !self.interpolating
338                        && matches!(first_byte, b'b' | b'B')
339                        && matches!(self.input.read(4), [_, b'\'' | b'"', ..] | [_, b'<', b'<', b'<']);
340
341                    if !is_binary_string_prefix {
342                        let (token_kind, len) = self.scan_identifier_or_keyword_info();
343
344                        if token_kind == TokenKind::HaltCompiler {
345                            self.mode = LexerMode::Halt(HaltStage::LookingForLeftParenthesis);
346                        }
347
348                        let buffer = self.input.consume(len);
349                        let end = self.input.current_position();
350                        return Some(Ok(self.token(token_kind, buffer, start, end)));
351                    }
352
353                    // Fall through to handle b-prefix strings in the match block below
354                }
355
356                if first_byte == b'$'
357                    && let Some(&next) = self.input.read(2).get(1)
358                    && IDENT_START_TABLE[next as usize]
359                {
360                    let (ident_len, _) = self.input.scan_identifier(1);
361                    let buffer = self.input.consume(1 + ident_len);
362                    let end = self.input.current_position();
363                    return Some(Ok(self.token(TokenKind::Variable, buffer, start, end)));
364                }
365
366                let mut document_label: &[u8] = &[];
367
368                let (token_kind, len) = match self.input.read(3) {
369                    [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
370                    [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
371                    [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
372                    [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
373                    [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
374                    [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
375                    [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
376                    [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
377                    [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
378                    [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input, 0) => {
379                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false, 0);
380
381                        document_label = self.input.peek(3 + whitespaces, label_length);
382
383                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
384                    }
385                    [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input, 0) => {
386                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true, 0);
387
388                        document_label = self.input.peek(4 + whitespaces, label_length);
389
390                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
391                    }
392                    [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input, 0) => {
393                        let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input, 0);
394
395                        document_label = self.input.peek(4 + whitespaces, label_length);
396
397                        (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
398                    }
399                    [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
400                    [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
401                    [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
402                    [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
403                    [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
404                    [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
405                    [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
406                    [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
407                    [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
408                    [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
409                    [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
410                    [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
411                    [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
412                    [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
413                    [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
414                    [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
415                    [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
416                    [b'>', b'>', ..] => (TokenKind::RightShift, 2),
417                    [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
418                    [b':', b':', ..] => (TokenKind::ColonColon, 2),
419                    [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
420                    [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
421                    [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
422                    [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
423                    [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
424                    [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
425                    [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
426                    [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
427                    [b'/', b'/', ..] => {
428                        let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
429                        let comment_len = scan_single_line_comment(remaining);
430                        (TokenKind::SingleLineComment, 2 + comment_len)
431                    }
432                    [b'/', b'*', asterisk] => {
433                        let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
434                        match scan_multi_line_comment(remaining) {
435                            Some(len) => {
436                                let is_docblock = asterisk == &b'*' && len > 2;
437                                if is_docblock {
438                                    (TokenKind::DocBlockComment, len + 2)
439                                } else {
440                                    (TokenKind::MultiLineComment, len + 2)
441                                }
442                            }
443                            None => {
444                                self.input.consume(remaining.len() + 2);
445                                return Some(Err(SyntaxError::UnexpectedEndOfFile(
446                                    self.file_id(),
447                                    self.input.current_position(),
448                                )));
449                            }
450                        }
451                    }
452                    [b'\\', start_of_identifier!(), ..] => {
453                        let mut length = 1;
454                        loop {
455                            let (ident_len, ends_with_ns) = self.input.scan_identifier(length);
456                            length += ident_len;
457                            if ends_with_ns {
458                                length += 1; // Include the backslash
459                            } else {
460                                break;
461                            }
462                        }
463
464                        (TokenKind::FullyQualifiedIdentifier, length)
465                    }
466                    [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
467                    [b'$', ..] => (TokenKind::Dollar, 1),
468                    [b'!', ..] => (TokenKind::Bang, 1),
469                    [b'&', ..] => (TokenKind::Ampersand, 1),
470                    [b'?', ..] => (TokenKind::Question, 1),
471                    [b'=', ..] => (TokenKind::Equal, 1),
472                    [b'`', ..] => (TokenKind::Backtick, 1),
473                    [b'+', ..] => (TokenKind::Plus, 1),
474                    [b'%', ..] => (TokenKind::Percent, 1),
475                    [b'-', ..] => (TokenKind::Minus, 1),
476                    [b'<', ..] => (TokenKind::LessThan, 1),
477                    [b'>', ..] => (TokenKind::GreaterThan, 1),
478                    [b':', ..] => (TokenKind::Colon, 1),
479                    [b'|', ..] => (TokenKind::Pipe, 1),
480                    [b'^', ..] => (TokenKind::Caret, 1),
481                    [b'*', ..] => (TokenKind::Asterisk, 1),
482                    [b'/', ..] => (TokenKind::Slash, 1),
483                    [b'b' | b'B', b'\'', ..] => read_literal_string(&self.input, b'\'', 1),
484                    [b'b' | b'B', b'"', ..] if matches_literal_double_quote_string(&self.input, 1) => {
485                        read_literal_string(&self.input, b'"', 1)
486                    }
487                    [b'b' | b'B', b'"', ..] => (TokenKind::DoubleQuote, 2),
488                    [b'b' | b'B', b'<', b'<']
489                        if self.input.read(4).len() == 4
490                            && self.input.read(4)[3] == b'<'
491                            && matches_start_of_heredoc_document(&self.input, 1) =>
492                    {
493                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false, 1);
494
495                        document_label = self.input.peek(4 + whitespaces, label_length);
496
497                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
498                    }
499                    [b'b' | b'B', b'<', b'<']
500                        if self.input.read(4).len() == 4
501                            && self.input.read(4)[3] == b'<'
502                            && matches_start_of_double_quote_heredoc_document(&self.input, 1) =>
503                    {
504                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true, 1);
505
506                        document_label = self.input.peek(5 + whitespaces, label_length);
507
508                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
509                    }
510                    [b'b' | b'B', b'<', b'<']
511                        if self.input.read(4).len() == 4
512                            && self.input.read(4)[3] == b'<'
513                            && matches_start_of_nowdoc_document(&self.input, 1) =>
514                    {
515                        let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input, 1);
516
517                        document_label = self.input.peek(5 + whitespaces, label_length);
518
519                        (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
520                    }
521                    // Regular string literals
522                    [quote @ b'\'', ..] => read_literal_string(&self.input, *quote, 0),
523                    [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input, 0) => {
524                        read_literal_string(&self.input, *quote, 0)
525                    }
526                    [b'"', ..] => (TokenKind::DoubleQuote, 1),
527                    [b'(', ..] => 'parenthesis: {
528                        let mut peek_offset = 1;
529                        while let Some(&b) = self.input.read(peek_offset + 1).get(peek_offset) {
530                            if b.is_ascii_whitespace() {
531                                peek_offset += 1;
532                            } else {
533                                // Check if this byte could start a cast type (case-insensitive)
534                                let lower = b | 0x20; // ASCII lowercase
535                                if !matches!(lower, b'i' | b'b' | b'f' | b'd' | b'r' | b's' | b'a' | b'o' | b'u' | b'v')
536                                {
537                                    break 'parenthesis (TokenKind::LeftParenthesis, 1);
538                                }
539                                break;
540                            }
541                        }
542
543                        for (value, kind) in internal::consts::CAST_TYPES {
544                            if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
545                                break 'parenthesis (kind, length);
546                            }
547                        }
548
549                        (TokenKind::LeftParenthesis, 1)
550                    }
551                    [b'#', ..] => {
552                        let remaining = self.input.peek(1, self.input.len() - self.input.current_offset());
553                        let comment_len = scan_single_line_comment(remaining);
554                        (TokenKind::HashComment, 1 + comment_len)
555                    }
556                    [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
557                    [b'.', start_of_number!(), ..] => {
558                        let mut length = read_digits_of_base(&self.input, 2, 10);
559                        if let float_exponent!() = self.input.peek(length, 1) {
560                            let mut exp_length = length + 1;
561                            if let number_sign!() = self.input.peek(exp_length, 1) {
562                                exp_length += 1;
563                            }
564
565                            let after_exp = read_digits_of_base(&self.input, exp_length, 10);
566                            if after_exp > exp_length {
567                                length = after_exp;
568                            }
569                        }
570
571                        (TokenKind::LiteralFloat, length)
572                    }
573                    [start_of_number!(), ..] => 'number: {
574                        let mut length = 1;
575
576                        let (base, kind): (u8, NumberKind) = match self.input.read(3) {
577                            start_of_binary_number!() => {
578                                length += 1;
579
580                                (2, NumberKind::Integer)
581                            }
582                            start_of_octal_number!() => {
583                                length += 1;
584
585                                (8, NumberKind::Integer)
586                            }
587                            start_of_hexadecimal_number!() => {
588                                length += 1;
589
590                                (16, NumberKind::Integer)
591                            }
592                            start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
593                            start_of_float_number!() => (10, NumberKind::Float),
594                            _ => (10, NumberKind::IntegerOrFloat),
595                        };
596
597                        if kind != NumberKind::Float {
598                            length = read_digits_of_base(&self.input, length, base);
599
600                            if kind == NumberKind::Integer {
601                                break 'number (TokenKind::LiteralInteger, length);
602                            }
603                        }
604
605                        let is_float = matches!(self.input.peek(length, 3), float_separator!());
606
607                        if !is_float {
608                            if kind == NumberKind::OctalOrFloat
609                                && let Some(invalid_idx) =
610                                    (1..length).find(|&i| matches!(self.input.peek(i, 1), [b'8' | b'9']))
611                            {
612                                let invalid_byte = self.input.peek(invalid_idx, 1)[0];
613                                let start = self.input.current_position();
614                                let invalid_position = Position { offset: start.offset + invalid_idx as u32 };
615                                self.input.consume(length);
616                                return Some(Err(SyntaxError::UnexpectedToken(
617                                    self.file_id(),
618                                    invalid_byte,
619                                    invalid_position,
620                                )));
621                            }
622                            break 'number (TokenKind::LiteralInteger, length);
623                        }
624
625                        if let [b'.'] = self.input.peek(length, 1) {
626                            length += 1;
627                            length = read_digits_of_base(&self.input, length, 10);
628                        }
629
630                        if let float_exponent!() = self.input.peek(length, 1) {
631                            // Only include exponent if there are digits after it
632                            let mut exp_length = length + 1;
633                            if let number_sign!() = self.input.peek(exp_length, 1) {
634                                exp_length += 1;
635                            }
636                            let after_exp = read_digits_of_base(&self.input, exp_length, 10);
637                            if after_exp > exp_length {
638                                // There are digits after the exponent marker
639                                length = after_exp;
640                            }
641                        }
642
643                        (TokenKind::LiteralFloat, length)
644                    }
645                    [b'.', ..] => (TokenKind::Dot, 1),
646                    [unknown_byte, ..] => {
647                        let position = self.input.current_position();
648                        self.input.consume(1);
649
650                        return Some(Err(SyntaxError::UnrecognizedToken(self.file_id(), *unknown_byte, position)));
651                    }
652                    [] => {
653                        // we check for EOF before entering scripting section,
654                        // so this should be unreachable.
655                        unreachable!()
656                    }
657                };
658
659                self.mode = match token_kind {
660                    TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
661                    TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
662                    TokenKind::CloseTag => LexerMode::Inline,
663                    TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
664                    TokenKind::DocumentStart(document_kind) => {
665                        LexerMode::DocumentString(document_kind, document_label, Interpolation::None)
666                    }
667                    _ => LexerMode::Script,
668                };
669
670                let buffer = self.input.consume(len);
671                let end = self.input.current_position();
672
673                Some(Ok(self.token(token_kind, buffer, start, end)))
674            }
675            LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
676                Interpolation::None => {
677                    let start = self.input.current_position();
678
679                    let mut length = 0;
680                    let mut last_was_slash = false;
681                    let mut token_kind = TokenKind::StringPart;
682                    loop {
683                        match self.input.peek(length, 2) {
684                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
685                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
686
687                                self.mode =
688                                    LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
689
690                                break;
691                            }
692                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
693                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
694
695                                self.mode = LexerMode::DoubleQuoteString(Interpolation::BraceUntil(
696                                    start.offset + until_offset,
697                                ));
698
699                                break;
700                            }
701                            [b'\\', ..] => {
702                                length += 1;
703
704                                last_was_slash = !last_was_slash;
705                            }
706                            [b'"', ..] if !last_was_slash => {
707                                if length == 0 {
708                                    length += 1;
709                                    token_kind = TokenKind::DoubleQuote;
710
711                                    break;
712                                }
713
714                                break;
715                            }
716                            [_, ..] => {
717                                length += 1;
718                                last_was_slash = false;
719                            }
720                            [] => {
721                                break;
722                            }
723                        }
724                    }
725
726                    let buffer = self.input.consume(length);
727                    let end = self.input.current_position();
728
729                    if TokenKind::DoubleQuote == token_kind {
730                        self.mode = LexerMode::Script;
731                    }
732
733                    Some(Ok(self.token(token_kind, buffer, start, end)))
734                }
735                Interpolation::Until(offset) => {
736                    self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None), false)
737                }
738                Interpolation::BraceUntil(offset) => {
739                    self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None), true)
740                }
741            },
742            LexerMode::ShellExecuteString(interpolation) => match &interpolation {
743                Interpolation::None => {
744                    let start = self.input.current_position();
745
746                    let mut length = 0;
747                    let mut last_was_slash = false;
748                    let mut token_kind = TokenKind::StringPart;
749                    loop {
750                        match self.input.peek(length, 2) {
751                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
752                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
753
754                                self.mode =
755                                    LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
756
757                                break;
758                            }
759                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
760                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
761
762                                self.mode = LexerMode::ShellExecuteString(Interpolation::BraceUntil(
763                                    start.offset + until_offset,
764                                ));
765
766                                break;
767                            }
768                            [b'\\', ..] => {
769                                length += 1;
770                                last_was_slash = !last_was_slash;
771                            }
772                            [b'`', ..] if !last_was_slash => {
773                                if length == 0 {
774                                    length += 1;
775                                    token_kind = TokenKind::Backtick;
776
777                                    break;
778                                }
779
780                                break;
781                            }
782                            [_, ..] => {
783                                length += 1;
784                                last_was_slash = false;
785                            }
786                            [] => {
787                                break;
788                            }
789                        }
790                    }
791
792                    let buffer = self.input.consume(length);
793                    let end = self.input.current_position();
794
795                    if TokenKind::Backtick == token_kind {
796                        self.mode = LexerMode::Script;
797                    }
798
799                    Some(Ok(self.token(token_kind, buffer, start, end)))
800                }
801                Interpolation::Until(offset) => {
802                    self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None), false)
803                }
804                Interpolation::BraceUntil(offset) => {
805                    self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None), true)
806                }
807            },
808            LexerMode::DocumentString(kind, label, interpolation) => match &kind {
809                DocumentKind::Heredoc => match &interpolation {
810                    Interpolation::None => {
811                        let start = self.input.current_position();
812
813                        let mut length = 0;
814                        let mut last_was_slash = false;
815                        let mut only_whitespaces = true;
816                        let mut token_kind = TokenKind::StringPart;
817                        loop {
818                            match self.input.peek(length, 2) {
819                                [b'\r', b'\n'] => {
820                                    length += 2;
821
822                                    break;
823                                }
824                                [b'\n' | b'\r', ..] => {
825                                    length += 1;
826
827                                    break;
828                                }
829                                [byte, ..] if byte.is_ascii_whitespace() => {
830                                    length += 1;
831                                }
832                                [b'$', start_of_identifier!(), ..] if !last_was_slash => {
833                                    let until_offset =
834                                        read_until_end_of_variable_interpolation(&self.input, length + 2);
835
836                                    self.mode = LexerMode::DocumentString(
837                                        kind,
838                                        label,
839                                        Interpolation::Until(start.offset + until_offset),
840                                    );
841
842                                    break;
843                                }
844                                [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
845                                    let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
846
847                                    self.mode = LexerMode::DocumentString(
848                                        kind,
849                                        label,
850                                        Interpolation::BraceUntil(start.offset + until_offset),
851                                    );
852
853                                    break;
854                                }
855                                [b'\\', ..] => {
856                                    length += 1;
857                                    last_was_slash = !last_was_slash;
858                                    only_whitespaces = false;
859                                }
860                                [_, ..] => {
861                                    if only_whitespaces
862                                        && self.input.peek(length, label.len()) == label
863                                        && self
864                                            .input
865                                            .peek(length + label.len(), 1)
866                                            .first()
867                                            .is_none_or(|c| !is_part_of_identifier(c))
868                                    {
869                                        length += label.len();
870                                        token_kind = TokenKind::DocumentEnd;
871
872                                        break;
873                                    }
874
875                                    length += 1;
876                                    last_was_slash = false;
877                                    only_whitespaces = false;
878                                }
879                                [] => {
880                                    break;
881                                }
882                            }
883                        }
884
885                        let buffer = self.input.consume(length);
886                        let end = self.input.current_position();
887
888                        if TokenKind::DocumentEnd == token_kind {
889                            self.mode = LexerMode::Script;
890                        }
891
892                        Some(Ok(self.token(token_kind, buffer, start, end)))
893                    }
894                    Interpolation::Until(offset) => {
895                        self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None), false)
896                    }
897                    Interpolation::BraceUntil(offset) => {
898                        self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None), true)
899                    }
900                },
901                DocumentKind::Nowdoc => {
902                    let start = self.input.current_position();
903
904                    let mut length = 0;
905                    let mut terminated = false;
906                    let mut only_whitespaces = true;
907
908                    loop {
909                        match self.input.peek(length, 2) {
910                            [b'\r', b'\n'] => {
911                                length += 2;
912
913                                break;
914                            }
915                            [b'\n' | b'\r', ..] => {
916                                length += 1;
917
918                                break;
919                            }
920                            [byte, ..] if byte.is_ascii_whitespace() => {
921                                length += 1;
922                            }
923                            [_, ..] => {
924                                if only_whitespaces
925                                    && self.input.peek(length, label.len()) == label
926                                    && self
927                                        .input
928                                        .peek(length + label.len(), 1)
929                                        .first()
930                                        .is_none_or(|c| !is_part_of_identifier(c))
931                                {
932                                    length += label.len();
933                                    terminated = true;
934
935                                    break;
936                                }
937
938                                only_whitespaces = false;
939                                length += 1;
940                            }
941                            [] => {
942                                break;
943                            }
944                        }
945                    }
946
947                    let buffer = self.input.consume(length);
948                    let end = self.input.current_position();
949
950                    if terminated {
951                        self.mode = LexerMode::Script;
952
953                        return Some(Ok(self.token(TokenKind::DocumentEnd, buffer, start, end)));
954                    }
955
956                    Some(Ok(self.token(TokenKind::StringPart, buffer, start, end)))
957                }
958            },
959            LexerMode::Halt(stage) => 'halt: {
960                let start = self.input.current_position();
961                if let HaltStage::End = stage {
962                    let buffer = self.input.consume_remaining();
963                    let end = self.input.current_position();
964
965                    break 'halt Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
966                }
967
968                let whitespaces = self.input.consume_whitespaces();
969                if !whitespaces.is_empty() {
970                    let end = self.input.current_position();
971
972                    break 'halt Some(Ok(self.token(TokenKind::Whitespace, whitespaces, start, end)));
973                }
974
975                match &stage {
976                    HaltStage::LookingForLeftParenthesis => {
977                        if self.input.is_at(b"(", false) {
978                            let buffer = self.input.consume(1);
979                            let end = self.input.current_position();
980
981                            self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
982
983                            Some(Ok(self.token(TokenKind::LeftParenthesis, buffer, start, end)))
984                        } else {
985                            let byte = self.input.read(1)[0];
986                            let position = self.input.current_position();
987                            // Consume the unexpected byte to avoid infinite loops
988                            self.input.consume(1);
989                            Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
990                        }
991                    }
992                    HaltStage::LookingForRightParenthesis => {
993                        if self.input.is_at(b")", false) {
994                            let buffer = self.input.consume(1);
995                            let end = self.input.current_position();
996
997                            self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
998
999                            Some(Ok(self.token(TokenKind::RightParenthesis, buffer, start, end)))
1000                        } else {
1001                            let byte = self.input.read(1)[0];
1002                            let position = self.input.current_position();
1003                            self.input.consume(1);
1004                            Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
1005                        }
1006                    }
1007                    HaltStage::LookingForTerminator => {
1008                        if self.input.is_at(b";", false) {
1009                            let buffer = self.input.consume(1);
1010                            let end = self.input.current_position();
1011
1012                            self.mode = LexerMode::Halt(HaltStage::End);
1013
1014                            Some(Ok(self.token(TokenKind::Semicolon, buffer, start, end)))
1015                        } else if self.input.is_at(b"?>", false) {
1016                            let buffer = self.input.consume(2);
1017                            let end = self.input.current_position();
1018
1019                            self.mode = LexerMode::Halt(HaltStage::End);
1020
1021                            Some(Ok(self.token(TokenKind::CloseTag, buffer, start, end)))
1022                        } else {
1023                            let byte = self.input.read(1)[0];
1024                            let position = self.input.current_position();
1025                            self.input.consume(1);
1026                            Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
1027                        }
1028                    }
1029                    _ => unreachable!(),
1030                }
1031            }
1032        }
1033    }
1034
1035    /// Fast path for scanning identifiers and keywords.
1036    /// Called when we know the first byte is an identifier start character.
1037    /// Returns (TokenKind, length) to allow proper mode switching.
1038    #[inline]
1039    fn scan_identifier_or_keyword_info(&self) -> (TokenKind, usize) {
1040        let (mut length, ended_with_slash) = self.input.scan_identifier(0);
1041
1042        if !ended_with_slash {
1043            match length {
1044                6 if self.input.is_at(b"public(set)", true) => {
1045                    return (TokenKind::PublicSet, 11);
1046                }
1047                7 if self.input.is_at(b"private(set)", true) => {
1048                    return (TokenKind::PrivateSet, 12);
1049                }
1050                9 if self.input.is_at(b"protected(set)", true) => {
1051                    return (TokenKind::ProtectedSet, 14);
1052                }
1053                _ => {}
1054            }
1055        }
1056
1057        if !ended_with_slash && let Some(kind) = internal::keyword::lookup_keyword(self.input.read(length)) {
1058            return (kind, length);
1059        }
1060
1061        let mut slashes = 0;
1062        let mut last_was_slash = false;
1063        loop {
1064            match self.input.peek(length, 1) {
1065                [b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF] if last_was_slash => {
1066                    length += 1;
1067                    last_was_slash = false;
1068                }
1069                [b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | 0x80..=0xFF] if !last_was_slash => {
1070                    length += 1;
1071                }
1072                [b'\\'] if !self.interpolating || self.brace_interpolating => {
1073                    if last_was_slash {
1074                        length -= 1;
1075                        slashes -= 1;
1076                        last_was_slash = false;
1077                        break;
1078                    }
1079
1080                    length += 1;
1081                    slashes += 1;
1082                    last_was_slash = true;
1083                }
1084                _ => {
1085                    break;
1086                }
1087            }
1088        }
1089
1090        if last_was_slash {
1091            length -= 1;
1092            slashes -= 1;
1093        }
1094
1095        let kind = if slashes > 0 { TokenKind::QualifiedIdentifier } else { TokenKind::Identifier };
1096
1097        (kind, length)
1098    }
1099
1100    #[inline]
1101    fn token(&self, kind: TokenKind, v: &'input [u8], start: Position, _end: Position) -> Token<'input> {
1102        // SAFETY: The input bytes are guaranteed to be valid UTF-8 because:
1103        // 1. File contents are validated via simdutf8 during database loading
1104        // 2. Invalid UTF-8 is converted lossily before reaching the lexer
1105        // 3. All byte slices here are subslices of the validated input
1106        let value = unsafe { std::str::from_utf8_unchecked(v) };
1107
1108        Token { kind, start, value }
1109    }
1110
1111    #[inline]
1112    fn interpolation(
1113        &mut self,
1114        end_offset: u32,
1115        post_interpolation_mode: LexerMode<'input>,
1116        brace: bool,
1117    ) -> Option<Result<Token<'input>, SyntaxError>> {
1118        self.mode = LexerMode::Script;
1119
1120        let was_interpolating = self.interpolating;
1121        self.interpolating = true;
1122        let was_brace_interpolating = self.brace_interpolating;
1123        // For brace interpolation ({$...}), allow qualified identifiers with backslashes.
1124        self.brace_interpolating = brace;
1125
1126        let pending_error = loop {
1127            match self.advance() {
1128                Some(Ok(token)) => {
1129                    let token_start = token.start.offset;
1130                    let token_end = token_start + token.value.len() as u32;
1131                    let is_final_token = token_start <= end_offset && end_offset <= token_end;
1132
1133                    self.buffer.push_back(token);
1134
1135                    if is_final_token {
1136                        break None;
1137                    }
1138                }
1139                Some(Err(error)) => break Some(error),
1140                None => break None,
1141            }
1142        };
1143
1144        self.mode = post_interpolation_mode;
1145        self.interpolating = was_interpolating;
1146        self.brace_interpolating = was_brace_interpolating;
1147
1148        if let Some(error) = pending_error {
1149            return Some(Err(error));
1150        }
1151
1152        self.advance()
1153    }
1154}
1155
/// Exposes the file id of the lexer's underlying input.
impl HasFileId for Lexer<'_> {
    /// Returns the `FileId` reported by `self.input`.
    #[inline]
    fn file_id(&self) -> FileId {
        self.input.file_id()
    }
}
1162
1163#[inline]
1164fn matches_start_of_heredoc_document(input: &Input, prefix_len: usize) -> bool {
1165    let total = input.len();
1166    let base = input.current_offset();
1167
1168    // Start after the prefix (if any) and the fixed opener (3 bytes).
1169    let mut length = 3 + prefix_len;
1170    // Consume any following whitespace.
1171    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1172        length += 1;
1173    }
1174
1175    // The next byte must be a valid start-of-identifier.
1176    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1177        return false;
1178    }
1179    length += 1; // Include that identifier start.
1180
1181    // Now continue reading identifier characters until a newline is found.
1182    loop {
1183        let pos = base + length;
1184        if pos >= total {
1185            return false; // Unexpected EOF
1186        }
1187
1188        let byte = *input.read_at(pos);
1189        if byte == b'\n' {
1190            return true; // Newline found: valid heredoc opener.
1191        } else if byte == b'\r' {
1192            // Handle CRLF: treat '\r' followed by '\n' as a newline as well.
1193            return pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1194        } else if is_part_of_identifier(input.read_at(pos)) {
1195            length += 1;
1196        } else {
1197            return false; // Unexpected character.
1198        }
1199    }
1200}
1201
1202#[inline]
1203fn matches_start_of_double_quote_heredoc_document(input: &Input, prefix_len: usize) -> bool {
1204    let total = input.len();
1205    let base = input.current_offset();
1206
1207    // Start after the prefix (if any) and the fixed opener (3 bytes), then skip any whitespace.
1208    let mut length = 3 + prefix_len;
1209    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1210        length += 1;
1211    }
1212
1213    // Next, expect an opening double quote.
1214    if base + length >= total || *input.read_at(base + length) != b'"' {
1215        return false;
1216    }
1217    length += 1;
1218
1219    // The following byte must be a valid start-of-identifier.
1220    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1221        return false;
1222    }
1223    length += 1;
1224
1225    // Now scan the label. For double‑quoted heredoc, a terminating double quote is required.
1226    let mut terminated = false;
1227    loop {
1228        let pos = base + length;
1229        if pos >= total {
1230            return false;
1231        }
1232        let byte = input.read_at(pos);
1233        if *byte == b'\n' {
1234            // End of line: valid only if a closing double quote was encountered.
1235            return terminated;
1236        } else if *byte == b'\r' {
1237            // Handle CRLF sequences.
1238            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1239        } else if !terminated && is_part_of_identifier(byte) {
1240            length += 1;
1241        } else if !terminated && *byte == b'"' {
1242            terminated = true;
1243            length += 1;
1244        } else {
1245            return false;
1246        }
1247    }
1248}
1249
1250#[inline]
1251fn matches_start_of_nowdoc_document(input: &Input, prefix_len: usize) -> bool {
1252    let total = input.len();
1253    let base = input.current_offset();
1254
1255    // Start after the prefix (if any) and the fixed opener (3 bytes) and skip whitespace.
1256    let mut length = 3 + prefix_len;
1257    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1258        length += 1;
1259    }
1260
1261    // Now, the next byte must be a single quote.
1262    if base + length >= total || *input.read_at(base + length) != b'\'' {
1263        return false;
1264    }
1265    length += 1;
1266
1267    // The following byte must be a valid start-of-identifier.
1268    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1269        return false;
1270    }
1271    length += 1;
1272
1273    // Read the label until a newline. A terminating single quote is required.
1274    let mut terminated = false;
1275    loop {
1276        let pos = base + length;
1277        if pos >= total {
1278            return false;
1279        }
1280        let byte = *input.read_at(pos);
1281        if byte == b'\n' {
1282            return terminated;
1283        } else if byte == b'\r' {
1284            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1285        } else if !terminated && is_part_of_identifier(&byte) {
1286            length += 1;
1287        } else if !terminated && byte == b'\'' {
1288            terminated = true;
1289            length += 1;
1290        } else {
1291            return false;
1292        }
1293    }
1294}
1295
1296#[inline]
1297fn matches_literal_double_quote_string(input: &Input, prefix_len: usize) -> bool {
1298    let total = input.len();
1299    let base = input.current_offset();
1300
1301    // Start after the prefix (if any) and the initial double-quote.
1302    let mut pos = base + 1 + prefix_len;
1303    loop {
1304        if pos >= total {
1305            // Reached EOF: assume literal is complete.
1306            return true;
1307        }
1308        let byte = *input.read_at(pos);
1309        if byte == b'"' {
1310            // Encounter a closing double quote.
1311            return true;
1312        } else if byte == b'\\' {
1313            // Skip an escape sequence: assume that the backslash and the escaped character form a pair.
1314            pos += 2;
1315            continue;
1316        }
1317
1318        // Check for variable interpolation or complex expression start:
1319        // If two-byte sequences match either "$" followed by a start-of-identifier or "{" and "$", then return false.
1320        if pos + 1 < total {
1321            let next = *input.read_at(pos + 1);
1322            if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1323                return false;
1324            }
1325        }
1326        pos += 1;
1327    }
1328}
1329
1330#[inline]
1331fn read_start_of_heredoc_document(input: &Input, double_quoted: bool, prefix_len: usize) -> (usize, usize, usize) {
1332    let total = input.len();
1333    let base = input.current_offset();
1334
1335    // Start reading after the prefix (if any) and the fixed opener (3 bytes).
1336    let mut pos = base + 3 + prefix_len;
1337    let mut whitespaces = 0;
1338    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1339        whitespaces += 1;
1340        pos += 1;
1341    }
1342
1343    // The label (or delimiter) starts after:
1344    //   prefix + 3 bytes + whitespace bytes + an extra offset:
1345    //      if double-quoted: 2 bytes (opening and closing quotes around the label)
1346    //      else: 1 byte.
1347    let mut length = 3 + prefix_len + whitespaces + if double_quoted { 2 } else { 1 };
1348
1349    let mut label_length = 1; // Start with at least one byte for the label.
1350    let mut terminated = false; // For double-quoted heredoc, to track the closing quote.
1351    loop {
1352        let pos = base + length;
1353        // Ensure we haven't run past the input.
1354        if pos >= total {
1355            unreachable!("Unexpected end of input while reading heredoc label");
1356        }
1357
1358        let byte = *input.read_at(pos);
1359        if byte == b'\n' {
1360            // Newline ends the label.
1361            length += 1;
1362            return (length, whitespaces, label_length);
1363        } else if byte == b'\r' {
1364            // Handle CRLF sequences
1365            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1366                length += 2;
1367            } else {
1368                length += 1;
1369            }
1370            return (length, whitespaces, label_length);
1371        } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1372            // For both unquoted and double-quoted (before the closing quote) heredoc,
1373            // a valid identifier character is part of the label.
1374            length += 1;
1375            label_length += 1;
1376        } else if double_quoted && !terminated && byte == b'"' {
1377            // In a double-quoted heredoc, a double quote terminates the label.
1378            length += 1;
1379            terminated = true;
1380        } else {
1381            unreachable!("Unexpected character encountered in heredoc label");
1382        }
1383    }
1384}
1385
1386#[inline]
1387fn read_start_of_nowdoc_document(input: &Input, prefix_len: usize) -> (usize, usize, usize) {
1388    let total = input.len();
1389    let base = input.current_offset();
1390
1391    let mut pos = base + 3 + prefix_len;
1392    let mut whitespaces = 0;
1393    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1394        whitespaces += 1;
1395        pos += 1;
1396    }
1397
1398    // For nowdoc, the fixed extra offset is always 2.
1399    let mut length = 3 + prefix_len + whitespaces + 2;
1400
1401    let mut label_length = 1;
1402    let mut terminated = false;
1403    loop {
1404        let pos = base + length;
1405        if pos >= total {
1406            unreachable!("Unexpected end of input while reading nowdoc label");
1407        }
1408        let byte = *input.read_at(pos);
1409
1410        if byte == b'\n' {
1411            // A newline indicates the end of the label.
1412            length += 1;
1413            return (length, whitespaces, label_length);
1414        } else if byte == b'\r' {
1415            // Handle CRLF sequences
1416            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1417                length += 2;
1418            } else {
1419                length += 1;
1420            }
1421            return (length, whitespaces, label_length);
1422        } else if is_part_of_identifier(&byte) && !terminated {
1423            // For nowdoc, identifier characters contribute to the label until terminated.
1424            length += 1;
1425            label_length += 1;
1426        } else if !terminated && byte == b'\'' {
1427            // A single quote terminates the nowdoc label.
1428            length += 1;
1429            terminated = true;
1430        } else {
1431            unreachable!("Unexpected character encountered in nowdoc label");
1432        }
1433    }
1434}
1435
1436#[inline]
1437fn read_literal_string(input: &Input, quote: u8, prefix_len: usize) -> (TokenKind, usize) {
1438    let total = input.len();
1439    let start = input.current_offset();
1440    let skip = prefix_len + 1; // prefix + opening quote
1441    let mut length = skip;
1442
1443    let bytes = input.peek(skip, total - start - skip);
1444    loop {
1445        let scan_start = length - skip;
1446        match memchr2(quote, b'\\', &bytes[scan_start..]) {
1447            Some(pos) => {
1448                let abs_pos = scan_start + pos;
1449                let byte = bytes[abs_pos];
1450
1451                if byte == b'\\' {
1452                    length = skip + abs_pos + 2;
1453                    if length > total - start {
1454                        return (TokenKind::PartialLiteralString, total - start);
1455                    }
1456                } else {
1457                    length = skip + abs_pos + 1; // +1 for the closing quote
1458                    return (TokenKind::LiteralString, length);
1459                }
1460            }
1461            None => {
1462                // No quote or backslash found - EOF
1463                return (TokenKind::PartialLiteralString, total - start);
1464            }
1465        }
1466    }
1467}
1468
1469#[inline]
1470fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> u32 {
1471    let total = input.len();
1472    let base = input.current_offset();
1473    // `offset` is relative to the current position.
1474    let mut offset = from;
1475
1476    loop {
1477        let abs = base + offset;
1478        if abs >= total {
1479            // End of input.
1480            break;
1481        }
1482
1483        // Pattern 1: If the current byte is part of an identifier, simply advance.
1484        if is_part_of_identifier(input.read_at(abs)) {
1485            offset += 1;
1486            continue;
1487        }
1488
1489        // Pattern 2: If the current byte is a '[' then we enter a bracketed interpolation.
1490        if *input.read_at(abs) == b'[' {
1491            offset += 1;
1492            let mut nesting = 0;
1493            loop {
1494                let abs_inner = base + offset;
1495                if abs_inner >= total {
1496                    break;
1497                }
1498                let b = input.read_at(abs_inner);
1499                if *b == b']' {
1500                    offset += 1;
1501                    if nesting == 0 {
1502                        break;
1503                    }
1504
1505                    nesting -= 1;
1506                } else if *b == b'[' {
1507                    offset += 1;
1508                    nesting += 1;
1509                } else if b.is_ascii_whitespace() {
1510                    // Do not include whitespace.
1511                    break;
1512                } else {
1513                    offset += 1;
1514                }
1515            }
1516            // When bracketed interpolation is processed, exit the loop.
1517            break;
1518        }
1519
1520        // Pattern 3: Check for "->" followed by a valid identifier start.
1521        if base + offset + 2 < total
1522            && *input.read_at(abs) == b'-'
1523            && *input.read_at(base + offset + 1) == b'>'
1524            && is_start_of_identifier(input.read_at(base + offset + 2))
1525        {
1526            offset += 3;
1527            // Consume any following identifier characters.
1528            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1529                offset += 1;
1530            }
1531            break;
1532        }
1533
1534        // Pattern 4: Check for "?->" followed by a valid identifier start.
1535        if base + offset + 3 < total
1536            && *input.read_at(abs) == b'?'
1537            && *input.read_at(base + offset + 1) == b'-'
1538            && *input.read_at(base + offset + 2) == b'>'
1539            && is_start_of_identifier(input.read_at(base + offset + 3))
1540        {
1541            offset += 4;
1542            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1543                offset += 1;
1544            }
1545            break;
1546        }
1547
1548        // None of the expected patterns matched: exit the loop.
1549        break;
1550    }
1551
1552    offset as u32
1553}
1554
1555#[inline]
1556fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> u32 {
1557    let total = input.len();
1558    let base = input.current_offset();
1559    let mut offset = from;
1560    let mut nesting = 0;
1561
1562    loop {
1563        let abs = base + offset;
1564        if abs >= total {
1565            break;
1566        }
1567        match input.read_at(abs) {
1568            b'}' => {
1569                offset += 1;
1570                if nesting == 0 {
1571                    break;
1572                }
1573
1574                nesting -= 1;
1575            }
1576            b'{' => {
1577                offset += 1;
1578                nesting += 1;
1579            }
1580            _ => {
1581                offset += 1;
1582            }
1583        }
1584    }
1585
1586    offset as u32
1587}
1588
1589/// Scan a multi-line comment using SIMD-accelerated search.
1590/// Returns Some(length) including the closing */, or None if unterminated.
1591#[inline]
1592fn scan_multi_line_comment(bytes: &[u8]) -> Option<usize> {
1593    // Use SIMD to find */ quickly
1594    memmem::find(bytes, b"*/").map(|pos| pos + 2)
1595}
1596
1597/// Scan a single-line comment using SIMD-accelerated search.
1598/// Returns the length of the comment body (not including the //).
1599/// Stops at newline or ?>.
1600#[inline]
1601fn scan_single_line_comment(bytes: &[u8]) -> usize {
1602    let mut pos = 0;
1603    while pos < bytes.len() {
1604        match memchr::memchr3(b'\n', b'\r', b'?', &bytes[pos..]) {
1605            Some(offset) => {
1606                let found_pos = pos + offset;
1607                match bytes[found_pos] {
1608                    b'\n' | b'\r' => return found_pos,
1609                    b'?' => {
1610                        // Check if it's ?>
1611                        if found_pos + 1 < bytes.len() && bytes[found_pos + 1] == b'>' {
1612                            // Also check for whitespace before ?>
1613                            if found_pos > 0 && bytes[found_pos - 1].is_ascii_whitespace() {
1614                                return found_pos - 1;
1615                            }
1616                            return found_pos;
1617                        }
1618                        // Not ?>, continue searching
1619                        pos = found_pos + 1;
1620                    }
1621                    _ => unreachable!(),
1622                }
1623            }
1624            None => return bytes.len(),
1625        }
1626    }
1627
1628    bytes.len()
1629}
mago_syntax/lexer/mod.rs

mago_syntax/lexer/
mod.rs