mago_syntax/lexer/
mod.rs

1use std::collections::VecDeque;
2use std::fmt::Debug;
3use std::hint::unreachable_unchecked;
4
5use memchr::memchr2;
6use memchr::memmem;
7
8/// Lookup table for single-character tokens that are ALWAYS single-char
9/// (i.e., they can never be part of a multi-character token).
10/// Maps byte -> Option<TokenKind>
11const SIMPLE_TOKEN_TABLE: [Option<TokenKind>; 256] = {
12    let mut table: [Option<TokenKind>; 256] = [None; 256];
13    table[b';' as usize] = Some(TokenKind::Semicolon);
14    table[b',' as usize] = Some(TokenKind::Comma);
15    table[b')' as usize] = Some(TokenKind::RightParenthesis);
16    table[b'[' as usize] = Some(TokenKind::LeftBracket);
17    table[b']' as usize] = Some(TokenKind::RightBracket);
18    table[b'{' as usize] = Some(TokenKind::LeftBrace);
19    table[b'}' as usize] = Some(TokenKind::RightBrace);
20    table[b'~' as usize] = Some(TokenKind::Tilde);
21    table[b'@' as usize] = Some(TokenKind::At);
22    table
23};
24
/// Byte-indexed lookup table answering "can this byte start an identifier?".
///
/// An entry is `true` for ASCII letters (`a-z`, `A-Z`), the underscore, and
/// every byte in `0x80..=0xFF`; it is `false` for all other bytes.
const IDENT_START_TABLE: [bool; 256] = {
    let mut table = [false; 256];

    // `for` is not usable during const evaluation, so iterate with `while`.
    let mut index = 0usize;
    while index < table.len() {
        let byte = index as u8;
        table[index] = byte.is_ascii_alphabetic() || byte == b'_' || byte >= 0x80;
        index += 1;
    }

    table
};
36
37use mago_database::file::FileId;
38use mago_database::file::HasFileId;
39use mago_span::Position;
40use mago_syntax_core::float_exponent;
41use mago_syntax_core::float_separator;
42use mago_syntax_core::input::Input;
43use mago_syntax_core::number_sign;
44use mago_syntax_core::start_of_binary_number;
45use mago_syntax_core::start_of_float_number;
46use mago_syntax_core::start_of_hexadecimal_number;
47use mago_syntax_core::start_of_identifier;
48use mago_syntax_core::start_of_number;
49use mago_syntax_core::start_of_octal_number;
50use mago_syntax_core::start_of_octal_or_float_number;
51use mago_syntax_core::utils::is_part_of_identifier;
52use mago_syntax_core::utils::is_start_of_identifier;
53use mago_syntax_core::utils::read_digits_of_base;
54
55use crate::error::SyntaxError;
56use crate::lexer::internal::mode::HaltStage;
57use crate::lexer::internal::mode::Interpolation;
58use crate::lexer::internal::mode::LexerMode;
59use crate::lexer::internal::utils::NumberKind;
60use crate::settings::LexerSettings;
61use crate::token::DocumentKind;
62use crate::token::Token;
63use crate::token::TokenKind;
64
65mod internal;
66
67/// The `Lexer` struct is responsible for tokenizing input source code into discrete tokens
68/// based on PHP language syntax. It is designed to work with PHP code from version 7.0 up to 8.4.
69///
70/// The lexer reads through the provided input and processes it accordingly.
71///
72/// It identifies PHP-specific tokens, including operators, keywords, comments, strings, and other syntax elements,
73/// and produces a sequence of [`Token`] objects that are used in further stages of compilation or interpretation.
74///
75/// The lexer is designed to be used in a streaming fashion, where it reads the input source code in chunks
76/// and produces tokens incrementally. This allows for efficient processing of large source files and
77/// minimizes memory usage.
78#[derive(Debug)]
79pub struct Lexer<'input> {
80    input: Input<'input>,
81    settings: LexerSettings,
82    mode: LexerMode<'input>,
83    interpolating: bool,
84    brace_interpolating: bool,
85    /// Buffer for tokens during string interpolation.
86    buffer: VecDeque<Token<'input>>,
87}
88
89impl<'input> Lexer<'input> {
90    /// Initial capacity for the token buffer used during string interpolation.
91    /// Pre-allocating avoids reallocation during interpolation processing.
92    const BUFFER_INITIAL_CAPACITY: usize = 8;
93
94    /// Creates a new `Lexer` instance.
95    ///
96    /// # Parameters
97    ///
98    /// - `input`: The input source code to tokenize.
99    /// - `settings`: The lexer settings.
100    ///
101    /// # Returns
102    ///
103    /// A new `Lexer` instance that reads from the provided byte slice.
104    pub fn new(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
105        Lexer {
106            input,
107            settings,
108            mode: LexerMode::Inline,
109            interpolating: false,
110            brace_interpolating: false,
111            buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
112        }
113    }
114
115    /// Creates a new `Lexer` instance for parsing a script block.
116    ///
117    /// # Parameters
118    ///
119    /// - `input`: The input source code to tokenize.
120    /// - `settings`: The lexer settings.
121    ///
122    /// # Returns
123    ///
124    /// A new `Lexer` instance that reads from the provided byte slice.
125    pub fn scripting(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
126        Lexer {
127            input,
128            settings,
129            mode: LexerMode::Script,
130            interpolating: false,
131            brace_interpolating: false,
132            buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
133        }
134    }
135
136    /// Check if the lexer has reached the end of the input.
137    ///
138    /// If this method returns `true`, the lexer will not produce any more tokens.
139    #[must_use]
140    pub fn has_reached_eof(&self) -> bool {
141        self.input.has_reached_eof()
142    }
143
144    /// Get the current position of the lexer in the input source code.
145    #[inline]
146    pub const fn current_position(&self) -> Position {
147        self.input.current_position()
148    }
149
150    /// Tokenizes the next input from the source code.
151    ///
152    /// This method reads from the input and produces the next [`Token`] based on the current [`LexerMode`].
153    /// It handles various lexical elements such as inline text, script code, strings with interpolation,
154    /// comments, and different PHP-specific constructs.
155    ///
156    /// # Returns
157    ///
158    /// - `Some(Ok(Token))` if a token was successfully parsed.
159    /// - `Some(Err(SyntaxError))` if a syntax error occurred while parsing the next token.
160    /// - `None` if the end of the input has been reached.
161    ///
162    /// # Notes
163    ///
164    /// - It efficiently handles tokenization by consuming input based on patterns specific to PHP syntax.
165    /// - The lexer supports complex features like string interpolation and different numeric formats.
166    ///
167    /// # Errors
168    ///
169    /// Returns `Some(Err(SyntaxError))` in cases such as:
170    ///
171    /// - Unrecognized tokens that do not match any known PHP syntax.
172    /// - Unexpected tokens in a given context, such as an unexpected end of string.
173    ///
174    /// # Panics
175    ///
176    /// This method should not panic under normal operation. If it does, it indicates a bug in the lexer implementation.
177    ///
178    /// # See Also
179    ///
180    /// - [`Token`]: Represents a lexical token with its kind, value, and span.
181    /// - [`SyntaxError`]: Represents errors that can occur during lexing.
182    #[inline]
183    pub fn advance(&mut self) -> Option<Result<Token<'input>, SyntaxError>> {
184        // Check if there are buffered tokens from string interpolation.
185        if !self.interpolating
186            && let Some(token) = self.buffer.pop_front()
187        {
188            return Some(Ok(token));
189        }
190
191        if self.input.has_reached_eof() {
192            return None;
193        }
194
195        match self.mode {
196            LexerMode::Inline => {
197                let start = self.input.current_position();
198                let offset = self.input.current_offset();
199
200                // Shebang is only valid at the absolute start of the file (offset 0).
201                if offset == 0
202                    && self.input.len() >= 2
203                    && unsafe { *self.input.read_at_unchecked(0) } == b'#'
204                    && unsafe { *self.input.read_at_unchecked(1) } == b'!'
205                {
206                    let buffer = self.input.consume_through(b'\n');
207                    let end = self.input.current_position();
208
209                    return Some(Ok(self.token(TokenKind::InlineShebang, buffer, start, end)));
210                }
211
212                // Get the remaining bytes to scan.
213                let bytes = self.input.read_remaining();
214
215                if self.settings.enable_short_tags {
216                    if let Some(pos) = memchr::memmem::find(bytes, b"<?") {
217                        if pos > 0 {
218                            let buffer = self.input.consume(pos);
219                            let end = self.input.current_position();
220
221                            return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
222                        }
223
224                        if self.input.is_at(b"<?php", true) {
225                            let buffer = self.input.consume(5);
226                            self.mode = LexerMode::Script;
227                            return Some(Ok(self.token(
228                                TokenKind::OpenTag,
229                                buffer,
230                                start,
231                                self.input.current_position(),
232                            )));
233                        }
234
235                        if self.input.is_at(b"<?=", false) {
236                            let buffer = self.input.consume(3);
237                            self.mode = LexerMode::Script;
238                            return Some(Ok(self.token(
239                                TokenKind::EchoTag,
240                                buffer,
241                                start,
242                                self.input.current_position(),
243                            )));
244                        }
245
246                        let buffer = self.input.consume(2);
247                        self.mode = LexerMode::Script;
248                        return Some(Ok(self.token(
249                            TokenKind::ShortOpenTag,
250                            buffer,
251                            start,
252                            self.input.current_position(),
253                        )));
254                    }
255                } else {
256                    let iter = memchr::memmem::find_iter(bytes, b"<?");
257
258                    for pos in iter {
259                        // SAFETY: `pos` is guaranteed to be within `bytes` by `find_iter`.
260                        let candidate = unsafe { bytes.get_unchecked(pos..) };
261
262                        if candidate.len() >= 5
263                            && (unsafe { *candidate.get_unchecked(2) } | 0x20) == b'p'
264                            && (unsafe { *candidate.get_unchecked(3) } | 0x20) == b'h'
265                            && (unsafe { *candidate.get_unchecked(4) } | 0x20) == b'p'
266                        {
267                            if pos > 0 {
268                                let buffer = self.input.consume(pos);
269                                let end = self.input.current_position();
270                                return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
271                            }
272
273                            let buffer = self.input.consume(5);
274                            self.mode = LexerMode::Script;
275                            return Some(Ok(self.token(
276                                TokenKind::OpenTag,
277                                buffer,
278                                start,
279                                self.input.current_position(),
280                            )));
281                        }
282
283                        if candidate.len() >= 3 && unsafe { *candidate.get_unchecked(2) } == b'=' {
284                            if pos > 0 {
285                                let buffer = self.input.consume(pos);
286                                let end = self.input.current_position();
287                                return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
288                            }
289
290                            let buffer = self.input.consume(3);
291                            self.mode = LexerMode::Script;
292                            return Some(Ok(self.token(
293                                TokenKind::EchoTag,
294                                buffer,
295                                start,
296                                self.input.current_position(),
297                            )));
298                        }
299                    }
300                }
301
302                if self.input.has_reached_eof() {
303                    return None;
304                }
305
306                let buffer = self.input.consume_remaining();
307                let end = self.input.current_position();
308                Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)))
309            }
310            LexerMode::Script => {
311                let start = self.input.current_position();
312                let whitespaces = self.input.consume_whitespaces();
313                if !whitespaces.is_empty() {
314                    return Some(Ok(self.token(
315                        TokenKind::Whitespace,
316                        whitespaces,
317                        start,
318                        self.input.current_position(),
319                    )));
320                }
321
322                let first_byte = match self.input.read(1).first() {
323                    Some(&b) => b,
324                    None => {
325                        // SAFETY: we check for EOF before entering scripting section,
326                        unsafe { unreachable_unchecked() }
327                    }
328                };
329
330                if let Some(kind) = SIMPLE_TOKEN_TABLE[first_byte as usize] {
331                    let buffer = self.input.consume(1);
332                    let end = self.input.current_position();
333                    return Some(Ok(self.token(kind, buffer, start, end)));
334                }
335
336                if IDENT_START_TABLE[first_byte as usize] {
337                    let (token_kind, len) = self.scan_identifier_or_keyword_info();
338
339                    if token_kind == TokenKind::HaltCompiler {
340                        self.mode = LexerMode::Halt(HaltStage::LookingForLeftParenthesis);
341                    }
342
343                    let buffer = self.input.consume(len);
344                    let end = self.input.current_position();
345                    return Some(Ok(self.token(token_kind, buffer, start, end)));
346                }
347
348                if first_byte == b'$'
349                    && let Some(&next) = self.input.read(2).get(1)
350                    && IDENT_START_TABLE[next as usize]
351                {
352                    let (ident_len, _) = self.input.scan_identifier(1);
353                    let buffer = self.input.consume(1 + ident_len);
354                    let end = self.input.current_position();
355                    return Some(Ok(self.token(TokenKind::Variable, buffer, start, end)));
356                }
357
358                let mut document_label: &[u8] = &[];
359
360                let (token_kind, len) = match self.input.read(3) {
361                    [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
362                    [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
363                    [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
364                    [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
365                    [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
366                    [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
367                    [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
368                    [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
369                    [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
370                    [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
371                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
372
373                        document_label = self.input.peek(3 + whitespaces, label_length);
374
375                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
376                    }
377                    [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
378                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
379
380                        document_label = self.input.peek(4 + whitespaces, label_length);
381
382                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
383                    }
384                    [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
385                        let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
386
387                        document_label = self.input.peek(4 + whitespaces, label_length);
388
389                        (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
390                    }
391                    [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
392                    [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
393                    [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
394                    [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
395                    [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
396                    [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
397                    [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
398                    [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
399                    [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
400                    [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
401                    [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
402                    [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
403                    [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
404                    [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
405                    [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
406                    [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
407                    [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
408                    [b'>', b'>', ..] => (TokenKind::RightShift, 2),
409                    [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
410                    [b':', b':', ..] => (TokenKind::ColonColon, 2),
411                    [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
412                    [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
413                    [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
414                    [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
415                    [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
416                    [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
417                    [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
418                    [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
419                    [b'/', b'/', ..] => {
420                        let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
421                        let comment_len = scan_single_line_comment(remaining);
422                        (TokenKind::SingleLineComment, 2 + comment_len)
423                    }
424                    [b'/', b'*', asterisk] => {
425                        let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
426                        match scan_multi_line_comment(remaining) {
427                            Some(len) => {
428                                let is_docblock = asterisk == &b'*' && len > 2;
429                                if is_docblock {
430                                    (TokenKind::DocBlockComment, len + 2)
431                                } else {
432                                    (TokenKind::MultiLineComment, len + 2)
433                                }
434                            }
435                            None => {
436                                self.input.consume(remaining.len() + 2);
437                                return Some(Err(SyntaxError::UnexpectedEndOfFile(
438                                    self.file_id(),
439                                    self.input.current_position(),
440                                )));
441                            }
442                        }
443                    }
444                    [b'\\', start_of_identifier!(), ..] => {
445                        let mut length = 1;
446                        loop {
447                            let (ident_len, ends_with_ns) = self.input.scan_identifier(length);
448                            length += ident_len;
449                            if ends_with_ns {
450                                length += 1; // Include the backslash
451                            } else {
452                                break;
453                            }
454                        }
455
456                        (TokenKind::FullyQualifiedIdentifier, length)
457                    }
458                    [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
459                    [b'$', ..] => (TokenKind::Dollar, 1),
460                    [b'!', ..] => (TokenKind::Bang, 1),
461                    [b'&', ..] => (TokenKind::Ampersand, 1),
462                    [b'?', ..] => (TokenKind::Question, 1),
463                    [b'=', ..] => (TokenKind::Equal, 1),
464                    [b'`', ..] => (TokenKind::Backtick, 1),
465                    [b'+', ..] => (TokenKind::Plus, 1),
466                    [b'%', ..] => (TokenKind::Percent, 1),
467                    [b'-', ..] => (TokenKind::Minus, 1),
468                    [b'<', ..] => (TokenKind::LessThan, 1),
469                    [b'>', ..] => (TokenKind::GreaterThan, 1),
470                    [b':', ..] => (TokenKind::Colon, 1),
471                    [b'|', ..] => (TokenKind::Pipe, 1),
472                    [b'^', ..] => (TokenKind::Caret, 1),
473                    [b'*', ..] => (TokenKind::Asterisk, 1),
474                    [b'/', ..] => (TokenKind::Slash, 1),
475                    [quote @ b'\'', ..] => read_literal_string(&self.input, *quote),
476                    [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
477                        read_literal_string(&self.input, *quote)
478                    }
479                    [b'"', ..] => (TokenKind::DoubleQuote, 1),
480                    [b'(', ..] => 'parenthesis: {
481                        let mut peek_offset = 1;
482                        while let Some(&b) = self.input.read(peek_offset + 1).get(peek_offset) {
483                            if b.is_ascii_whitespace() {
484                                peek_offset += 1;
485                            } else {
486                                // Check if this byte could start a cast type (case-insensitive)
487                                let lower = b | 0x20; // ASCII lowercase
488                                if !matches!(lower, b'i' | b'b' | b'f' | b'd' | b'r' | b's' | b'a' | b'o' | b'u' | b'v')
489                                {
490                                    break 'parenthesis (TokenKind::LeftParenthesis, 1);
491                                }
492                                break;
493                            }
494                        }
495
496                        for (value, kind) in internal::consts::CAST_TYPES {
497                            if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
498                                break 'parenthesis (kind, length);
499                            }
500                        }
501
502                        (TokenKind::LeftParenthesis, 1)
503                    }
504                    [b'#', ..] => {
505                        let remaining = self.input.peek(1, self.input.len() - self.input.current_offset());
506                        let comment_len = scan_single_line_comment(remaining);
507                        (TokenKind::HashComment, 1 + comment_len)
508                    }
509                    [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
510                    [b'.', start_of_number!(), ..] => {
511                        let mut length = read_digits_of_base(&self.input, 2, 10);
512                        if let float_exponent!() = self.input.peek(length, 1) {
513                            let mut exp_length = length + 1;
514                            if let number_sign!() = self.input.peek(exp_length, 1) {
515                                exp_length += 1;
516                            }
517
518                            let after_exp = read_digits_of_base(&self.input, exp_length, 10);
519                            if after_exp > exp_length {
520                                length = after_exp;
521                            }
522                        }
523
524                        (TokenKind::LiteralFloat, length)
525                    }
526                    [start_of_number!(), ..] => 'number: {
527                        let mut length = 1;
528
529                        let (base, kind): (u8, NumberKind) = match self.input.read(3) {
530                            start_of_binary_number!() => {
531                                length += 1;
532
533                                (2, NumberKind::Integer)
534                            }
535                            start_of_octal_number!() => {
536                                length += 1;
537
538                                (8, NumberKind::Integer)
539                            }
540                            start_of_hexadecimal_number!() => {
541                                length += 1;
542
543                                (16, NumberKind::Integer)
544                            }
545                            start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
546                            start_of_float_number!() => (10, NumberKind::Float),
547                            _ => (10, NumberKind::IntegerOrFloat),
548                        };
549
550                        if kind != NumberKind::Float {
551                            length = read_digits_of_base(&self.input, length, base);
552
553                            if kind == NumberKind::Integer {
554                                break 'number (TokenKind::LiteralInteger, length);
555                            }
556                        }
557
558                        let is_float = matches!(self.input.peek(length, 3), float_separator!());
559
560                        if !is_float {
561                            break 'number (TokenKind::LiteralInteger, length);
562                        }
563
564                        if let [b'.'] = self.input.peek(length, 1) {
565                            length += 1;
566                            length = read_digits_of_base(&self.input, length, 10);
567                        }
568
569                        if let float_exponent!() = self.input.peek(length, 1) {
570                            // Only include exponent if there are digits after it
571                            let mut exp_length = length + 1;
572                            if let number_sign!() = self.input.peek(exp_length, 1) {
573                                exp_length += 1;
574                            }
575                            let after_exp = read_digits_of_base(&self.input, exp_length, 10);
576                            if after_exp > exp_length {
577                                // There are digits after the exponent marker
578                                length = after_exp;
579                            }
580                        }
581
582                        (TokenKind::LiteralFloat, length)
583                    }
584                    [b'.', ..] => (TokenKind::Dot, 1),
585                    [unknown_byte, ..] => {
586                        let position = self.input.current_position();
587                        self.input.consume(1);
588
589                        return Some(Err(SyntaxError::UnrecognizedToken(self.file_id(), *unknown_byte, position)));
590                    }
591                    [] => {
592                        // we check for EOF before entering scripting section,
593                        // so this should be unreachable.
594                        unreachable!()
595                    }
596                };
597
598                self.mode = match token_kind {
599                    TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
600                    TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
601                    TokenKind::CloseTag => LexerMode::Inline,
602                    TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
603                    TokenKind::DocumentStart(document_kind) => {
604                        LexerMode::DocumentString(document_kind, document_label, Interpolation::None)
605                    }
606                    _ => LexerMode::Script,
607                };
608
609                let buffer = self.input.consume(len);
610                let end = self.input.current_position();
611
612                Some(Ok(self.token(token_kind, buffer, start, end)))
613            }
614            LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
615                Interpolation::None => {
616                    let start = self.input.current_position();
617
618                    let mut length = 0;
619                    let mut last_was_slash = false;
620                    let mut token_kind = TokenKind::StringPart;
621                    loop {
622                        match self.input.peek(length, 2) {
623                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
624                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
625
626                                self.mode =
627                                    LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
628
629                                break;
630                            }
631                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
632                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
633
634                                self.mode = LexerMode::DoubleQuoteString(Interpolation::BraceUntil(
635                                    start.offset + until_offset,
636                                ));
637
638                                break;
639                            }
640                            [b'\\', ..] => {
641                                length += 1;
642
643                                last_was_slash = !last_was_slash;
644                            }
645                            [b'"', ..] if !last_was_slash => {
646                                if length == 0 {
647                                    length += 1;
648                                    token_kind = TokenKind::DoubleQuote;
649
650                                    break;
651                                }
652
653                                break;
654                            }
655                            [_, ..] => {
656                                length += 1;
657                                last_was_slash = false;
658                            }
659                            [] => {
660                                break;
661                            }
662                        }
663                    }
664
665                    let buffer = self.input.consume(length);
666                    let end = self.input.current_position();
667
668                    if TokenKind::DoubleQuote == token_kind {
669                        self.mode = LexerMode::Script;
670                    }
671
672                    Some(Ok(self.token(token_kind, buffer, start, end)))
673                }
674                Interpolation::Until(offset) => {
675                    self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None), false)
676                }
677                Interpolation::BraceUntil(offset) => {
678                    self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None), true)
679                }
680            },
681            LexerMode::ShellExecuteString(interpolation) => match &interpolation {
682                Interpolation::None => {
683                    let start = self.input.current_position();
684
685                    let mut length = 0;
686                    let mut last_was_slash = false;
687                    let mut token_kind = TokenKind::StringPart;
688                    loop {
689                        match self.input.peek(length, 2) {
690                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
691                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
692
693                                self.mode =
694                                    LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
695
696                                break;
697                            }
698                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
699                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
700
701                                self.mode = LexerMode::ShellExecuteString(Interpolation::BraceUntil(
702                                    start.offset + until_offset,
703                                ));
704
705                                break;
706                            }
707                            [b'\\', ..] => {
708                                length += 1;
709                                last_was_slash = true;
710                            }
711                            [b'`', ..] if !last_was_slash => {
712                                if length == 0 {
713                                    length += 1;
714                                    token_kind = TokenKind::Backtick;
715
716                                    break;
717                                }
718
719                                break;
720                            }
721                            [_, ..] => {
722                                length += 1;
723                                last_was_slash = false;
724                            }
725                            [] => {
726                                break;
727                            }
728                        }
729                    }
730
731                    let buffer = self.input.consume(length);
732                    let end = self.input.current_position();
733
734                    if TokenKind::Backtick == token_kind {
735                        self.mode = LexerMode::Script;
736                    }
737
738                    Some(Ok(self.token(token_kind, buffer, start, end)))
739                }
740                Interpolation::Until(offset) => {
741                    self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None), false)
742                }
743                Interpolation::BraceUntil(offset) => {
744                    self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None), true)
745                }
746            },
747            LexerMode::DocumentString(kind, label, interpolation) => match &kind {
748                DocumentKind::Heredoc => match &interpolation {
749                    Interpolation::None => {
750                        let start = self.input.current_position();
751
752                        let mut length = 0;
753                        let mut last_was_slash = false;
754                        let mut only_whitespaces = true;
755                        let mut token_kind = TokenKind::StringPart;
756                        loop {
757                            match self.input.peek(length, 2) {
758                                [b'\r', b'\n'] => {
759                                    length += 2;
760
761                                    break;
762                                }
763                                [b'\n' | b'\r', ..] => {
764                                    length += 1;
765
766                                    break;
767                                }
768                                [byte, ..] if byte.is_ascii_whitespace() => {
769                                    length += 1;
770                                }
771                                [b'$', start_of_identifier!(), ..] if !last_was_slash => {
772                                    let until_offset =
773                                        read_until_end_of_variable_interpolation(&self.input, length + 2);
774
775                                    self.mode = LexerMode::DocumentString(
776                                        kind,
777                                        label,
778                                        Interpolation::Until(start.offset + until_offset),
779                                    );
780
781                                    break;
782                                }
783                                [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
784                                    let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
785
786                                    self.mode = LexerMode::DocumentString(
787                                        kind,
788                                        label,
789                                        Interpolation::BraceUntil(start.offset + until_offset),
790                                    );
791
792                                    break;
793                                }
794                                [b'\\', ..] => {
795                                    length += 1;
796                                    last_was_slash = true;
797                                    only_whitespaces = false;
798                                }
799                                [_, ..] => {
800                                    if only_whitespaces
801                                        && self.input.peek(length, label.len()) == label
802                                        && self
803                                            .input
804                                            .peek(length + label.len(), 1)
805                                            .first()
806                                            .is_none_or(|c| !c.is_ascii_alphanumeric())
807                                    {
808                                        length += label.len();
809                                        token_kind = TokenKind::DocumentEnd;
810
811                                        break;
812                                    }
813
814                                    length += 1;
815                                    last_was_slash = false;
816                                    only_whitespaces = false;
817                                }
818                                [] => {
819                                    break;
820                                }
821                            }
822                        }
823
824                        let buffer = self.input.consume(length);
825                        let end = self.input.current_position();
826
827                        if TokenKind::DocumentEnd == token_kind {
828                            self.mode = LexerMode::Script;
829                        }
830
831                        Some(Ok(self.token(token_kind, buffer, start, end)))
832                    }
833                    Interpolation::Until(offset) => {
834                        self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None), false)
835                    }
836                    Interpolation::BraceUntil(offset) => {
837                        self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None), true)
838                    }
839                },
840                DocumentKind::Nowdoc => {
841                    let start = self.input.current_position();
842
843                    let mut length = 0;
844                    let mut terminated = false;
845                    let mut only_whitespaces = true;
846
847                    loop {
848                        match self.input.peek(length, 2) {
849                            [b'\r', b'\n'] => {
850                                length += 2;
851
852                                break;
853                            }
854                            [b'\n' | b'\r', ..] => {
855                                length += 1;
856
857                                break;
858                            }
859                            [byte, ..] if byte.is_ascii_whitespace() => {
860                                length += 1;
861                            }
862                            [_, ..] => {
863                                if only_whitespaces
864                                    && self.input.peek(length, label.len()) == label
865                                    && self
866                                        .input
867                                        .peek(length + label.len(), 1)
868                                        .first()
869                                        .is_none_or(|c| !c.is_ascii_alphanumeric())
870                                {
871                                    length += label.len();
872                                    terminated = true;
873
874                                    break;
875                                }
876
877                                only_whitespaces = false;
878                                length += 1;
879                            }
880                            [] => {
881                                break;
882                            }
883                        }
884                    }
885
886                    let buffer = self.input.consume(length);
887                    let end = self.input.current_position();
888
889                    if terminated {
890                        self.mode = LexerMode::Script;
891
892                        return Some(Ok(self.token(TokenKind::DocumentEnd, buffer, start, end)));
893                    }
894
895                    Some(Ok(self.token(TokenKind::StringPart, buffer, start, end)))
896                }
897            },
898            LexerMode::Halt(stage) => 'halt: {
899                let start = self.input.current_position();
900                if let HaltStage::End = stage {
901                    let buffer = self.input.consume_remaining();
902                    let end = self.input.current_position();
903
904                    break 'halt Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
905                }
906
907                let whitespaces = self.input.consume_whitespaces();
908                if !whitespaces.is_empty() {
909                    let end = self.input.current_position();
910
911                    break 'halt Some(Ok(self.token(TokenKind::Whitespace, whitespaces, start, end)));
912                }
913
914                match &stage {
915                    HaltStage::LookingForLeftParenthesis => {
916                        if self.input.is_at(b"(", false) {
917                            let buffer = self.input.consume(1);
918                            let end = self.input.current_position();
919
920                            self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
921
922                            Some(Ok(self.token(TokenKind::LeftParenthesis, buffer, start, end)))
923                        } else {
924                            let byte = self.input.read(1)[0];
925                            let position = self.input.current_position();
926                            // Consume the unexpected byte to avoid infinite loops
927                            self.input.consume(1);
928                            Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
929                        }
930                    }
931                    HaltStage::LookingForRightParenthesis => {
932                        if self.input.is_at(b")", false) {
933                            let buffer = self.input.consume(1);
934                            let end = self.input.current_position();
935
936                            self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
937
938                            Some(Ok(self.token(TokenKind::RightParenthesis, buffer, start, end)))
939                        } else {
940                            let byte = self.input.read(1)[0];
941                            let position = self.input.current_position();
942                            self.input.consume(1);
943                            Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
944                        }
945                    }
946                    HaltStage::LookingForTerminator => {
947                        if self.input.is_at(b";", false) {
948                            let buffer = self.input.consume(1);
949                            let end = self.input.current_position();
950
951                            self.mode = LexerMode::Halt(HaltStage::End);
952
953                            Some(Ok(self.token(TokenKind::Semicolon, buffer, start, end)))
954                        } else if self.input.is_at(b"?>", false) {
955                            let buffer = self.input.consume(2);
956                            let end = self.input.current_position();
957
958                            self.mode = LexerMode::Halt(HaltStage::End);
959
960                            Some(Ok(self.token(TokenKind::CloseTag, buffer, start, end)))
961                        } else {
962                            let byte = self.input.read(1)[0];
963                            let position = self.input.current_position();
964                            self.input.consume(1);
965                            Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
966                        }
967                    }
968                    _ => unreachable!(),
969                }
970            }
971        }
972    }
973
974    /// Fast path for scanning identifiers and keywords.
975    /// Called when we know the first byte is an identifier start character.
976    /// Returns (TokenKind, length) to allow proper mode switching.
977    #[inline]
978    fn scan_identifier_or_keyword_info(&self) -> (TokenKind, usize) {
979        let (mut length, ended_with_slash) = self.input.scan_identifier(0);
980
981        if !ended_with_slash {
982            match length {
983                6 => {
984                    if self.input.is_at(b"public(set)", true) {
985                        return (TokenKind::PublicSet, 11);
986                    }
987                }
988                7 => {
989                    if self.input.is_at(b"private(set)", true) {
990                        return (TokenKind::PrivateSet, 12);
991                    }
992                }
993                9 => {
994                    if self.input.is_at(b"protected(set)", true) {
995                        return (TokenKind::ProtectedSet, 14);
996                    }
997                }
998                _ => {}
999            }
1000        }
1001
1002        if !ended_with_slash && let Some(kind) = internal::keyword::lookup_keyword(self.input.read(length)) {
1003            return (kind, length);
1004        }
1005
1006        let mut slashes = 0;
1007        let mut last_was_slash = false;
1008        loop {
1009            match self.input.peek(length, 1) {
1010                [b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF] if last_was_slash => {
1011                    length += 1;
1012                    last_was_slash = false;
1013                }
1014                [b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | 0x80..=0xFF] if !last_was_slash => {
1015                    length += 1;
1016                }
1017                [b'\\'] if !self.interpolating || self.brace_interpolating => {
1018                    if last_was_slash {
1019                        length -= 1;
1020                        slashes -= 1;
1021                        last_was_slash = false;
1022                        break;
1023                    }
1024
1025                    length += 1;
1026                    slashes += 1;
1027                    last_was_slash = true;
1028                }
1029                _ => {
1030                    break;
1031                }
1032            }
1033        }
1034
1035        if last_was_slash {
1036            length -= 1;
1037            slashes -= 1;
1038        }
1039
1040        let kind = if slashes > 0 { TokenKind::QualifiedIdentifier } else { TokenKind::Identifier };
1041
1042        (kind, length)
1043    }
1044
1045    #[inline]
1046    fn token(&self, kind: TokenKind, v: &'input [u8], start: Position, _end: Position) -> Token<'input> {
1047        // SAFETY: The input bytes are guaranteed to be valid UTF-8 because:
1048        // 1. File contents are validated via simdutf8 during database loading
1049        // 2. Invalid UTF-8 is converted lossily before reaching the lexer
1050        // 3. All byte slices here are subslices of the validated input
1051        let value = unsafe { std::str::from_utf8_unchecked(v) };
1052
1053        Token { kind, start, value }
1054    }
1055
1056    #[inline]
1057    fn interpolation(
1058        &mut self,
1059        end_offset: u32,
1060        post_interpolation_mode: LexerMode<'input>,
1061        brace: bool,
1062    ) -> Option<Result<Token<'input>, SyntaxError>> {
1063        self.mode = LexerMode::Script;
1064
1065        let was_interpolating = self.interpolating;
1066        self.interpolating = true;
1067        let was_brace_interpolating = self.brace_interpolating;
1068        // For brace interpolation ({$...}), allow qualified identifiers with backslashes.
1069        self.brace_interpolating = brace;
1070
1071        loop {
1072            let subsequent_token = self.advance()?.ok()?;
1073            // Check if this token contains the end offset
1074            let token_start = subsequent_token.start.offset;
1075            let token_end = token_start + subsequent_token.value.len() as u32;
1076            let is_final_token = token_start <= end_offset && end_offset <= token_end;
1077
1078            self.buffer.push_back(subsequent_token);
1079
1080            if is_final_token {
1081                break;
1082            }
1083        }
1084
1085        self.mode = post_interpolation_mode;
1086        self.interpolating = was_interpolating;
1087        self.brace_interpolating = was_brace_interpolating;
1088
1089        self.advance()
1090    }
1091}
1092
1093impl HasFileId for Lexer<'_> {
1094    #[inline]
1095    fn file_id(&self) -> FileId {
1096        self.input.file_id()
1097    }
1098}
1099
1100#[inline]
1101fn matches_start_of_heredoc_document(input: &Input) -> bool {
1102    let total = input.len();
1103    let base = input.current_offset();
1104
1105    // Start after the fixed opener (3 bytes).
1106    let mut length = 3;
1107    // Consume any following whitespace.
1108    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1109        length += 1;
1110    }
1111
1112    // The next byte must be a valid start-of-identifier.
1113    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1114        return false;
1115    }
1116    length += 1; // Include that identifier start.
1117
1118    // Now continue reading identifier characters until a newline is found.
1119    loop {
1120        let pos = base + length;
1121        if pos >= total {
1122            return false; // Unexpected EOF
1123        }
1124
1125        let byte = *input.read_at(pos);
1126        if byte == b'\n' {
1127            return true; // Newline found: valid heredoc opener.
1128        } else if byte == b'\r' {
1129            // Handle CRLF: treat '\r' followed by '\n' as a newline as well.
1130            return pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1131        } else if is_part_of_identifier(input.read_at(pos)) {
1132            length += 1;
1133        } else {
1134            return false; // Unexpected character.
1135        }
1136    }
1137}
1138
1139#[inline]
1140fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1141    let total = input.len();
1142    let base = input.current_offset();
1143
1144    // Start after the fixed opener (3 bytes), then skip any whitespace.
1145    let mut length = 3;
1146    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1147        length += 1;
1148    }
1149
1150    // Next, expect an opening double quote.
1151    if base + length >= total || *input.read_at(base + length) != b'"' {
1152        return false;
1153    }
1154    length += 1;
1155
1156    // The following byte must be a valid start-of-identifier.
1157    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1158        return false;
1159    }
1160    length += 1;
1161
1162    // Now scan the label. For double‑quoted heredoc, a terminating double quote is required.
1163    let mut terminated = false;
1164    loop {
1165        let pos = base + length;
1166        if pos >= total {
1167            return false;
1168        }
1169        let byte = input.read_at(pos);
1170        if *byte == b'\n' {
1171            // End of line: valid only if a closing double quote was encountered.
1172            return terminated;
1173        } else if *byte == b'\r' {
1174            // Handle CRLF sequences.
1175            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1176        } else if !terminated && is_part_of_identifier(byte) {
1177            length += 1;
1178        } else if !terminated && *byte == b'"' {
1179            terminated = true;
1180            length += 1;
1181        } else {
1182            return false;
1183        }
1184    }
1185}
1186
1187#[inline]
1188fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1189    let total = input.len();
1190    let base = input.current_offset();
1191
1192    // Start after the fixed opener (3 bytes) and skip whitespace.
1193    let mut length = 3;
1194    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1195        length += 1;
1196    }
1197
1198    // Now, the next byte must be a single quote.
1199    if base + length >= total || *input.read_at(base + length) != b'\'' {
1200        return false;
1201    }
1202    length += 1;
1203
1204    // The following byte must be a valid start-of-identifier.
1205    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1206        return false;
1207    }
1208    length += 1;
1209
1210    // Read the label until a newline. A terminating single quote is required.
1211    let mut terminated = false;
1212    loop {
1213        let pos = base + length;
1214        if pos >= total {
1215            return false;
1216        }
1217        let byte = *input.read_at(pos);
1218        if byte == b'\n' {
1219            return terminated;
1220        } else if byte == b'\r' {
1221            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1222        } else if !terminated && is_part_of_identifier(&byte) {
1223            length += 1;
1224        } else if !terminated && byte == b'\'' {
1225            terminated = true;
1226            length += 1;
1227        } else {
1228            return false;
1229        }
1230    }
1231}
1232
1233#[inline]
1234fn matches_literal_double_quote_string(input: &Input) -> bool {
1235    let total = input.len();
1236    let base = input.current_offset();
1237
1238    // Start after the initial double-quote (assumed consumed).
1239    let mut pos = base + 1;
1240    loop {
1241        if pos >= total {
1242            // Reached EOF: assume literal is complete.
1243            return true;
1244        }
1245        let byte = *input.read_at(pos);
1246        if byte == b'"' {
1247            // Encounter a closing double quote.
1248            return true;
1249        } else if byte == b'\\' {
1250            // Skip an escape sequence: assume that the backslash and the escaped character form a pair.
1251            pos += 2;
1252            continue;
1253        }
1254
1255        // Check for variable interpolation or complex expression start:
1256        // If two-byte sequences match either "$" followed by a start-of-identifier or "{" and "$", then return false.
1257        if pos + 1 < total {
1258            let next = *input.read_at(pos + 1);
1259            if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1260                return false;
1261            }
1262        }
1263        pos += 1;
1264    }
1265}
1266
1267#[inline]
1268fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1269    let total = input.len();
1270    let base = input.current_offset();
1271
1272    // Start reading at offset base+3 (the fixed opener length).
1273    let mut pos = base + 3;
1274    let mut whitespaces = 0;
1275    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1276        whitespaces += 1;
1277        pos += 1;
1278    }
1279
1280    // The label (or delimiter) starts after:
1281    //   3 bytes + whitespace bytes + an extra offset:
1282    //      if double-quoted: 2 bytes (opening and closing quotes around the label)
1283    //      else: 1 byte.
1284    let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1285
1286    let mut label_length = 1; // Start with at least one byte for the label.
1287    let mut terminated = false; // For double-quoted heredoc, to track the closing quote.
1288    loop {
1289        let pos = base + length;
1290        // Ensure we haven't run past the input.
1291        if pos >= total {
1292            unreachable!("Unexpected end of input while reading heredoc label");
1293        }
1294
1295        let byte = *input.read_at(pos);
1296        if byte == b'\n' {
1297            // Newline ends the label.
1298            length += 1;
1299            return (length, whitespaces, label_length);
1300        } else if byte == b'\r' {
1301            // Handle CRLF sequences
1302            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1303                length += 2;
1304            } else {
1305                length += 1;
1306            }
1307            return (length, whitespaces, label_length);
1308        } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1309            // For both unquoted and double-quoted (before the closing quote) heredoc,
1310            // a valid identifier character is part of the label.
1311            length += 1;
1312            label_length += 1;
1313        } else if double_quoted && !terminated && byte == b'"' {
1314            // In a double-quoted heredoc, a double quote terminates the label.
1315            length += 1;
1316            terminated = true;
1317        } else {
1318            unreachable!("Unexpected character encountered in heredoc label");
1319        }
1320    }
1321}
1322
1323#[inline]
1324fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1325    let total = input.len();
1326    let base = input.current_offset();
1327
1328    let mut pos = base + 3;
1329    let mut whitespaces = 0;
1330    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1331        whitespaces += 1;
1332        pos += 1;
1333    }
1334
1335    // For nowdoc, the fixed extra offset is always 2.
1336    let mut length = 3 + whitespaces + 2;
1337
1338    let mut label_length = 1;
1339    let mut terminated = false;
1340    loop {
1341        let pos = base + length;
1342        if pos >= total {
1343            unreachable!("Unexpected end of input while reading nowdoc label");
1344        }
1345        let byte = *input.read_at(pos);
1346
1347        if byte == b'\n' {
1348            // A newline indicates the end of the label.
1349            length += 1;
1350            return (length, whitespaces, label_length);
1351        } else if byte == b'\r' {
1352            // Handle CRLF sequences
1353            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1354                length += 2;
1355            } else {
1356                length += 1;
1357            }
1358            return (length, whitespaces, label_length);
1359        } else if is_part_of_identifier(&byte) && !terminated {
1360            // For nowdoc, identifier characters contribute to the label until terminated.
1361            length += 1;
1362            label_length += 1;
1363        } else if !terminated && byte == b'\'' {
1364            // A single quote terminates the nowdoc label.
1365            length += 1;
1366            terminated = true;
1367        } else {
1368            unreachable!("Unexpected character encountered in nowdoc label");
1369        }
1370    }
1371}
1372
1373#[inline]
1374fn read_literal_string(input: &Input, quote: u8) -> (TokenKind, usize) {
1375    let total = input.len();
1376    let start = input.current_offset();
1377    let mut length = 1; // We assume the opening quote is already consumed.
1378
1379    let bytes = input.peek(length, total - start - length);
1380    loop {
1381        match memchr2(quote, b'\\', &bytes[length - 1..]) {
1382            Some(pos) => {
1383                let abs_pos = length - 1 + pos;
1384                let byte = bytes[abs_pos];
1385
1386                if byte == b'\\' {
1387                    length = abs_pos + 2 + 1; // +1 because bytes starts at offset 1
1388                    if length > total - start {
1389                        return (TokenKind::PartialLiteralString, total - start);
1390                    }
1391                } else {
1392                    length = abs_pos + 2; // +1 for the quote, +1 because bytes starts at offset 1
1393                    return (TokenKind::LiteralString, length);
1394                }
1395            }
1396            None => {
1397                // No quote or backslash found - EOF
1398                return (TokenKind::PartialLiteralString, total - start);
1399            }
1400        }
1401    }
1402}
1403
1404#[inline]
1405fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> u32 {
1406    let total = input.len();
1407    let base = input.current_offset();
1408    // `offset` is relative to the current position.
1409    let mut offset = from;
1410
1411    loop {
1412        let abs = base + offset;
1413        if abs >= total {
1414            // End of input.
1415            break;
1416        }
1417
1418        // Pattern 1: If the current byte is part of an identifier, simply advance.
1419        if is_part_of_identifier(input.read_at(abs)) {
1420            offset += 1;
1421            continue;
1422        }
1423
1424        // Pattern 2: If the current byte is a '[' then we enter a bracketed interpolation.
1425        if *input.read_at(abs) == b'[' {
1426            offset += 1;
1427            let mut nesting = 0;
1428            loop {
1429                let abs_inner = base + offset;
1430                if abs_inner >= total {
1431                    break;
1432                }
1433                let b = input.read_at(abs_inner);
1434                if *b == b']' {
1435                    offset += 1;
1436                    if nesting == 0 {
1437                        break;
1438                    }
1439
1440                    nesting -= 1;
1441                } else if *b == b'[' {
1442                    offset += 1;
1443                    nesting += 1;
1444                } else if b.is_ascii_whitespace() {
1445                    // Do not include whitespace.
1446                    break;
1447                } else {
1448                    offset += 1;
1449                }
1450            }
1451            // When bracketed interpolation is processed, exit the loop.
1452            break;
1453        }
1454
1455        // Pattern 3: Check for "->" followed by a valid identifier start.
1456        if base + offset + 2 < total
1457            && *input.read_at(abs) == b'-'
1458            && *input.read_at(base + offset + 1) == b'>'
1459            && is_start_of_identifier(input.read_at(base + offset + 2))
1460        {
1461            offset += 3;
1462            // Consume any following identifier characters.
1463            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1464                offset += 1;
1465            }
1466            break;
1467        }
1468
1469        // Pattern 4: Check for "?->" followed by a valid identifier start.
1470        if base + offset + 3 < total
1471            && *input.read_at(abs) == b'?'
1472            && *input.read_at(base + offset + 1) == b'-'
1473            && *input.read_at(base + offset + 2) == b'>'
1474            && is_start_of_identifier(input.read_at(base + offset + 3))
1475        {
1476            offset += 4;
1477            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1478                offset += 1;
1479            }
1480            break;
1481        }
1482
1483        // None of the expected patterns matched: exit the loop.
1484        break;
1485    }
1486
1487    offset as u32
1488}
1489
1490#[inline]
1491fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> u32 {
1492    let total = input.len();
1493    let base = input.current_offset();
1494    let mut offset = from;
1495    let mut nesting = 0;
1496
1497    loop {
1498        let abs = base + offset;
1499        if abs >= total {
1500            break;
1501        }
1502        match input.read_at(abs) {
1503            b'}' => {
1504                offset += 1;
1505                if nesting == 0 {
1506                    break;
1507                }
1508
1509                nesting -= 1;
1510            }
1511            b'{' => {
1512                offset += 1;
1513                nesting += 1;
1514            }
1515            _ => {
1516                offset += 1;
1517            }
1518        }
1519    }
1520
1521    offset as u32
1522}
1523
1524/// Scan a multi-line comment using SIMD-accelerated search.
1525/// Returns Some(length) including the closing */, or None if unterminated.
1526#[inline]
1527fn scan_multi_line_comment(bytes: &[u8]) -> Option<usize> {
1528    // Use SIMD to find */ quickly
1529    memmem::find(bytes, b"*/").map(|pos| pos + 2)
1530}
1531
1532/// Scan a single-line comment using SIMD-accelerated search.
1533/// Returns the length of the comment body (not including the //).
1534/// Stops at newline or ?>.
1535#[inline]
1536fn scan_single_line_comment(bytes: &[u8]) -> usize {
1537    let mut pos = 0;
1538    while pos < bytes.len() {
1539        match memchr::memchr3(b'\n', b'\r', b'?', &bytes[pos..]) {
1540            Some(offset) => {
1541                let found_pos = pos + offset;
1542                match bytes[found_pos] {
1543                    b'\n' | b'\r' => return found_pos,
1544                    b'?' => {
1545                        // Check if it's ?>
1546                        if found_pos + 1 < bytes.len() && bytes[found_pos + 1] == b'>' {
1547                            // Also check for whitespace before ?>
1548                            if found_pos > 0 && bytes[found_pos - 1].is_ascii_whitespace() {
1549                                return found_pos - 1;
1550                            }
1551                            return found_pos;
1552                        }
1553                        // Not ?>, continue searching
1554                        pos = found_pos + 1;
1555                    }
1556                    _ => unreachable!(),
1557                }
1558            }
1559            None => return bytes.len(),
1560        }
1561    }
1562
1563    bytes.len()
1564}
mago_syntax/lexer/mod.rs

mago_syntax/lexer/
mod.rs