mago_syntax/lexer/
mod.rs

1use std::collections::VecDeque;
2use std::fmt::Debug;
3use std::hint::unreachable_unchecked;
4
5use memchr::memchr2;
6use memchr::memmem;
7
8/// Lookup table for single-character tokens that are ALWAYS single-char
9/// (i.e., they can never be part of a multi-character token).
10/// Maps byte -> Option<TokenKind>
11const SIMPLE_TOKEN_TABLE: [Option<TokenKind>; 256] = {
12    let mut table: [Option<TokenKind>; 256] = [None; 256];
13    table[b';' as usize] = Some(TokenKind::Semicolon);
14    table[b',' as usize] = Some(TokenKind::Comma);
15    table[b')' as usize] = Some(TokenKind::RightParenthesis);
16    table[b'[' as usize] = Some(TokenKind::LeftBracket);
17    table[b']' as usize] = Some(TokenKind::RightBracket);
18    table[b'{' as usize] = Some(TokenKind::LeftBrace);
19    table[b'}' as usize] = Some(TokenKind::RightBrace);
20    table[b'~' as usize] = Some(TokenKind::Tilde);
21    table[b'@' as usize] = Some(TokenKind::At);
22    table
23};
24
/// Byte-indexed table answering "may this byte start an identifier?".
///
/// An entry is `true` for ASCII letters (`a-z`, `A-Z`), the underscore, and every
/// byte in the range `0x80..=0xFF`; all other entries are `false`.
const IDENT_START_TABLE: [bool; 256] = {
    let mut table = [false; 256];

    // `for` loops are not available in const evaluation, hence the manual index.
    let mut index = 0usize;
    while index < table.len() {
        let byte = index as u8;
        table[index] = byte.is_ascii_alphabetic() || byte == b'_' || byte >= 0x80;
        index += 1;
    }

    table
};
36
37use mago_database::file::FileId;
38use mago_database::file::HasFileId;
39use mago_span::Position;
40use mago_syntax_core::float_exponent;
41use mago_syntax_core::float_separator;
42use mago_syntax_core::input::Input;
43use mago_syntax_core::number_sign;
44use mago_syntax_core::start_of_binary_number;
45use mago_syntax_core::start_of_float_number;
46use mago_syntax_core::start_of_hexadecimal_number;
47use mago_syntax_core::start_of_identifier;
48use mago_syntax_core::start_of_number;
49use mago_syntax_core::start_of_octal_number;
50use mago_syntax_core::start_of_octal_or_float_number;
51use mago_syntax_core::utils::is_part_of_identifier;
52use mago_syntax_core::utils::is_start_of_identifier;
53use mago_syntax_core::utils::read_digits_of_base;
54
55use crate::error::SyntaxError;
56use crate::lexer::internal::mode::HaltStage;
57use crate::lexer::internal::mode::Interpolation;
58use crate::lexer::internal::mode::LexerMode;
59use crate::lexer::internal::utils::NumberKind;
60use crate::settings::LexerSettings;
61use crate::token::DocumentKind;
62use crate::token::Token;
63use crate::token::TokenKind;
64
65mod internal;
66
/// The `Lexer` struct is responsible for tokenizing input source code into discrete tokens
/// based on PHP language syntax. It is designed to work with PHP code from version 7.0 up to 8.4.
///
/// The lexer reads through the provided input and processes it accordingly.
///
/// It identifies PHP-specific tokens, including operators, keywords, comments, strings, and other syntax elements,
/// and produces a sequence of [`Token`] objects that are used in further stages of compilation or interpretation.
///
/// The lexer is designed to be used in a streaming fashion, where it reads the input source code in chunks
/// and produces tokens incrementally. This allows for efficient processing of large source files and
/// minimizes memory usage.
#[derive(Debug)]
pub struct Lexer<'input> {
    /// The source being tokenized; tokens are consumed from this input.
    input: Input<'input>,
    /// Lexer configuration (for example, whether `<?` short open tags are recognized).
    settings: LexerSettings,
    /// The current lexing mode; it selects how the next token is scanned.
    mode: LexerMode<'input>,
    /// While `true`, buffered tokens are not handed out from `buffer`.
    interpolating: bool,
    /// NOTE(review): presumably marks brace-style (`{$...}` / `${...}`) interpolation;
    /// its use is not visible in this part of the file, so confirm before relying on it.
    brace_interpolating: bool,
    /// Buffer for tokens during string interpolation.
    buffer: VecDeque<Token<'input>>,
}
88
89impl<'input> Lexer<'input> {
    /// Initial capacity for the token buffer used during string interpolation.
    ///
    /// The constructors pass this value to `VecDeque::with_capacity`, so the buffer
    /// can hold this many tokens before it has to reallocate.
    const BUFFER_INITIAL_CAPACITY: usize = 8;
93
94    /// Creates a new `Lexer` instance.
95    ///
96    /// # Parameters
97    ///
98    /// - `input`: The input source code to tokenize.
99    /// - `settings`: The lexer settings.
100    ///
101    /// # Returns
102    ///
103    /// A new `Lexer` instance that reads from the provided byte slice.
104    pub fn new(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
105        Lexer {
106            input,
107            settings,
108            mode: LexerMode::Inline,
109            interpolating: false,
110            brace_interpolating: false,
111            buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
112        }
113    }
114
115    /// Creates a new `Lexer` instance for parsing a script block.
116    ///
117    /// # Parameters
118    ///
119    /// - `input`: The input source code to tokenize.
120    /// - `settings`: The lexer settings.
121    ///
122    /// # Returns
123    ///
124    /// A new `Lexer` instance that reads from the provided byte slice.
125    pub fn scripting(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
126        Lexer {
127            input,
128            settings,
129            mode: LexerMode::Script,
130            interpolating: false,
131            brace_interpolating: false,
132            buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
133        }
134    }
135
    /// Check if the lexer has reached the end of the input.
    ///
    /// This simply reports whether the underlying input has been fully consumed.
    ///
    /// Note that `advance` hands out already-buffered tokens (when it is not in the
    /// middle of an interpolation) *before* it consults the end-of-input check, so a
    /// `true` result means no further input will be read, but buffered tokens may
    /// still be returned.
    #[must_use]
    pub fn has_reached_eof(&self) -> bool {
        self.input.has_reached_eof()
    }
143
    /// Get the current position of the lexer in the input source code.
    ///
    /// The value is whatever the underlying input reports as its current position,
    /// i.e. the location from which the next token will be read.
    #[inline]
    pub const fn current_position(&self) -> Position {
        self.input.current_position()
    }
149
150    /// Tokenizes the next input from the source code.
151    ///
152    /// This method reads from the input and produces the next [`Token`] based on the current [`LexerMode`].
153    /// It handles various lexical elements such as inline text, script code, strings with interpolation,
154    /// comments, and different PHP-specific constructs.
155    ///
156    /// # Returns
157    ///
158    /// - `Some(Ok(Token))` if a token was successfully parsed.
159    /// - `Some(Err(SyntaxError))` if a syntax error occurred while parsing the next token.
160    /// - `None` if the end of the input has been reached.
161    ///
162    /// # Notes
163    ///
164    /// - It efficiently handles tokenization by consuming input based on patterns specific to PHP syntax.
165    /// - The lexer supports complex features like string interpolation and different numeric formats.
166    ///
167    /// # Errors
168    ///
169    /// Returns `Some(Err(SyntaxError))` in cases such as:
170    ///
171    /// - Unrecognized tokens that do not match any known PHP syntax.
172    /// - Unexpected tokens in a given context, such as an unexpected end of string.
173    ///
174    /// # Panics
175    ///
176    /// This method should not panic under normal operation. If it does, it indicates a bug in the lexer implementation.
177    ///
178    /// # See Also
179    ///
180    /// - [`Token`]: Represents a lexical token with its kind, value, and span.
181    /// - [`SyntaxError`]: Represents errors that can occur during lexing.
182    #[inline]
183    pub fn advance(&mut self) -> Option<Result<Token<'input>, SyntaxError>> {
184        // Check if there are buffered tokens from string interpolation.
185        if !self.interpolating
186            && let Some(token) = self.buffer.pop_front()
187        {
188            return Some(Ok(token));
189        }
190
191        if self.input.has_reached_eof() {
192            return None;
193        }
194
195        match self.mode {
196            LexerMode::Inline => {
197                let start = self.input.current_position();
198                let offset = self.input.current_offset();
199
200                // Shebang is only valid at the absolute start of the file (offset 0).
201                if offset == 0
202                    && self.input.len() >= 2
203                    && unsafe { *self.input.read_at_unchecked(0) } == b'#'
204                    && unsafe { *self.input.read_at_unchecked(1) } == b'!'
205                {
206                    let buffer = self.input.consume_through(b'\n');
207                    let end = self.input.current_position();
208
209                    return Some(Ok(self.token(TokenKind::InlineShebang, buffer, start, end)));
210                }
211
212                // Get the remaining bytes to scan.
213                let bytes = self.input.read_remaining();
214
215                if self.settings.enable_short_tags {
216                    if let Some(pos) = memchr::memmem::find(bytes, b"<?") {
217                        if pos > 0 {
218                            let buffer = self.input.consume(pos);
219                            let end = self.input.current_position();
220
221                            return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
222                        }
223
224                        if self.input.is_at(b"<?php", true) {
225                            let buffer = self.input.consume(5);
226                            self.mode = LexerMode::Script;
227                            return Some(Ok(self.token(
228                                TokenKind::OpenTag,
229                                buffer,
230                                start,
231                                self.input.current_position(),
232                            )));
233                        }
234
235                        if self.input.is_at(b"<?=", false) {
236                            let buffer = self.input.consume(3);
237                            self.mode = LexerMode::Script;
238                            return Some(Ok(self.token(
239                                TokenKind::EchoTag,
240                                buffer,
241                                start,
242                                self.input.current_position(),
243                            )));
244                        }
245
246                        let buffer = self.input.consume(2);
247                        self.mode = LexerMode::Script;
248                        return Some(Ok(self.token(
249                            TokenKind::ShortOpenTag,
250                            buffer,
251                            start,
252                            self.input.current_position(),
253                        )));
254                    }
255                } else {
256                    let iter = memchr::memmem::find_iter(bytes, b"<?");
257
258                    for pos in iter {
259                        // SAFETY: `pos` is guaranteed to be within `bytes` by `find_iter`.
260                        let candidate = unsafe { bytes.get_unchecked(pos..) };
261
262                        if candidate.len() >= 5
263                            && (unsafe { *candidate.get_unchecked(2) } | 0x20) == b'p'
264                            && (unsafe { *candidate.get_unchecked(3) } | 0x20) == b'h'
265                            && (unsafe { *candidate.get_unchecked(4) } | 0x20) == b'p'
266                        {
267                            if pos > 0 {
268                                let buffer = self.input.consume(pos);
269                                let end = self.input.current_position();
270                                return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
271                            }
272
273                            let buffer = self.input.consume(5);
274                            self.mode = LexerMode::Script;
275                            return Some(Ok(self.token(
276                                TokenKind::OpenTag,
277                                buffer,
278                                start,
279                                self.input.current_position(),
280                            )));
281                        }
282
283                        if candidate.len() >= 3 && unsafe { *candidate.get_unchecked(2) } == b'=' {
284                            if pos > 0 {
285                                let buffer = self.input.consume(pos);
286                                let end = self.input.current_position();
287                                return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
288                            }
289
290                            let buffer = self.input.consume(3);
291                            self.mode = LexerMode::Script;
292                            return Some(Ok(self.token(
293                                TokenKind::EchoTag,
294                                buffer,
295                                start,
296                                self.input.current_position(),
297                            )));
298                        }
299                    }
300                }
301
302                if self.input.has_reached_eof() {
303                    return None;
304                }
305
306                let buffer = self.input.consume_remaining();
307                let end = self.input.current_position();
308                Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)))
309            }
310            LexerMode::Script => {
311                let start = self.input.current_position();
312                let whitespaces = self.input.consume_whitespaces();
313                if !whitespaces.is_empty() {
314                    return Some(Ok(self.token(
315                        TokenKind::Whitespace,
316                        whitespaces,
317                        start,
318                        self.input.current_position(),
319                    )));
320                }
321
322                let first_byte = match self.input.read(1).first() {
323                    Some(&b) => b,
324                    None => {
325                        // SAFETY: we check for EOF before entering scripting section,
326                        unsafe { unreachable_unchecked() }
327                    }
328                };
329
330                if let Some(kind) = SIMPLE_TOKEN_TABLE[first_byte as usize] {
331                    let buffer = self.input.consume(1);
332                    let end = self.input.current_position();
333                    return Some(Ok(self.token(kind, buffer, start, end)));
334                }
335
336                if IDENT_START_TABLE[first_byte as usize] {
337                    let (token_kind, len) = self.scan_identifier_or_keyword_info();
338
339                    if token_kind == TokenKind::HaltCompiler {
340                        self.mode = LexerMode::Halt(HaltStage::LookingForLeftParenthesis);
341                    }
342
343                    let buffer = self.input.consume(len);
344                    let end = self.input.current_position();
345                    return Some(Ok(self.token(token_kind, buffer, start, end)));
346                }
347
348                if first_byte == b'$'
349                    && let Some(&next) = self.input.read(2).get(1)
350                    && IDENT_START_TABLE[next as usize]
351                {
352                    let (ident_len, _) = self.input.scan_identifier(1);
353                    let buffer = self.input.consume(1 + ident_len);
354                    let end = self.input.current_position();
355                    return Some(Ok(self.token(TokenKind::Variable, buffer, start, end)));
356                }
357
358                let mut document_label: &[u8] = &[];
359
360                let (token_kind, len) = match self.input.read(3) {
361                    [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
362                    [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
363                    [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
364                    [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
365                    [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
366                    [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
367                    [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
368                    [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
369                    [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
370                    [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
371                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
372
373                        document_label = self.input.peek(3 + whitespaces, label_length);
374
375                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
376                    }
377                    [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
378                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
379
380                        document_label = self.input.peek(4 + whitespaces, label_length);
381
382                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
383                    }
384                    [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
385                        let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
386
387                        document_label = self.input.peek(4 + whitespaces, label_length);
388
389                        (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
390                    }
391                    [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
392                    [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
393                    [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
394                    [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
395                    [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
396                    [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
397                    [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
398                    [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
399                    [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
400                    [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
401                    [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
402                    [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
403                    [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
404                    [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
405                    [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
406                    [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
407                    [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
408                    [b'>', b'>', ..] => (TokenKind::RightShift, 2),
409                    [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
410                    [b':', b':', ..] => (TokenKind::ColonColon, 2),
411                    [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
412                    [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
413                    [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
414                    [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
415                    [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
416                    [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
417                    [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
418                    [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
419                    [b'/', b'/', ..] => {
420                        let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
421                        let comment_len = scan_single_line_comment(remaining);
422                        (TokenKind::SingleLineComment, 2 + comment_len)
423                    }
424                    [b'/', b'*', asterisk] => {
425                        let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
426                        match scan_multi_line_comment(remaining) {
427                            Some(len) => {
428                                let is_docblock = asterisk == &b'*' && len > 2;
429                                if is_docblock {
430                                    (TokenKind::DocBlockComment, len + 2)
431                                } else {
432                                    (TokenKind::MultiLineComment, len + 2)
433                                }
434                            }
435                            None => {
436                                self.input.consume(remaining.len() + 2);
437                                return Some(Err(SyntaxError::UnexpectedEndOfFile(
438                                    self.file_id(),
439                                    self.input.current_position(),
440                                )));
441                            }
442                        }
443                    }
444                    [b'\\', start_of_identifier!(), ..] => {
445                        let mut length = 1;
446                        loop {
447                            let (ident_len, ends_with_ns) = self.input.scan_identifier(length);
448                            length += ident_len;
449                            if ends_with_ns {
450                                length += 1; // Include the backslash
451                            } else {
452                                break;
453                            }
454                        }
455
456                        (TokenKind::FullyQualifiedIdentifier, length)
457                    }
458                    [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
459                    [b'$', ..] => (TokenKind::Dollar, 1),
460                    [b'!', ..] => (TokenKind::Bang, 1),
461                    [b'&', ..] => (TokenKind::Ampersand, 1),
462                    [b'?', ..] => (TokenKind::Question, 1),
463                    [b'=', ..] => (TokenKind::Equal, 1),
464                    [b'`', ..] => (TokenKind::Backtick, 1),
465                    [b'+', ..] => (TokenKind::Plus, 1),
466                    [b'%', ..] => (TokenKind::Percent, 1),
467                    [b'-', ..] => (TokenKind::Minus, 1),
468                    [b'<', ..] => (TokenKind::LessThan, 1),
469                    [b'>', ..] => (TokenKind::GreaterThan, 1),
470                    [b':', ..] => (TokenKind::Colon, 1),
471                    [b'|', ..] => (TokenKind::Pipe, 1),
472                    [b'^', ..] => (TokenKind::Caret, 1),
473                    [b'*', ..] => (TokenKind::Asterisk, 1),
474                    [b'/', ..] => (TokenKind::Slash, 1),
475                    [quote @ b'\'', ..] => read_literal_string(&self.input, *quote),
476                    [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
477                        read_literal_string(&self.input, *quote)
478                    }
479                    [b'"', ..] => (TokenKind::DoubleQuote, 1),
480                    [b'(', ..] => 'parenthesis: {
481                        let mut peek_offset = 1;
482                        while let Some(&b) = self.input.read(peek_offset + 1).get(peek_offset) {
483                            if b.is_ascii_whitespace() {
484                                peek_offset += 1;
485                            } else {
486                                // Check if this byte could start a cast type (case-insensitive)
487                                let lower = b | 0x20; // ASCII lowercase
488                                if !matches!(lower, b'i' | b'b' | b'f' | b'd' | b'r' | b's' | b'a' | b'o' | b'u' | b'v')
489                                {
490                                    break 'parenthesis (TokenKind::LeftParenthesis, 1);
491                                }
492                                break;
493                            }
494                        }
495
496                        for (value, kind) in internal::consts::CAST_TYPES {
497                            if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
498                                break 'parenthesis (kind, length);
499                            }
500                        }
501
502                        (TokenKind::LeftParenthesis, 1)
503                    }
504                    [b'#', ..] => {
505                        let remaining = self.input.peek(1, self.input.len() - self.input.current_offset());
506                        let comment_len = scan_single_line_comment(remaining);
507                        (TokenKind::HashComment, 1 + comment_len)
508                    }
509                    [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
510                    [b'.', start_of_number!(), ..] => {
511                        let mut length = read_digits_of_base(&self.input, 2, 10);
512                        if let float_exponent!() = self.input.peek(length, 1) {
513                            let mut exp_length = length + 1;
514                            if let number_sign!() = self.input.peek(exp_length, 1) {
515                                exp_length += 1;
516                            }
517
518                            let after_exp = read_digits_of_base(&self.input, exp_length, 10);
519                            if after_exp > exp_length {
520                                length = after_exp;
521                            }
522                        }
523
524                        (TokenKind::LiteralFloat, length)
525                    }
526                    [start_of_number!(), ..] => 'number: {
527                        let mut length = 1;
528
529                        let (base, kind): (u8, NumberKind) = match self.input.read(3) {
530                            start_of_binary_number!() => {
531                                length += 1;
532
533                                (2, NumberKind::Integer)
534                            }
535                            start_of_octal_number!() => {
536                                length += 1;
537
538                                (8, NumberKind::Integer)
539                            }
540                            start_of_hexadecimal_number!() => {
541                                length += 1;
542
543                                (16, NumberKind::Integer)
544                            }
545                            start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
546                            start_of_float_number!() => (10, NumberKind::Float),
547                            _ => (10, NumberKind::IntegerOrFloat),
548                        };
549
550                        if kind != NumberKind::Float {
551                            length = read_digits_of_base(&self.input, length, base);
552
553                            if kind == NumberKind::Integer {
554                                break 'number (TokenKind::LiteralInteger, length);
555                            }
556                        }
557
558                        let is_float = matches!(self.input.peek(length, 3), float_separator!());
559
560                        if !is_float {
561                            break 'number (TokenKind::LiteralInteger, length);
562                        }
563
564                        if let [b'.'] = self.input.peek(length, 1) {
565                            length += 1;
566                            length = read_digits_of_base(&self.input, length, 10);
567                        }
568
569                        if let float_exponent!() = self.input.peek(length, 1) {
570                            // Only include exponent if there are digits after it
571                            let mut exp_length = length + 1;
572                            if let number_sign!() = self.input.peek(exp_length, 1) {
573                                exp_length += 1;
574                            }
575                            let after_exp = read_digits_of_base(&self.input, exp_length, 10);
576                            if after_exp > exp_length {
577                                // There are digits after the exponent marker
578                                length = after_exp;
579                            }
580                        }
581
582                        (TokenKind::LiteralFloat, length)
583                    }
584                    [b'.', ..] => (TokenKind::Dot, 1),
585                    [unknown_byte, ..] => {
586                        let position = self.input.current_position();
587                        self.input.consume(1);
588
589                        return Some(Err(SyntaxError::UnrecognizedToken(self.file_id(), *unknown_byte, position)));
590                    }
591                    [] => {
592                        // we check for EOF before entering scripting section,
593                        // so this should be unreachable.
594                        unreachable!()
595                    }
596                };
597
598                self.mode = match token_kind {
599                    TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
600                    TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
601                    TokenKind::CloseTag => LexerMode::Inline,
602                    TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
603                    TokenKind::DocumentStart(document_kind) => {
604                        LexerMode::DocumentString(document_kind, document_label, Interpolation::None)
605                    }
606                    _ => LexerMode::Script,
607                };
608
609                let buffer = self.input.consume(len);
610                let end = self.input.current_position();
611
612                Some(Ok(self.token(token_kind, buffer, start, end)))
613            }
614            LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
615                Interpolation::None => {
616                    let start = self.input.current_position();
617
618                    let mut length = 0;
619                    let mut last_was_slash = false;
620                    let mut token_kind = TokenKind::StringPart;
621                    loop {
622                        match self.input.peek(length, 2) {
623                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
624                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
625
626                                self.mode =
627                                    LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
628
629                                break;
630                            }
631                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
632                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
633
634                                self.mode = LexerMode::DoubleQuoteString(Interpolation::BraceUntil(
635                                    start.offset + until_offset,
636                                ));
637
638                                break;
639                            }
640                            [b'\\', ..] => {
641                                length += 1;
642
643                                last_was_slash = !last_was_slash;
644                            }
645                            [b'"', ..] if !last_was_slash => {
646                                if length == 0 {
647                                    length += 1;
648                                    token_kind = TokenKind::DoubleQuote;
649
650                                    break;
651                                }
652
653                                break;
654                            }
655                            [_, ..] => {
656                                length += 1;
657                                last_was_slash = false;
658                            }
659                            [] => {
660                                break;
661                            }
662                        }
663                    }
664
665                    let buffer = self.input.consume(length);
666                    let end = self.input.current_position();
667
668                    if TokenKind::DoubleQuote == token_kind {
669                        self.mode = LexerMode::Script;
670                    }
671
672                    Some(Ok(self.token(token_kind, buffer, start, end)))
673                }
674                Interpolation::Until(offset) => {
675                    self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None), false)
676                }
677                Interpolation::BraceUntil(offset) => {
678                    self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None), true)
679                }
680            },
681            LexerMode::ShellExecuteString(interpolation) => match &interpolation {
682                Interpolation::None => {
683                    let start = self.input.current_position();
684
685                    let mut length = 0;
686                    let mut last_was_slash = false;
687                    let mut token_kind = TokenKind::StringPart;
688                    loop {
689                        match self.input.peek(length, 2) {
690                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
691                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
692
693                                self.mode =
694                                    LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
695
696                                break;
697                            }
698                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
699                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
700
701                                self.mode = LexerMode::ShellExecuteString(Interpolation::BraceUntil(
702                                    start.offset + until_offset,
703                                ));
704
705                                break;
706                            }
707                            [b'\\', ..] => {
708                                length += 1;
709                                last_was_slash = true;
710                            }
711                            [b'`', ..] if !last_was_slash => {
712                                if length == 0 {
713                                    length += 1;
714                                    token_kind = TokenKind::Backtick;
715
716                                    break;
717                                }
718
719                                break;
720                            }
721                            [_, ..] => {
722                                length += 1;
723                                last_was_slash = false;
724                            }
725                            [] => {
726                                break;
727                            }
728                        }
729                    }
730
731                    let buffer = self.input.consume(length);
732                    let end = self.input.current_position();
733
734                    if TokenKind::Backtick == token_kind {
735                        self.mode = LexerMode::Script;
736                    }
737
738                    Some(Ok(self.token(token_kind, buffer, start, end)))
739                }
740                Interpolation::Until(offset) => {
741                    self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None), false)
742                }
743                Interpolation::BraceUntil(offset) => {
744                    self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None), true)
745                }
746            },
747            LexerMode::DocumentString(kind, label, interpolation) => match &kind {
748                DocumentKind::Heredoc => match &interpolation {
749                    Interpolation::None => {
750                        let start = self.input.current_position();
751
752                        let mut length = 0;
753                        let mut last_was_slash = false;
754                        let mut only_whitespaces = true;
755                        let mut token_kind = TokenKind::StringPart;
756                        loop {
757                            match self.input.peek(length, 2) {
758                                [b'\r', b'\n'] => {
759                                    length += 2;
760
761                                    break;
762                                }
763                                [b'\n' | b'\r', ..] => {
764                                    length += 1;
765
766                                    break;
767                                }
768                                [byte, ..] if byte.is_ascii_whitespace() => {
769                                    length += 1;
770                                }
771                                [b'$', start_of_identifier!(), ..] if !last_was_slash => {
772                                    let until_offset =
773                                        read_until_end_of_variable_interpolation(&self.input, length + 2);
774
775                                    self.mode = LexerMode::DocumentString(
776                                        kind,
777                                        label,
778                                        Interpolation::Until(start.offset + until_offset),
779                                    );
780
781                                    break;
782                                }
783                                [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
784                                    let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
785
786                                    self.mode = LexerMode::DocumentString(
787                                        kind,
788                                        label,
789                                        Interpolation::BraceUntil(start.offset + until_offset),
790                                    );
791
792                                    break;
793                                }
794                                [b'\\', ..] => {
795                                    length += 1;
796                                    last_was_slash = true;
797                                    only_whitespaces = false;
798                                }
799                                [_, ..] => {
800                                    if only_whitespaces
801                                        && self.input.peek(length, label.len()) == label
802                                        && self
803                                            .input
804                                            .peek(length + label.len(), 1)
805                                            .first()
806                                            .is_none_or(|c| !c.is_ascii_alphanumeric())
807                                    {
808                                        length += label.len();
809                                        token_kind = TokenKind::DocumentEnd;
810
811                                        break;
812                                    }
813
814                                    length += 1;
815                                    last_was_slash = false;
816                                    only_whitespaces = false;
817                                }
818                                [] => {
819                                    break;
820                                }
821                            }
822                        }
823
824                        let buffer = self.input.consume(length);
825                        let end = self.input.current_position();
826
827                        if TokenKind::DocumentEnd == token_kind {
828                            self.mode = LexerMode::Script;
829                        }
830
831                        Some(Ok(self.token(token_kind, buffer, start, end)))
832                    }
833                    Interpolation::Until(offset) => {
834                        self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None), false)
835                    }
836                    Interpolation::BraceUntil(offset) => {
837                        self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None), true)
838                    }
839                },
840                DocumentKind::Nowdoc => {
841                    let start = self.input.current_position();
842
843                    let mut length = 0;
844                    let mut terminated = false;
845                    let mut only_whitespaces = true;
846
847                    loop {
848                        match self.input.peek(length, 2) {
849                            [b'\r', b'\n'] => {
850                                length += 2;
851
852                                break;
853                            }
854                            [b'\n' | b'\r', ..] => {
855                                length += 1;
856
857                                break;
858                            }
859                            [byte, ..] if byte.is_ascii_whitespace() => {
860                                length += 1;
861                            }
862                            [_, ..] => {
863                                if only_whitespaces
864                                    && self.input.peek(length, label.len()) == label
865                                    && self
866                                        .input
867                                        .peek(length + label.len(), 1)
868                                        .first()
869                                        .is_none_or(|c| !c.is_ascii_alphanumeric())
870                                {
871                                    length += label.len();
872                                    terminated = true;
873
874                                    break;
875                                }
876
877                                only_whitespaces = false;
878                                length += 1;
879                            }
880                            [] => {
881                                break;
882                            }
883                        }
884                    }
885
886                    let buffer = self.input.consume(length);
887                    let end = self.input.current_position();
888
889                    if terminated {
890                        self.mode = LexerMode::Script;
891
892                        return Some(Ok(self.token(TokenKind::DocumentEnd, buffer, start, end)));
893                    }
894
895                    Some(Ok(self.token(TokenKind::StringPart, buffer, start, end)))
896                }
897            },
898            LexerMode::Halt(stage) => 'halt: {
899                let start = self.input.current_position();
900                if let HaltStage::End = stage {
901                    let buffer = self.input.consume_remaining();
902                    let end = self.input.current_position();
903
904                    break 'halt Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
905                }
906
907                let whitespaces = self.input.consume_whitespaces();
908                if !whitespaces.is_empty() {
909                    let end = self.input.current_position();
910
911                    break 'halt Some(Ok(self.token(TokenKind::Whitespace, whitespaces, start, end)));
912                }
913
914                match &stage {
915                    HaltStage::LookingForLeftParenthesis => {
916                        if self.input.is_at(b"(", false) {
917                            let buffer = self.input.consume(1);
918                            let end = self.input.current_position();
919
920                            self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
921
922                            Some(Ok(self.token(TokenKind::LeftParenthesis, buffer, start, end)))
923                        } else {
924                            let byte = self.input.read(1)[0];
925                            let position = self.input.current_position();
926                            // Consume the unexpected byte to avoid infinite loops
927                            self.input.consume(1);
928                            Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
929                        }
930                    }
931                    HaltStage::LookingForRightParenthesis => {
932                        if self.input.is_at(b")", false) {
933                            let buffer = self.input.consume(1);
934                            let end = self.input.current_position();
935
936                            self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
937
938                            Some(Ok(self.token(TokenKind::RightParenthesis, buffer, start, end)))
939                        } else {
940                            let byte = self.input.read(1)[0];
941                            let position = self.input.current_position();
942                            self.input.consume(1);
943                            Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
944                        }
945                    }
946                    HaltStage::LookingForTerminator => {
947                        if self.input.is_at(b";", false) {
948                            let buffer = self.input.consume(1);
949                            let end = self.input.current_position();
950
951                            self.mode = LexerMode::Halt(HaltStage::End);
952
953                            Some(Ok(self.token(TokenKind::Semicolon, buffer, start, end)))
954                        } else if self.input.is_at(b"?>", false) {
955                            let buffer = self.input.consume(2);
956                            let end = self.input.current_position();
957
958                            self.mode = LexerMode::Halt(HaltStage::End);
959
960                            Some(Ok(self.token(TokenKind::CloseTag, buffer, start, end)))
961                        } else {
962                            let byte = self.input.read(1)[0];
963                            let position = self.input.current_position();
964                            self.input.consume(1);
965                            Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
966                        }
967                    }
968                    _ => unreachable!(),
969                }
970            }
971        }
972    }
973
974    /// Fast path for scanning identifiers and keywords.
975    /// Called when we know the first byte is an identifier start character.
976    /// Returns (TokenKind, length) to allow proper mode switching.
977    #[inline]
978    fn scan_identifier_or_keyword_info(&self) -> (TokenKind, usize) {
979        let (mut length, ended_with_slash) = self.input.scan_identifier(0);
980
981        if !ended_with_slash {
982            match length {
983                6 if self.input.is_at(b"public(set)", true) => {
984                    return (TokenKind::PublicSet, 11);
985                }
986                7 if self.input.is_at(b"private(set)", true) => {
987                    return (TokenKind::PrivateSet, 12);
988                }
989                9 if self.input.is_at(b"protected(set)", true) => {
990                    return (TokenKind::ProtectedSet, 14);
991                }
992                _ => {}
993            }
994        }
995
996        if !ended_with_slash && let Some(kind) = internal::keyword::lookup_keyword(self.input.read(length)) {
997            return (kind, length);
998        }
999
1000        let mut slashes = 0;
1001        let mut last_was_slash = false;
1002        loop {
1003            match self.input.peek(length, 1) {
1004                [b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF] if last_was_slash => {
1005                    length += 1;
1006                    last_was_slash = false;
1007                }
1008                [b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | 0x80..=0xFF] if !last_was_slash => {
1009                    length += 1;
1010                }
1011                [b'\\'] if !self.interpolating || self.brace_interpolating => {
1012                    if last_was_slash {
1013                        length -= 1;
1014                        slashes -= 1;
1015                        last_was_slash = false;
1016                        break;
1017                    }
1018
1019                    length += 1;
1020                    slashes += 1;
1021                    last_was_slash = true;
1022                }
1023                _ => {
1024                    break;
1025                }
1026            }
1027        }
1028
1029        if last_was_slash {
1030            length -= 1;
1031            slashes -= 1;
1032        }
1033
1034        let kind = if slashes > 0 { TokenKind::QualifiedIdentifier } else { TokenKind::Identifier };
1035
1036        (kind, length)
1037    }
1038
1039    #[inline]
1040    fn token(&self, kind: TokenKind, v: &'input [u8], start: Position, _end: Position) -> Token<'input> {
1041        // SAFETY: The input bytes are guaranteed to be valid UTF-8 because:
1042        // 1. File contents are validated via simdutf8 during database loading
1043        // 2. Invalid UTF-8 is converted lossily before reaching the lexer
1044        // 3. All byte slices here are subslices of the validated input
1045        let value = unsafe { std::str::from_utf8_unchecked(v) };
1046
1047        Token { kind, start, value }
1048    }
1049
1050    #[inline]
1051    fn interpolation(
1052        &mut self,
1053        end_offset: u32,
1054        post_interpolation_mode: LexerMode<'input>,
1055        brace: bool,
1056    ) -> Option<Result<Token<'input>, SyntaxError>> {
1057        self.mode = LexerMode::Script;
1058
1059        let was_interpolating = self.interpolating;
1060        self.interpolating = true;
1061        let was_brace_interpolating = self.brace_interpolating;
1062        // For brace interpolation ({$...}), allow qualified identifiers with backslashes.
1063        self.brace_interpolating = brace;
1064
1065        loop {
1066            let subsequent_token = self.advance()?.ok()?;
1067            // Check if this token contains the end offset
1068            let token_start = subsequent_token.start.offset;
1069            let token_end = token_start + subsequent_token.value.len() as u32;
1070            let is_final_token = token_start <= end_offset && end_offset <= token_end;
1071
1072            self.buffer.push_back(subsequent_token);
1073
1074            if is_final_token {
1075                break;
1076            }
1077        }
1078
1079        self.mode = post_interpolation_mode;
1080        self.interpolating = was_interpolating;
1081        self.brace_interpolating = was_brace_interpolating;
1082
1083        self.advance()
1084    }
1085}
1086
1087impl HasFileId for Lexer<'_> {
1088    #[inline]
1089    fn file_id(&self) -> FileId {
1090        self.input.file_id()
1091    }
1092}
1093
1094#[inline]
1095fn matches_start_of_heredoc_document(input: &Input) -> bool {
1096    let total = input.len();
1097    let base = input.current_offset();
1098
1099    // Start after the fixed opener (3 bytes).
1100    let mut length = 3;
1101    // Consume any following whitespace.
1102    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1103        length += 1;
1104    }
1105
1106    // The next byte must be a valid start-of-identifier.
1107    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1108        return false;
1109    }
1110    length += 1; // Include that identifier start.
1111
1112    // Now continue reading identifier characters until a newline is found.
1113    loop {
1114        let pos = base + length;
1115        if pos >= total {
1116            return false; // Unexpected EOF
1117        }
1118
1119        let byte = *input.read_at(pos);
1120        if byte == b'\n' {
1121            return true; // Newline found: valid heredoc opener.
1122        } else if byte == b'\r' {
1123            // Handle CRLF: treat '\r' followed by '\n' as a newline as well.
1124            return pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1125        } else if is_part_of_identifier(input.read_at(pos)) {
1126            length += 1;
1127        } else {
1128            return false; // Unexpected character.
1129        }
1130    }
1131}
1132
1133#[inline]
1134fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1135    let total = input.len();
1136    let base = input.current_offset();
1137
1138    // Start after the fixed opener (3 bytes), then skip any whitespace.
1139    let mut length = 3;
1140    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1141        length += 1;
1142    }
1143
1144    // Next, expect an opening double quote.
1145    if base + length >= total || *input.read_at(base + length) != b'"' {
1146        return false;
1147    }
1148    length += 1;
1149
1150    // The following byte must be a valid start-of-identifier.
1151    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1152        return false;
1153    }
1154    length += 1;
1155
1156    // Now scan the label. For double‑quoted heredoc, a terminating double quote is required.
1157    let mut terminated = false;
1158    loop {
1159        let pos = base + length;
1160        if pos >= total {
1161            return false;
1162        }
1163        let byte = input.read_at(pos);
1164        if *byte == b'\n' {
1165            // End of line: valid only if a closing double quote was encountered.
1166            return terminated;
1167        } else if *byte == b'\r' {
1168            // Handle CRLF sequences.
1169            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1170        } else if !terminated && is_part_of_identifier(byte) {
1171            length += 1;
1172        } else if !terminated && *byte == b'"' {
1173            terminated = true;
1174            length += 1;
1175        } else {
1176            return false;
1177        }
1178    }
1179}
1180
1181#[inline]
1182fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1183    let total = input.len();
1184    let base = input.current_offset();
1185
1186    // Start after the fixed opener (3 bytes) and skip whitespace.
1187    let mut length = 3;
1188    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1189        length += 1;
1190    }
1191
1192    // Now, the next byte must be a single quote.
1193    if base + length >= total || *input.read_at(base + length) != b'\'' {
1194        return false;
1195    }
1196    length += 1;
1197
1198    // The following byte must be a valid start-of-identifier.
1199    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1200        return false;
1201    }
1202    length += 1;
1203
1204    // Read the label until a newline. A terminating single quote is required.
1205    let mut terminated = false;
1206    loop {
1207        let pos = base + length;
1208        if pos >= total {
1209            return false;
1210        }
1211        let byte = *input.read_at(pos);
1212        if byte == b'\n' {
1213            return terminated;
1214        } else if byte == b'\r' {
1215            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1216        } else if !terminated && is_part_of_identifier(&byte) {
1217            length += 1;
1218        } else if !terminated && byte == b'\'' {
1219            terminated = true;
1220            length += 1;
1221        } else {
1222            return false;
1223        }
1224    }
1225}
1226
1227#[inline]
1228fn matches_literal_double_quote_string(input: &Input) -> bool {
1229    let total = input.len();
1230    let base = input.current_offset();
1231
1232    // Start after the initial double-quote (assumed consumed).
1233    let mut pos = base + 1;
1234    loop {
1235        if pos >= total {
1236            // Reached EOF: assume literal is complete.
1237            return true;
1238        }
1239        let byte = *input.read_at(pos);
1240        if byte == b'"' {
1241            // Encounter a closing double quote.
1242            return true;
1243        } else if byte == b'\\' {
1244            // Skip an escape sequence: assume that the backslash and the escaped character form a pair.
1245            pos += 2;
1246            continue;
1247        }
1248
1249        // Check for variable interpolation or complex expression start:
1250        // If two-byte sequences match either "$" followed by a start-of-identifier or "{" and "$", then return false.
1251        if pos + 1 < total {
1252            let next = *input.read_at(pos + 1);
1253            if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1254                return false;
1255            }
1256        }
1257        pos += 1;
1258    }
1259}
1260
1261#[inline]
1262fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1263    let total = input.len();
1264    let base = input.current_offset();
1265
1266    // Start reading at offset base+3 (the fixed opener length).
1267    let mut pos = base + 3;
1268    let mut whitespaces = 0;
1269    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1270        whitespaces += 1;
1271        pos += 1;
1272    }
1273
1274    // The label (or delimiter) starts after:
1275    //   3 bytes + whitespace bytes + an extra offset:
1276    //      if double-quoted: 2 bytes (opening and closing quotes around the label)
1277    //      else: 1 byte.
1278    let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1279
1280    let mut label_length = 1; // Start with at least one byte for the label.
1281    let mut terminated = false; // For double-quoted heredoc, to track the closing quote.
1282    loop {
1283        let pos = base + length;
1284        // Ensure we haven't run past the input.
1285        if pos >= total {
1286            unreachable!("Unexpected end of input while reading heredoc label");
1287        }
1288
1289        let byte = *input.read_at(pos);
1290        if byte == b'\n' {
1291            // Newline ends the label.
1292            length += 1;
1293            return (length, whitespaces, label_length);
1294        } else if byte == b'\r' {
1295            // Handle CRLF sequences
1296            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1297                length += 2;
1298            } else {
1299                length += 1;
1300            }
1301            return (length, whitespaces, label_length);
1302        } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1303            // For both unquoted and double-quoted (before the closing quote) heredoc,
1304            // a valid identifier character is part of the label.
1305            length += 1;
1306            label_length += 1;
1307        } else if double_quoted && !terminated && byte == b'"' {
1308            // In a double-quoted heredoc, a double quote terminates the label.
1309            length += 1;
1310            terminated = true;
1311        } else {
1312            unreachable!("Unexpected character encountered in heredoc label");
1313        }
1314    }
1315}
1316
1317#[inline]
1318fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1319    let total = input.len();
1320    let base = input.current_offset();
1321
1322    let mut pos = base + 3;
1323    let mut whitespaces = 0;
1324    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1325        whitespaces += 1;
1326        pos += 1;
1327    }
1328
1329    // For nowdoc, the fixed extra offset is always 2.
1330    let mut length = 3 + whitespaces + 2;
1331
1332    let mut label_length = 1;
1333    let mut terminated = false;
1334    loop {
1335        let pos = base + length;
1336        if pos >= total {
1337            unreachable!("Unexpected end of input while reading nowdoc label");
1338        }
1339        let byte = *input.read_at(pos);
1340
1341        if byte == b'\n' {
1342            // A newline indicates the end of the label.
1343            length += 1;
1344            return (length, whitespaces, label_length);
1345        } else if byte == b'\r' {
1346            // Handle CRLF sequences
1347            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1348                length += 2;
1349            } else {
1350                length += 1;
1351            }
1352            return (length, whitespaces, label_length);
1353        } else if is_part_of_identifier(&byte) && !terminated {
1354            // For nowdoc, identifier characters contribute to the label until terminated.
1355            length += 1;
1356            label_length += 1;
1357        } else if !terminated && byte == b'\'' {
1358            // A single quote terminates the nowdoc label.
1359            length += 1;
1360            terminated = true;
1361        } else {
1362            unreachable!("Unexpected character encountered in nowdoc label");
1363        }
1364    }
1365}
1366
1367#[inline]
1368fn read_literal_string(input: &Input, quote: u8) -> (TokenKind, usize) {
1369    let total = input.len();
1370    let start = input.current_offset();
1371    let mut length = 1; // We assume the opening quote is already consumed.
1372
1373    let bytes = input.peek(length, total - start - length);
1374    loop {
1375        match memchr2(quote, b'\\', &bytes[length - 1..]) {
1376            Some(pos) => {
1377                let abs_pos = length - 1 + pos;
1378                let byte = bytes[abs_pos];
1379
1380                if byte == b'\\' {
1381                    length = abs_pos + 2 + 1; // +1 because bytes starts at offset 1
1382                    if length > total - start {
1383                        return (TokenKind::PartialLiteralString, total - start);
1384                    }
1385                } else {
1386                    length = abs_pos + 2; // +1 for the quote, +1 because bytes starts at offset 1
1387                    return (TokenKind::LiteralString, length);
1388                }
1389            }
1390            None => {
1391                // No quote or backslash found - EOF
1392                return (TokenKind::PartialLiteralString, total - start);
1393            }
1394        }
1395    }
1396}
1397
1398#[inline]
1399fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> u32 {
1400    let total = input.len();
1401    let base = input.current_offset();
1402    // `offset` is relative to the current position.
1403    let mut offset = from;
1404
1405    loop {
1406        let abs = base + offset;
1407        if abs >= total {
1408            // End of input.
1409            break;
1410        }
1411
1412        // Pattern 1: If the current byte is part of an identifier, simply advance.
1413        if is_part_of_identifier(input.read_at(abs)) {
1414            offset += 1;
1415            continue;
1416        }
1417
1418        // Pattern 2: If the current byte is a '[' then we enter a bracketed interpolation.
1419        if *input.read_at(abs) == b'[' {
1420            offset += 1;
1421            let mut nesting = 0;
1422            loop {
1423                let abs_inner = base + offset;
1424                if abs_inner >= total {
1425                    break;
1426                }
1427                let b = input.read_at(abs_inner);
1428                if *b == b']' {
1429                    offset += 1;
1430                    if nesting == 0 {
1431                        break;
1432                    }
1433
1434                    nesting -= 1;
1435                } else if *b == b'[' {
1436                    offset += 1;
1437                    nesting += 1;
1438                } else if b.is_ascii_whitespace() {
1439                    // Do not include whitespace.
1440                    break;
1441                } else {
1442                    offset += 1;
1443                }
1444            }
1445            // When bracketed interpolation is processed, exit the loop.
1446            break;
1447        }
1448
1449        // Pattern 3: Check for "->" followed by a valid identifier start.
1450        if base + offset + 2 < total
1451            && *input.read_at(abs) == b'-'
1452            && *input.read_at(base + offset + 1) == b'>'
1453            && is_start_of_identifier(input.read_at(base + offset + 2))
1454        {
1455            offset += 3;
1456            // Consume any following identifier characters.
1457            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1458                offset += 1;
1459            }
1460            break;
1461        }
1462
1463        // Pattern 4: Check for "?->" followed by a valid identifier start.
1464        if base + offset + 3 < total
1465            && *input.read_at(abs) == b'?'
1466            && *input.read_at(base + offset + 1) == b'-'
1467            && *input.read_at(base + offset + 2) == b'>'
1468            && is_start_of_identifier(input.read_at(base + offset + 3))
1469        {
1470            offset += 4;
1471            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1472                offset += 1;
1473            }
1474            break;
1475        }
1476
1477        // None of the expected patterns matched: exit the loop.
1478        break;
1479    }
1480
1481    offset as u32
1482}
1483
1484#[inline]
1485fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> u32 {
1486    let total = input.len();
1487    let base = input.current_offset();
1488    let mut offset = from;
1489    let mut nesting = 0;
1490
1491    loop {
1492        let abs = base + offset;
1493        if abs >= total {
1494            break;
1495        }
1496        match input.read_at(abs) {
1497            b'}' => {
1498                offset += 1;
1499                if nesting == 0 {
1500                    break;
1501                }
1502
1503                nesting -= 1;
1504            }
1505            b'{' => {
1506                offset += 1;
1507                nesting += 1;
1508            }
1509            _ => {
1510                offset += 1;
1511            }
1512        }
1513    }
1514
1515    offset as u32
1516}
1517
1518/// Scan a multi-line comment using SIMD-accelerated search.
1519/// Returns Some(length) including the closing */, or None if unterminated.
1520#[inline]
1521fn scan_multi_line_comment(bytes: &[u8]) -> Option<usize> {
1522    // Use SIMD to find */ quickly
1523    memmem::find(bytes, b"*/").map(|pos| pos + 2)
1524}
1525
1526/// Scan a single-line comment using SIMD-accelerated search.
1527/// Returns the length of the comment body (not including the //).
1528/// Stops at newline or ?>.
1529#[inline]
1530fn scan_single_line_comment(bytes: &[u8]) -> usize {
1531    let mut pos = 0;
1532    while pos < bytes.len() {
1533        match memchr::memchr3(b'\n', b'\r', b'?', &bytes[pos..]) {
1534            Some(offset) => {
1535                let found_pos = pos + offset;
1536                match bytes[found_pos] {
1537                    b'\n' | b'\r' => return found_pos,
1538                    b'?' => {
1539                        // Check if it's ?>
1540                        if found_pos + 1 < bytes.len() && bytes[found_pos + 1] == b'>' {
1541                            // Also check for whitespace before ?>
1542                            if found_pos > 0 && bytes[found_pos - 1].is_ascii_whitespace() {
1543                                return found_pos - 1;
1544                            }
1545                            return found_pos;
1546                        }
1547                        // Not ?>, continue searching
1548                        pos = found_pos + 1;
1549                    }
1550                    _ => unreachable!(),
1551                }
1552            }
1553            None => return bytes.len(),
1554        }
1555    }
1556
1557    bytes.len()
1558}
mago_syntax/lexer/mod.rs

mago_syntax/lexer/
mod.rs