mago_syntax/lexer/
mod.rs

1use std::collections::VecDeque;
2use std::fmt::Debug;
3use std::hint::unreachable_unchecked;
4
5use memchr::memchr2;
6use memchr::memmem;
7
8/// Lookup table for single-character tokens that are ALWAYS single-char
9/// (i.e., they can never be part of a multi-character token).
10/// Maps byte -> Option<TokenKind>
11const SIMPLE_TOKEN_TABLE: [Option<TokenKind>; 256] = {
12    let mut table: [Option<TokenKind>; 256] = [None; 256];
13    table[b';' as usize] = Some(TokenKind::Semicolon);
14    table[b',' as usize] = Some(TokenKind::Comma);
15    table[b')' as usize] = Some(TokenKind::RightParenthesis);
16    table[b'[' as usize] = Some(TokenKind::LeftBracket);
17    table[b']' as usize] = Some(TokenKind::RightBracket);
18    table[b'{' as usize] = Some(TokenKind::LeftBrace);
19    table[b'}' as usize] = Some(TokenKind::RightBrace);
20    table[b'~' as usize] = Some(TokenKind::Tilde);
21    table[b'@' as usize] = Some(TokenKind::At);
22    table
23};
24
/// Byte-indexed table answering "may this byte begin an identifier?".
///
/// An entry is `true` for ASCII letters (`a-z`, `A-Z`), the underscore, and
/// every byte with the high bit set (`0x80..=0xFF`); all other entries are
/// `false`.
const IDENT_START_TABLE: [bool; 256] = {
    let mut table = [false; 256];

    // `for` loops are not available in const evaluation, hence the manual index.
    let mut index = 0usize;
    while index < table.len() {
        // `index` is always below 256 here, so the cast cannot truncate.
        let byte = index as u8;
        table[index] = byte.is_ascii_alphabetic() || byte == b'_' || byte >= 0x80;
        index += 1;
    }

    table
};
36
37use mago_database::file::FileId;
38use mago_database::file::HasFileId;
39use mago_span::Position;
40use mago_syntax_core::float_exponent;
41use mago_syntax_core::float_separator;
42use mago_syntax_core::input::Input;
43use mago_syntax_core::number_sign;
44use mago_syntax_core::start_of_binary_number;
45use mago_syntax_core::start_of_float_number;
46use mago_syntax_core::start_of_hexadecimal_number;
47use mago_syntax_core::start_of_identifier;
48use mago_syntax_core::start_of_number;
49use mago_syntax_core::start_of_octal_number;
50use mago_syntax_core::start_of_octal_or_float_number;
51use mago_syntax_core::utils::is_part_of_identifier;
52use mago_syntax_core::utils::is_start_of_identifier;
53use mago_syntax_core::utils::read_digits_of_base;
54
55use crate::error::SyntaxError;
56use crate::lexer::internal::mode::HaltStage;
57use crate::lexer::internal::mode::Interpolation;
58use crate::lexer::internal::mode::LexerMode;
59use crate::lexer::internal::utils::NumberKind;
60use crate::settings::LexerSettings;
61use crate::token::DocumentKind;
62use crate::token::Token;
63use crate::token::TokenKind;
64
65mod internal;
66
/// The `Lexer` struct is responsible for tokenizing input source code into discrete tokens
/// based on PHP language syntax. It is designed to work with PHP code from version 7.0 up to 8.4.
///
/// The lexer reads through the provided input and processes it accordingly.
///
/// It identifies PHP-specific tokens, including operators, keywords, comments, strings, and other syntax elements,
/// and produces a sequence of [`Token`] objects that are used in further stages of compilation or interpretation.
///
/// The lexer is designed to be used in a streaming fashion, where it reads the input source code in chunks
/// and produces tokens incrementally. This allows for efficient processing of large source files and
/// minimizes memory usage.
#[derive(Debug)]
pub struct Lexer<'input> {
    /// The source being tokenized, together with the current read position.
    input: Input<'input>,
    /// Lexer configuration; for example, `enable_short_tags` decides whether a bare `<?`
    /// opens a script section while lexing inline text.
    settings: LexerSettings,
    /// The mode the lexer is currently in; `advance` dispatches on it to pick
    /// the tokenization rules for the next token.
    mode: LexerMode<'input>,
    /// While `true`, `advance` does not hand out tokens from `buffer`.
    interpolating: bool,
    /// Buffer for tokens during string interpolation.
    buffer: VecDeque<Token<'input>>,
}
87
88impl<'input> Lexer<'input> {
    /// Initial capacity for the token buffer used during string interpolation.
    ///
    /// Both constructors pre-allocate the buffer with this capacity, so it can hold
    /// at least this many tokens before it has to reallocate.
    const BUFFER_INITIAL_CAPACITY: usize = 8;
92
93    /// Creates a new `Lexer` instance.
94    ///
95    /// # Parameters
96    ///
97    /// - `input`: The input source code to tokenize.
98    /// - `settings`: The lexer settings.
99    ///
100    /// # Returns
101    ///
102    /// A new `Lexer` instance that reads from the provided byte slice.
103    pub fn new(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
104        Lexer {
105            input,
106            settings,
107            mode: LexerMode::Inline,
108            interpolating: false,
109            buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
110        }
111    }
112
113    /// Creates a new `Lexer` instance for parsing a script block.
114    ///
115    /// # Parameters
116    ///
117    /// - `input`: The input source code to tokenize.
118    /// - `settings`: The lexer settings.
119    ///
120    /// # Returns
121    ///
122    /// A new `Lexer` instance that reads from the provided byte slice.
123    pub fn scripting(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
124        Lexer {
125            input,
126            settings,
127            mode: LexerMode::Script,
128            interpolating: false,
129            buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
130        }
131    }
132
133    /// Check if the lexer has reached the end of the input.
134    ///
135    /// If this method returns `true`, the lexer will not produce any more tokens.
136    #[must_use]
137    pub fn has_reached_eof(&self) -> bool {
138        self.input.has_reached_eof()
139    }
140
141    /// Get the current position of the lexer in the input source code.
142    #[inline]
143    pub const fn current_position(&self) -> Position {
144        self.input.current_position()
145    }
146
147    /// Tokenizes the next input from the source code.
148    ///
149    /// This method reads from the input and produces the next [`Token`] based on the current [`LexerMode`].
150    /// It handles various lexical elements such as inline text, script code, strings with interpolation,
151    /// comments, and different PHP-specific constructs.
152    ///
153    /// # Returns
154    ///
155    /// - `Some(Ok(Token))` if a token was successfully parsed.
156    /// - `Some(Err(SyntaxError))` if a syntax error occurred while parsing the next token.
157    /// - `None` if the end of the input has been reached.
158    ///
159    /// # Notes
160    ///
161    /// - It efficiently handles tokenization by consuming input based on patterns specific to PHP syntax.
162    /// - The lexer supports complex features like string interpolation and different numeric formats.
163    ///
164    /// # Errors
165    ///
166    /// Returns `Some(Err(SyntaxError))` in cases such as:
167    ///
168    /// - Unrecognized tokens that do not match any known PHP syntax.
169    /// - Unexpected tokens in a given context, such as an unexpected end of string.
170    ///
171    /// # Panics
172    ///
173    /// This method should not panic under normal operation. If it does, it indicates a bug in the lexer implementation.
174    ///
175    /// # See Also
176    ///
177    /// - [`Token`]: Represents a lexical token with its kind, value, and span.
178    /// - [`SyntaxError`]: Represents errors that can occur during lexing.
179    #[inline]
180    pub fn advance(&mut self) -> Option<Result<Token<'input>, SyntaxError>> {
181        // Check if there are buffered tokens from string interpolation.
182        if !self.interpolating
183            && let Some(token) = self.buffer.pop_front()
184        {
185            return Some(Ok(token));
186        }
187
188        if self.input.has_reached_eof() {
189            return None;
190        }
191
192        match self.mode {
193            LexerMode::Inline => {
194                let start = self.input.current_position();
195                let offset = self.input.current_offset();
196
197                // Shebang is only valid at the absolute start of the file (offset 0).
198                if offset == 0
199                    && self.input.len() >= 2
200                    && unsafe { *self.input.read_at_unchecked(0) } == b'#'
201                    && unsafe { *self.input.read_at_unchecked(1) } == b'!'
202                {
203                    let buffer = self.input.consume_through(b'\n');
204                    let end = self.input.current_position();
205
206                    return Some(Ok(self.token(TokenKind::InlineShebang, buffer, start, end)));
207                }
208
209                // Get the remaining bytes to scan.
210                let bytes = self.input.read_remaining();
211
212                if self.settings.enable_short_tags {
213                    if let Some(pos) = memchr::memmem::find(bytes, b"<?") {
214                        if pos > 0 {
215                            let buffer = self.input.consume(pos);
216                            let end = self.input.current_position();
217
218                            return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
219                        }
220
221                        if self.input.is_at(b"<?php", true) {
222                            let buffer = self.input.consume(5);
223                            self.mode = LexerMode::Script;
224                            return Some(Ok(self.token(
225                                TokenKind::OpenTag,
226                                buffer,
227                                start,
228                                self.input.current_position(),
229                            )));
230                        }
231
232                        if self.input.is_at(b"<?=", false) {
233                            let buffer = self.input.consume(3);
234                            self.mode = LexerMode::Script;
235                            return Some(Ok(self.token(
236                                TokenKind::EchoTag,
237                                buffer,
238                                start,
239                                self.input.current_position(),
240                            )));
241                        }
242
243                        let buffer = self.input.consume(2);
244                        self.mode = LexerMode::Script;
245                        return Some(Ok(self.token(
246                            TokenKind::ShortOpenTag,
247                            buffer,
248                            start,
249                            self.input.current_position(),
250                        )));
251                    }
252                } else {
253                    let iter = memchr::memmem::find_iter(bytes, b"<?");
254
255                    for pos in iter {
256                        // SAFETY: `pos` is guaranteed to be within `bytes` by `find_iter`.
257                        let candidate = unsafe { bytes.get_unchecked(pos..) };
258
259                        if candidate.len() >= 5
260                            && (unsafe { *candidate.get_unchecked(2) } | 0x20) == b'p'
261                            && (unsafe { *candidate.get_unchecked(3) } | 0x20) == b'h'
262                            && (unsafe { *candidate.get_unchecked(4) } | 0x20) == b'p'
263                        {
264                            if pos > 0 {
265                                let buffer = self.input.consume(pos);
266                                let end = self.input.current_position();
267                                return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
268                            }
269
270                            let buffer = self.input.consume(5);
271                            self.mode = LexerMode::Script;
272                            return Some(Ok(self.token(
273                                TokenKind::OpenTag,
274                                buffer,
275                                start,
276                                self.input.current_position(),
277                            )));
278                        }
279
280                        if candidate.len() >= 3 && unsafe { *candidate.get_unchecked(2) } == b'=' {
281                            if pos > 0 {
282                                let buffer = self.input.consume(pos);
283                                let end = self.input.current_position();
284                                return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
285                            }
286
287                            let buffer = self.input.consume(3);
288                            self.mode = LexerMode::Script;
289                            return Some(Ok(self.token(
290                                TokenKind::EchoTag,
291                                buffer,
292                                start,
293                                self.input.current_position(),
294                            )));
295                        }
296                    }
297                }
298
299                if self.input.has_reached_eof() {
300                    return None;
301                }
302
303                let buffer = self.input.consume_remaining();
304                let end = self.input.current_position();
305                Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)))
306            }
307            LexerMode::Script => {
308                let start = self.input.current_position();
309                let whitespaces = self.input.consume_whitespaces();
310                if !whitespaces.is_empty() {
311                    return Some(Ok(self.token(
312                        TokenKind::Whitespace,
313                        whitespaces,
314                        start,
315                        self.input.current_position(),
316                    )));
317                }
318
319                let first_byte = match self.input.read(1).first() {
320                    Some(&b) => b,
321                    None => {
322                        // SAFETY: we check for EOF before entering scripting section,
323                        unsafe { unreachable_unchecked() }
324                    }
325                };
326
327                if let Some(kind) = SIMPLE_TOKEN_TABLE[first_byte as usize] {
328                    let buffer = self.input.consume(1);
329                    let end = self.input.current_position();
330                    return Some(Ok(self.token(kind, buffer, start, end)));
331                }
332
333                if IDENT_START_TABLE[first_byte as usize] {
334                    let (token_kind, len) = self.scan_identifier_or_keyword_info();
335
336                    if token_kind == TokenKind::HaltCompiler {
337                        self.mode = LexerMode::Halt(HaltStage::LookingForLeftParenthesis);
338                    }
339
340                    let buffer = self.input.consume(len);
341                    let end = self.input.current_position();
342                    return Some(Ok(self.token(token_kind, buffer, start, end)));
343                }
344
345                if first_byte == b'$'
346                    && let Some(&next) = self.input.read(2).get(1)
347                    && IDENT_START_TABLE[next as usize]
348                {
349                    let (ident_len, _) = self.input.scan_identifier(1);
350                    let buffer = self.input.consume(1 + ident_len);
351                    let end = self.input.current_position();
352                    return Some(Ok(self.token(TokenKind::Variable, buffer, start, end)));
353                }
354
355                let mut document_label: &[u8] = &[];
356
357                let (token_kind, len) = match self.input.read(3) {
358                    [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
359                    [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
360                    [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
361                    [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
362                    [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
363                    [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
364                    [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
365                    [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
366                    [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
367                    [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
368                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
369
370                        document_label = self.input.peek(3 + whitespaces, label_length);
371
372                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
373                    }
374                    [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
375                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
376
377                        document_label = self.input.peek(4 + whitespaces, label_length);
378
379                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
380                    }
381                    [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
382                        let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
383
384                        document_label = self.input.peek(4 + whitespaces, label_length);
385
386                        (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
387                    }
388                    [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
389                    [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
390                    [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
391                    [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
392                    [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
393                    [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
394                    [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
395                    [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
396                    [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
397                    [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
398                    [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
399                    [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
400                    [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
401                    [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
402                    [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
403                    [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
404                    [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
405                    [b'>', b'>', ..] => (TokenKind::RightShift, 2),
406                    [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
407                    [b':', b':', ..] => (TokenKind::ColonColon, 2),
408                    [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
409                    [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
410                    [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
411                    [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
412                    [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
413                    [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
414                    [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
415                    [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
416                    [b'/', b'/', ..] => {
417                        let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
418                        let comment_len = scan_single_line_comment(remaining);
419                        (TokenKind::SingleLineComment, 2 + comment_len)
420                    }
421                    [b'/', b'*', asterisk] => {
422                        let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
423                        match scan_multi_line_comment(remaining) {
424                            Some(len) => {
425                                let is_docblock = asterisk == &b'*' && len > 2;
426                                if is_docblock {
427                                    (TokenKind::DocBlockComment, len + 2)
428                                } else {
429                                    (TokenKind::MultiLineComment, len + 2)
430                                }
431                            }
432                            None => {
433                                self.input.consume(remaining.len() + 2);
434                                return Some(Err(SyntaxError::UnexpectedEndOfFile(
435                                    self.file_id(),
436                                    self.input.current_position(),
437                                )));
438                            }
439                        }
440                    }
441                    [b'\\', start_of_identifier!(), ..] => {
442                        let mut length = 1;
443                        loop {
444                            let (ident_len, ends_with_ns) = self.input.scan_identifier(length);
445                            length += ident_len;
446                            if ends_with_ns {
447                                length += 1; // Include the backslash
448                            } else {
449                                break;
450                            }
451                        }
452
453                        (TokenKind::FullyQualifiedIdentifier, length)
454                    }
455                    [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
456                    [b'$', ..] => (TokenKind::Dollar, 1),
457                    [b'!', ..] => (TokenKind::Bang, 1),
458                    [b'&', ..] => (TokenKind::Ampersand, 1),
459                    [b'?', ..] => (TokenKind::Question, 1),
460                    [b'=', ..] => (TokenKind::Equal, 1),
461                    [b'`', ..] => (TokenKind::Backtick, 1),
462                    [b'+', ..] => (TokenKind::Plus, 1),
463                    [b'%', ..] => (TokenKind::Percent, 1),
464                    [b'-', ..] => (TokenKind::Minus, 1),
465                    [b'<', ..] => (TokenKind::LessThan, 1),
466                    [b'>', ..] => (TokenKind::GreaterThan, 1),
467                    [b':', ..] => (TokenKind::Colon, 1),
468                    [b'|', ..] => (TokenKind::Pipe, 1),
469                    [b'^', ..] => (TokenKind::Caret, 1),
470                    [b'*', ..] => (TokenKind::Asterisk, 1),
471                    [b'/', ..] => (TokenKind::Slash, 1),
472                    [quote @ b'\'', ..] => read_literal_string(&self.input, *quote),
473                    [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
474                        read_literal_string(&self.input, *quote)
475                    }
476                    [b'"', ..] => (TokenKind::DoubleQuote, 1),
477                    [b'(', ..] => 'parenthesis: {
478                        let mut peek_offset = 1;
479                        while let Some(&b) = self.input.read(peek_offset + 1).get(peek_offset) {
480                            if b.is_ascii_whitespace() {
481                                peek_offset += 1;
482                            } else {
483                                // Check if this byte could start a cast type (case-insensitive)
484                                let lower = b | 0x20; // ASCII lowercase
485                                if !matches!(lower, b'i' | b'b' | b'f' | b'd' | b'r' | b's' | b'a' | b'o' | b'u' | b'v')
486                                {
487                                    break 'parenthesis (TokenKind::LeftParenthesis, 1);
488                                }
489                                break;
490                            }
491                        }
492
493                        for (value, kind) in internal::consts::CAST_TYPES {
494                            if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
495                                break 'parenthesis (kind, length);
496                            }
497                        }
498
499                        (TokenKind::LeftParenthesis, 1)
500                    }
501                    [b'#', ..] => {
502                        let remaining = self.input.peek(1, self.input.len() - self.input.current_offset());
503                        let comment_len = scan_single_line_comment(remaining);
504                        (TokenKind::HashComment, 1 + comment_len)
505                    }
506                    [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
507                    [b'.', start_of_number!(), ..] => {
508                        let mut length = read_digits_of_base(&self.input, 2, 10);
509                        if let float_exponent!() = self.input.peek(length, 1) {
510                            let mut exp_length = length + 1;
511                            if let number_sign!() = self.input.peek(exp_length, 1) {
512                                exp_length += 1;
513                            }
514
515                            let after_exp = read_digits_of_base(&self.input, exp_length, 10);
516                            if after_exp > exp_length {
517                                length = after_exp;
518                            }
519                        }
520
521                        (TokenKind::LiteralFloat, length)
522                    }
523                    [start_of_number!(), ..] => 'number: {
524                        let mut length = 1;
525
526                        let (base, kind): (u8, NumberKind) = match self.input.read(3) {
527                            start_of_binary_number!() => {
528                                length += 1;
529
530                                (2, NumberKind::Integer)
531                            }
532                            start_of_octal_number!() => {
533                                length += 1;
534
535                                (8, NumberKind::Integer)
536                            }
537                            start_of_hexadecimal_number!() => {
538                                length += 1;
539
540                                (16, NumberKind::Integer)
541                            }
542                            start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
543                            start_of_float_number!() => (10, NumberKind::Float),
544                            _ => (10, NumberKind::IntegerOrFloat),
545                        };
546
547                        if kind != NumberKind::Float {
548                            length = read_digits_of_base(&self.input, length, base);
549
550                            if kind == NumberKind::Integer {
551                                break 'number (TokenKind::LiteralInteger, length);
552                            }
553                        }
554
555                        let is_float = matches!(self.input.peek(length, 3), float_separator!());
556
557                        if !is_float {
558                            break 'number (TokenKind::LiteralInteger, length);
559                        }
560
561                        if let [b'.'] = self.input.peek(length, 1) {
562                            length += 1;
563                            length = read_digits_of_base(&self.input, length, 10);
564                        }
565
566                        if let float_exponent!() = self.input.peek(length, 1) {
567                            // Only include exponent if there are digits after it
568                            let mut exp_length = length + 1;
569                            if let number_sign!() = self.input.peek(exp_length, 1) {
570                                exp_length += 1;
571                            }
572                            let after_exp = read_digits_of_base(&self.input, exp_length, 10);
573                            if after_exp > exp_length {
574                                // There are digits after the exponent marker
575                                length = after_exp;
576                            }
577                        }
578
579                        (TokenKind::LiteralFloat, length)
580                    }
581                    [b'.', ..] => (TokenKind::Dot, 1),
582                    [unknown_byte, ..] => {
583                        let position = self.input.current_position();
584                        self.input.consume(1);
585
586                        return Some(Err(SyntaxError::UnrecognizedToken(self.file_id(), *unknown_byte, position)));
587                    }
588                    [] => {
589                        // we check for EOF before entering scripting section,
590                        // so this should be unreachable.
591                        unreachable!()
592                    }
593                };
594
595                self.mode = match token_kind {
596                    TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
597                    TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
598                    TokenKind::CloseTag => LexerMode::Inline,
599                    TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
600                    TokenKind::DocumentStart(document_kind) => {
601                        LexerMode::DocumentString(document_kind, document_label, Interpolation::None)
602                    }
603                    _ => LexerMode::Script,
604                };
605
606                let buffer = self.input.consume(len);
607                let end = self.input.current_position();
608
609                Some(Ok(self.token(token_kind, buffer, start, end)))
610            }
611            LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
612                Interpolation::None => {
613                    let start = self.input.current_position();
614
615                    let mut length = 0;
616                    let mut last_was_slash = false;
617                    let mut token_kind = TokenKind::StringPart;
618                    loop {
619                        match self.input.peek(length, 2) {
620                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
621                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
622
623                                self.mode =
624                                    LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
625
626                                break;
627                            }
628                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
629                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
630
631                                self.mode =
632                                    LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
633
634                                break;
635                            }
636                            [b'\\', ..] => {
637                                length += 1;
638
639                                last_was_slash = !last_was_slash;
640                            }
641                            [b'"', ..] if !last_was_slash => {
642                                if length == 0 {
643                                    length += 1;
644                                    token_kind = TokenKind::DoubleQuote;
645
646                                    break;
647                                }
648
649                                break;
650                            }
651                            [_, ..] => {
652                                length += 1;
653                                last_was_slash = false;
654                            }
655                            [] => {
656                                break;
657                            }
658                        }
659                    }
660
661                    let buffer = self.input.consume(length);
662                    let end = self.input.current_position();
663
664                    if TokenKind::DoubleQuote == token_kind {
665                        self.mode = LexerMode::Script;
666                    }
667
668                    Some(Ok(self.token(token_kind, buffer, start, end)))
669                }
670                Interpolation::Until(offset) => {
671                    self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None))
672                }
673            },
674            LexerMode::ShellExecuteString(interpolation) => match &interpolation {
675                Interpolation::None => {
676                    let start = self.input.current_position();
677
678                    let mut length = 0;
679                    let mut last_was_slash = false;
680                    let mut token_kind = TokenKind::StringPart;
681                    loop {
682                        match self.input.peek(length, 2) {
683                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
684                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
685
686                                self.mode =
687                                    LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
688
689                                break;
690                            }
691                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
692                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
693
694                                self.mode =
695                                    LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
696
697                                break;
698                            }
699                            [b'\\', ..] => {
700                                length += 1;
701                                last_was_slash = true;
702                            }
703                            [b'`', ..] if !last_was_slash => {
704                                if length == 0 {
705                                    length += 1;
706                                    token_kind = TokenKind::Backtick;
707
708                                    break;
709                                }
710
711                                break;
712                            }
713                            [_, ..] => {
714                                length += 1;
715                                last_was_slash = false;
716                            }
717                            [] => {
718                                break;
719                            }
720                        }
721                    }
722
723                    let buffer = self.input.consume(length);
724                    let end = self.input.current_position();
725
726                    if TokenKind::Backtick == token_kind {
727                        self.mode = LexerMode::Script;
728                    }
729
730                    Some(Ok(self.token(token_kind, buffer, start, end)))
731                }
732                Interpolation::Until(offset) => {
733                    self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None))
734                }
735            },
736            LexerMode::DocumentString(kind, label, interpolation) => match &kind {
737                DocumentKind::Heredoc => match &interpolation {
738                    Interpolation::None => {
739                        let start = self.input.current_position();
740
741                        let mut length = 0;
742                        let mut last_was_slash = false;
743                        let mut only_whitespaces = true;
744                        let mut token_kind = TokenKind::StringPart;
745                        loop {
746                            match self.input.peek(length, 2) {
747                                [b'\r', b'\n'] => {
748                                    length += 2;
749
750                                    break;
751                                }
752                                [b'\n' | b'\r', ..] => {
753                                    length += 1;
754
755                                    break;
756                                }
757                                [byte, ..] if byte.is_ascii_whitespace() => {
758                                    length += 1;
759                                }
760                                [b'$', start_of_identifier!(), ..] if !last_was_slash => {
761                                    let until_offset =
762                                        read_until_end_of_variable_interpolation(&self.input, length + 2);
763
764                                    self.mode = LexerMode::DocumentString(
765                                        kind,
766                                        label,
767                                        Interpolation::Until(start.offset + until_offset),
768                                    );
769
770                                    break;
771                                }
772                                [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
773                                    let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
774
775                                    self.mode = LexerMode::DocumentString(
776                                        kind,
777                                        label,
778                                        Interpolation::Until(start.offset + until_offset),
779                                    );
780
781                                    break;
782                                }
783                                [b'\\', ..] => {
784                                    length += 1;
785                                    last_was_slash = true;
786                                    only_whitespaces = false;
787                                }
788                                [_, ..] => {
789                                    if only_whitespaces
790                                        && self.input.peek(length, label.len()) == label
791                                        && self
792                                            .input
793                                            .peek(length + label.len(), 1)
794                                            .first()
795                                            .is_none_or(|c| !c.is_ascii_alphanumeric())
796                                    {
797                                        length += label.len();
798                                        token_kind = TokenKind::DocumentEnd;
799
800                                        break;
801                                    }
802
803                                    length += 1;
804                                    last_was_slash = false;
805                                    only_whitespaces = false;
806                                }
807                                [] => {
808                                    break;
809                                }
810                            }
811                        }
812
813                        let buffer = self.input.consume(length);
814                        let end = self.input.current_position();
815
816                        if TokenKind::DocumentEnd == token_kind {
817                            self.mode = LexerMode::Script;
818                        }
819
820                        Some(Ok(self.token(token_kind, buffer, start, end)))
821                    }
822                    Interpolation::Until(offset) => {
823                        self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None))
824                    }
825                },
826                DocumentKind::Nowdoc => {
827                    let start = self.input.current_position();
828
829                    let mut length = 0;
830                    let mut terminated = false;
831                    let mut only_whitespaces = true;
832
833                    loop {
834                        match self.input.peek(length, 2) {
835                            [b'\r', b'\n'] => {
836                                length += 2;
837
838                                break;
839                            }
840                            [b'\n' | b'\r', ..] => {
841                                length += 1;
842
843                                break;
844                            }
845                            [byte, ..] if byte.is_ascii_whitespace() => {
846                                length += 1;
847                            }
848                            [_, ..] => {
849                                if only_whitespaces
850                                    && self.input.peek(length, label.len()) == label
851                                    && self
852                                        .input
853                                        .peek(length + label.len(), 1)
854                                        .first()
855                                        .is_none_or(|c| !c.is_ascii_alphanumeric())
856                                {
857                                    length += label.len();
858                                    terminated = true;
859
860                                    break;
861                                }
862
863                                only_whitespaces = false;
864                                length += 1;
865                            }
866                            [] => {
867                                break;
868                            }
869                        }
870                    }
871
872                    let buffer = self.input.consume(length);
873                    let end = self.input.current_position();
874
875                    if terminated {
876                        self.mode = LexerMode::Script;
877
878                        return Some(Ok(self.token(TokenKind::DocumentEnd, buffer, start, end)));
879                    }
880
881                    Some(Ok(self.token(TokenKind::StringPart, buffer, start, end)))
882                }
883            },
884            LexerMode::Halt(stage) => 'halt: {
885                let start = self.input.current_position();
886                if let HaltStage::End = stage {
887                    let buffer = self.input.consume_remaining();
888                    let end = self.input.current_position();
889
890                    break 'halt Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
891                }
892
893                let whitespaces = self.input.consume_whitespaces();
894                if !whitespaces.is_empty() {
895                    let end = self.input.current_position();
896
897                    break 'halt Some(Ok(self.token(TokenKind::Whitespace, whitespaces, start, end)));
898                }
899
900                match &stage {
901                    HaltStage::LookingForLeftParenthesis => {
902                        if self.input.is_at(b"(", false) {
903                            let buffer = self.input.consume(1);
904                            let end = self.input.current_position();
905
906                            self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
907
908                            Some(Ok(self.token(TokenKind::LeftParenthesis, buffer, start, end)))
909                        } else {
910                            let byte = self.input.read(1)[0];
911                            let position = self.input.current_position();
912                            // Consume the unexpected byte to avoid infinite loops
913                            self.input.consume(1);
914                            Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
915                        }
916                    }
917                    HaltStage::LookingForRightParenthesis => {
918                        if self.input.is_at(b")", false) {
919                            let buffer = self.input.consume(1);
920                            let end = self.input.current_position();
921
922                            self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
923
924                            Some(Ok(self.token(TokenKind::RightParenthesis, buffer, start, end)))
925                        } else {
926                            let byte = self.input.read(1)[0];
927                            let position = self.input.current_position();
928                            self.input.consume(1);
929                            Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
930                        }
931                    }
932                    HaltStage::LookingForTerminator => {
933                        if self.input.is_at(b";", false) {
934                            let buffer = self.input.consume(1);
935                            let end = self.input.current_position();
936
937                            self.mode = LexerMode::Halt(HaltStage::End);
938
939                            Some(Ok(self.token(TokenKind::Semicolon, buffer, start, end)))
940                        } else if self.input.is_at(b"?>", false) {
941                            let buffer = self.input.consume(2);
942                            let end = self.input.current_position();
943
944                            self.mode = LexerMode::Halt(HaltStage::End);
945
946                            Some(Ok(self.token(TokenKind::CloseTag, buffer, start, end)))
947                        } else {
948                            let byte = self.input.read(1)[0];
949                            let position = self.input.current_position();
950                            self.input.consume(1);
951                            Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
952                        }
953                    }
954                    _ => unreachable!(),
955                }
956            }
957        }
958    }
959
960    /// Fast path for scanning identifiers and keywords.
961    /// Called when we know the first byte is an identifier start character.
962    /// Returns (TokenKind, length) to allow proper mode switching.
963    #[inline]
964    fn scan_identifier_or_keyword_info(&self) -> (TokenKind, usize) {
965        let (mut length, ended_with_slash) = self.input.scan_identifier(0);
966
967        if !ended_with_slash {
968            match length {
969                6 => {
970                    if self.input.is_at(b"public(set)", true) {
971                        return (TokenKind::PublicSet, 11);
972                    }
973                }
974                7 => {
975                    if self.input.is_at(b"private(set)", true) {
976                        return (TokenKind::PrivateSet, 12);
977                    }
978                }
979                9 => {
980                    if self.input.is_at(b"protected(set)", true) {
981                        return (TokenKind::ProtectedSet, 14);
982                    }
983                }
984                _ => {}
985            }
986        }
987
988        if !ended_with_slash && let Some(kind) = internal::keyword::lookup_keyword(self.input.read(length)) {
989            return (kind, length);
990        }
991
992        let mut slashes = 0;
993        let mut last_was_slash = false;
994        loop {
995            match self.input.peek(length, 1) {
996                [b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF] if last_was_slash => {
997                    length += 1;
998                    last_was_slash = false;
999                }
1000                [b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | 0x80..=0xFF] if !last_was_slash => {
1001                    length += 1;
1002                }
1003                [b'\\'] if !self.interpolating => {
1004                    if last_was_slash {
1005                        length -= 1;
1006                        slashes -= 1;
1007                        last_was_slash = false;
1008                        break;
1009                    }
1010
1011                    length += 1;
1012                    slashes += 1;
1013                    last_was_slash = true;
1014                }
1015                _ => {
1016                    break;
1017                }
1018            }
1019        }
1020
1021        if last_was_slash {
1022            length -= 1;
1023            slashes -= 1;
1024        }
1025
1026        let kind = if slashes > 0 { TokenKind::QualifiedIdentifier } else { TokenKind::Identifier };
1027
1028        (kind, length)
1029    }
1030
1031    #[inline]
1032    fn token(&self, kind: TokenKind, v: &'input [u8], start: Position, _end: Position) -> Token<'input> {
1033        // SAFETY: The input bytes are guaranteed to be valid UTF-8 because:
1034        // 1. File contents are validated via simdutf8 during database loading
1035        // 2. Invalid UTF-8 is converted lossily before reaching the lexer
1036        // 3. All byte slices here are subslices of the validated input
1037        let value = unsafe { std::str::from_utf8_unchecked(v) };
1038
1039        Token { kind, start, value }
1040    }
1041
1042    #[inline]
1043    fn interpolation(
1044        &mut self,
1045        end_offset: u32,
1046        post_interpolation_mode: LexerMode<'input>,
1047    ) -> Option<Result<Token<'input>, SyntaxError>> {
1048        self.mode = LexerMode::Script;
1049
1050        let was_interpolating = self.interpolating;
1051        self.interpolating = true;
1052
1053        loop {
1054            let subsequent_token = self.advance()?.ok()?;
1055            // Check if this token contains the end offset
1056            let token_start = subsequent_token.start.offset;
1057            let token_end = token_start + subsequent_token.value.len() as u32;
1058            let is_final_token = token_start <= end_offset && end_offset <= token_end;
1059
1060            self.buffer.push_back(subsequent_token);
1061
1062            if is_final_token {
1063                break;
1064            }
1065        }
1066
1067        self.mode = post_interpolation_mode;
1068        self.interpolating = was_interpolating;
1069
1070        self.advance()
1071    }
1072}
1073
/// Lets a `Lexer` report which file it is lexing.
impl HasFileId for Lexer<'_> {
    /// Returns the file id reported by the lexer's underlying input.
    #[inline]
    fn file_id(&self) -> FileId {
        self.input.file_id()
    }
}
1080
1081#[inline]
1082fn matches_start_of_heredoc_document(input: &Input) -> bool {
1083    let total = input.len();
1084    let base = input.current_offset();
1085
1086    // Start after the fixed opener (3 bytes).
1087    let mut length = 3;
1088    // Consume any following whitespace.
1089    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1090        length += 1;
1091    }
1092
1093    // The next byte must be a valid start-of-identifier.
1094    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1095        return false;
1096    }
1097    length += 1; // Include that identifier start.
1098
1099    // Now continue reading identifier characters until a newline is found.
1100    loop {
1101        let pos = base + length;
1102        if pos >= total {
1103            return false; // Unexpected EOF
1104        }
1105
1106        let byte = *input.read_at(pos);
1107        if byte == b'\n' {
1108            return true; // Newline found: valid heredoc opener.
1109        } else if byte == b'\r' {
1110            // Handle CRLF: treat '\r' followed by '\n' as a newline as well.
1111            return pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1112        } else if is_part_of_identifier(input.read_at(pos)) {
1113            length += 1;
1114        } else {
1115            return false; // Unexpected character.
1116        }
1117    }
1118}
1119
1120#[inline]
1121fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1122    let total = input.len();
1123    let base = input.current_offset();
1124
1125    // Start after the fixed opener (3 bytes), then skip any whitespace.
1126    let mut length = 3;
1127    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1128        length += 1;
1129    }
1130
1131    // Next, expect an opening double quote.
1132    if base + length >= total || *input.read_at(base + length) != b'"' {
1133        return false;
1134    }
1135    length += 1;
1136
1137    // The following byte must be a valid start-of-identifier.
1138    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1139        return false;
1140    }
1141    length += 1;
1142
1143    // Now scan the label. For double‑quoted heredoc, a terminating double quote is required.
1144    let mut terminated = false;
1145    loop {
1146        let pos = base + length;
1147        if pos >= total {
1148            return false;
1149        }
1150        let byte = input.read_at(pos);
1151        if *byte == b'\n' {
1152            // End of line: valid only if a closing double quote was encountered.
1153            return terminated;
1154        } else if *byte == b'\r' {
1155            // Handle CRLF sequences.
1156            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1157        } else if !terminated && is_part_of_identifier(byte) {
1158            length += 1;
1159        } else if !terminated && *byte == b'"' {
1160            terminated = true;
1161            length += 1;
1162        } else {
1163            return false;
1164        }
1165    }
1166}
1167
1168#[inline]
1169fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1170    let total = input.len();
1171    let base = input.current_offset();
1172
1173    // Start after the fixed opener (3 bytes) and skip whitespace.
1174    let mut length = 3;
1175    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1176        length += 1;
1177    }
1178
1179    // Now, the next byte must be a single quote.
1180    if base + length >= total || *input.read_at(base + length) != b'\'' {
1181        return false;
1182    }
1183    length += 1;
1184
1185    // The following byte must be a valid start-of-identifier.
1186    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1187        return false;
1188    }
1189    length += 1;
1190
1191    // Read the label until a newline. A terminating single quote is required.
1192    let mut terminated = false;
1193    loop {
1194        let pos = base + length;
1195        if pos >= total {
1196            return false;
1197        }
1198        let byte = *input.read_at(pos);
1199        if byte == b'\n' {
1200            return terminated;
1201        } else if byte == b'\r' {
1202            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1203        } else if !terminated && is_part_of_identifier(&byte) {
1204            length += 1;
1205        } else if !terminated && byte == b'\'' {
1206            terminated = true;
1207            length += 1;
1208        } else {
1209            return false;
1210        }
1211    }
1212}
1213
1214#[inline]
1215fn matches_literal_double_quote_string(input: &Input) -> bool {
1216    let total = input.len();
1217    let base = input.current_offset();
1218
1219    // Start after the initial double-quote (assumed consumed).
1220    let mut pos = base + 1;
1221    loop {
1222        if pos >= total {
1223            // Reached EOF: assume literal is complete.
1224            return true;
1225        }
1226        let byte = *input.read_at(pos);
1227        if byte == b'"' {
1228            // Encounter a closing double quote.
1229            return true;
1230        } else if byte == b'\\' {
1231            // Skip an escape sequence: assume that the backslash and the escaped character form a pair.
1232            pos += 2;
1233            continue;
1234        }
1235
1236        // Check for variable interpolation or complex expression start:
1237        // If two-byte sequences match either "$" followed by a start-of-identifier or "{" and "$", then return false.
1238        if pos + 1 < total {
1239            let next = *input.read_at(pos + 1);
1240            if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1241                return false;
1242            }
1243        }
1244        pos += 1;
1245    }
1246}
1247
1248#[inline]
1249fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1250    let total = input.len();
1251    let base = input.current_offset();
1252
1253    // Start reading at offset base+3 (the fixed opener length).
1254    let mut pos = base + 3;
1255    let mut whitespaces = 0;
1256    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1257        whitespaces += 1;
1258        pos += 1;
1259    }
1260
1261    // The label (or delimiter) starts after:
1262    //   3 bytes + whitespace bytes + an extra offset:
1263    //      if double-quoted: 2 bytes (opening and closing quotes around the label)
1264    //      else: 1 byte.
1265    let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1266
1267    let mut label_length = 1; // Start with at least one byte for the label.
1268    let mut terminated = false; // For double-quoted heredoc, to track the closing quote.
1269    loop {
1270        let pos = base + length;
1271        // Ensure we haven't run past the input.
1272        if pos >= total {
1273            unreachable!("Unexpected end of input while reading heredoc label");
1274        }
1275
1276        let byte = *input.read_at(pos);
1277        if byte == b'\n' {
1278            // Newline ends the label.
1279            length += 1;
1280            return (length, whitespaces, label_length);
1281        } else if byte == b'\r' {
1282            // Handle CRLF sequences
1283            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1284                length += 2;
1285            } else {
1286                length += 1;
1287            }
1288            return (length, whitespaces, label_length);
1289        } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1290            // For both unquoted and double-quoted (before the closing quote) heredoc,
1291            // a valid identifier character is part of the label.
1292            length += 1;
1293            label_length += 1;
1294        } else if double_quoted && !terminated && byte == b'"' {
1295            // In a double-quoted heredoc, a double quote terminates the label.
1296            length += 1;
1297            terminated = true;
1298        } else {
1299            unreachable!("Unexpected character encountered in heredoc label");
1300        }
1301    }
1302}
1303
1304#[inline]
1305fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1306    let total = input.len();
1307    let base = input.current_offset();
1308
1309    let mut pos = base + 3;
1310    let mut whitespaces = 0;
1311    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1312        whitespaces += 1;
1313        pos += 1;
1314    }
1315
1316    // For nowdoc, the fixed extra offset is always 2.
1317    let mut length = 3 + whitespaces + 2;
1318
1319    let mut label_length = 1;
1320    let mut terminated = false;
1321    loop {
1322        let pos = base + length;
1323        if pos >= total {
1324            unreachable!("Unexpected end of input while reading nowdoc label");
1325        }
1326        let byte = *input.read_at(pos);
1327
1328        if byte == b'\n' {
1329            // A newline indicates the end of the label.
1330            length += 1;
1331            return (length, whitespaces, label_length);
1332        } else if byte == b'\r' {
1333            // Handle CRLF sequences
1334            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1335                length += 2;
1336            } else {
1337                length += 1;
1338            }
1339            return (length, whitespaces, label_length);
1340        } else if is_part_of_identifier(&byte) && !terminated {
1341            // For nowdoc, identifier characters contribute to the label until terminated.
1342            length += 1;
1343            label_length += 1;
1344        } else if !terminated && byte == b'\'' {
1345            // A single quote terminates the nowdoc label.
1346            length += 1;
1347            terminated = true;
1348        } else {
1349            unreachable!("Unexpected character encountered in nowdoc label");
1350        }
1351    }
1352}
1353
1354#[inline]
1355fn read_literal_string(input: &Input, quote: u8) -> (TokenKind, usize) {
1356    let total = input.len();
1357    let start = input.current_offset();
1358    let mut length = 1; // We assume the opening quote is already consumed.
1359
1360    let bytes = input.peek(length, total - start - length);
1361    loop {
1362        match memchr2(quote, b'\\', &bytes[length - 1..]) {
1363            Some(pos) => {
1364                let abs_pos = length - 1 + pos;
1365                let byte = bytes[abs_pos];
1366
1367                if byte == b'\\' {
1368                    length = abs_pos + 2 + 1; // +1 because bytes starts at offset 1
1369                    if length > total - start {
1370                        return (TokenKind::PartialLiteralString, total - start);
1371                    }
1372                } else {
1373                    length = abs_pos + 2; // +1 for the quote, +1 because bytes starts at offset 1
1374                    return (TokenKind::LiteralString, length);
1375                }
1376            }
1377            None => {
1378                // No quote or backslash found - EOF
1379                return (TokenKind::PartialLiteralString, total - start);
1380            }
1381        }
1382    }
1383}
1384
1385#[inline]
1386fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> u32 {
1387    let total = input.len();
1388    let base = input.current_offset();
1389    // `offset` is relative to the current position.
1390    let mut offset = from;
1391
1392    loop {
1393        let abs = base + offset;
1394        if abs >= total {
1395            // End of input.
1396            break;
1397        }
1398
1399        // Pattern 1: If the current byte is part of an identifier, simply advance.
1400        if is_part_of_identifier(input.read_at(abs)) {
1401            offset += 1;
1402            continue;
1403        }
1404
1405        // Pattern 2: If the current byte is a '[' then we enter a bracketed interpolation.
1406        if *input.read_at(abs) == b'[' {
1407            offset += 1;
1408            let mut nesting = 0;
1409            loop {
1410                let abs_inner = base + offset;
1411                if abs_inner >= total {
1412                    break;
1413                }
1414                let b = input.read_at(abs_inner);
1415                if *b == b']' {
1416                    offset += 1;
1417                    if nesting == 0 {
1418                        break;
1419                    }
1420
1421                    nesting -= 1;
1422                } else if *b == b'[' {
1423                    offset += 1;
1424                    nesting += 1;
1425                } else if b.is_ascii_whitespace() {
1426                    // Do not include whitespace.
1427                    break;
1428                } else {
1429                    offset += 1;
1430                }
1431            }
1432            // When bracketed interpolation is processed, exit the loop.
1433            break;
1434        }
1435
1436        // Pattern 3: Check for "->" followed by a valid identifier start.
1437        if base + offset + 2 < total
1438            && *input.read_at(abs) == b'-'
1439            && *input.read_at(base + offset + 1) == b'>'
1440            && is_start_of_identifier(input.read_at(base + offset + 2))
1441        {
1442            offset += 3;
1443            // Consume any following identifier characters.
1444            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1445                offset += 1;
1446            }
1447            break;
1448        }
1449
1450        // Pattern 4: Check for "?->" followed by a valid identifier start.
1451        if base + offset + 3 < total
1452            && *input.read_at(abs) == b'?'
1453            && *input.read_at(base + offset + 1) == b'-'
1454            && *input.read_at(base + offset + 2) == b'>'
1455            && is_start_of_identifier(input.read_at(base + offset + 3))
1456        {
1457            offset += 4;
1458            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1459                offset += 1;
1460            }
1461            break;
1462        }
1463
1464        // None of the expected patterns matched: exit the loop.
1465        break;
1466    }
1467
1468    offset as u32
1469}
1470
1471#[inline]
1472fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> u32 {
1473    let total = input.len();
1474    let base = input.current_offset();
1475    let mut offset = from;
1476    let mut nesting = 0;
1477
1478    loop {
1479        let abs = base + offset;
1480        if abs >= total {
1481            break;
1482        }
1483        match input.read_at(abs) {
1484            b'}' => {
1485                offset += 1;
1486                if nesting == 0 {
1487                    break;
1488                }
1489
1490                nesting -= 1;
1491            }
1492            b'{' => {
1493                offset += 1;
1494                nesting += 1;
1495            }
1496            _ => {
1497                offset += 1;
1498            }
1499        }
1500    }
1501
1502    offset as u32
1503}
1504
1505/// Scan a multi-line comment using SIMD-accelerated search.
1506/// Returns Some(length) including the closing */, or None if unterminated.
1507#[inline]
1508fn scan_multi_line_comment(bytes: &[u8]) -> Option<usize> {
1509    // Use SIMD to find */ quickly
1510    memmem::find(bytes, b"*/").map(|pos| pos + 2)
1511}
1512
1513/// Scan a single-line comment using SIMD-accelerated search.
1514/// Returns the length of the comment body (not including the //).
1515/// Stops at newline or ?>.
1516#[inline]
1517fn scan_single_line_comment(bytes: &[u8]) -> usize {
1518    let mut pos = 0;
1519    while pos < bytes.len() {
1520        match memchr::memchr3(b'\n', b'\r', b'?', &bytes[pos..]) {
1521            Some(offset) => {
1522                let found_pos = pos + offset;
1523                match bytes[found_pos] {
1524                    b'\n' | b'\r' => return found_pos,
1525                    b'?' => {
1526                        // Check if it's ?>
1527                        if found_pos + 1 < bytes.len() && bytes[found_pos + 1] == b'>' {
1528                            // Also check for whitespace before ?>
1529                            if found_pos > 0 && bytes[found_pos - 1].is_ascii_whitespace() {
1530                                return found_pos - 1;
1531                            }
1532                            return found_pos;
1533                        }
1534                        // Not ?>, continue searching
1535                        pos = found_pos + 1;
1536                    }
1537                    _ => unreachable!(),
1538                }
1539            }
1540            None => return bytes.len(),
1541        }
1542    }
1543
1544    bytes.len()
1545}
mago_syntax/lexer/mod.rs

mago_syntax/lexer/
mod.rs