mago_syntax/lexer/
mod.rs

1use std::collections::VecDeque;
2use std::fmt::Debug;
3
4use bumpalo::Bump;
5use mago_database::file::FileId;
6use mago_database::file::HasFileId;
7use mago_span::Position;
8use mago_span::Span;
9
10use mago_syntax_core::float_exponent;
11use mago_syntax_core::float_separator;
12use mago_syntax_core::input::Input;
13use mago_syntax_core::number_sign;
14use mago_syntax_core::part_of_identifier;
15use mago_syntax_core::start_of_binary_number;
16use mago_syntax_core::start_of_float_number;
17use mago_syntax_core::start_of_hexadecimal_number;
18use mago_syntax_core::start_of_identifier;
19use mago_syntax_core::start_of_number;
20use mago_syntax_core::start_of_octal_number;
21use mago_syntax_core::start_of_octal_or_float_number;
22use mago_syntax_core::utils::is_part_of_identifier;
23use mago_syntax_core::utils::is_start_of_identifier;
24use mago_syntax_core::utils::read_digits_of_base;
25
26use crate::error::SyntaxError;
27use crate::lexer::internal::mode::HaltStage;
28use crate::lexer::internal::mode::Interpolation;
29use crate::lexer::internal::mode::LexerMode;
30use crate::lexer::internal::utils::NumberKind;
31use crate::token::DocumentKind;
32use crate::token::Token;
33use crate::token::TokenKind;
34
35mod internal;
36
/// The `Lexer` struct is responsible for tokenizing input source code into discrete tokens
/// based on PHP language syntax. It is designed to work with PHP code from version 7.0 up to 8.4.
///
/// The lexer reads through the provided input and processes it accordingly.
///
/// It identifies PHP-specific tokens, including operators, keywords, comments, strings, and other syntax elements,
/// and produces a sequence of [`Token`] objects that are used in further stages of compilation or interpretation.
///
/// The lexer is designed to be used in a streaming fashion, where it reads the input source code in chunks
/// and produces tokens incrementally. This allows for efficient processing of large source files and
/// minimizes memory usage.
///
/// Tokens are produced one at a time by calling `advance`. The active [`LexerMode`] decides how the
/// upcoming bytes are interpreted (inline text, script code, quoted strings, heredoc/nowdoc bodies, ...).
#[derive(Debug)]
pub struct Lexer<'input, 'arena> {
    /// Bump arena. `advance` copies the heredoc/nowdoc label into it when a document string starts.
    arena: &'arena Bump,
    /// The source being tokenized, together with the current read position.
    input: Input<'input>,
    /// Current lexing mode. `advance` dispatches on it and updates it as tokens are produced.
    mode: LexerMode<'arena>,
    /// While `true`, `advance` does not drain `buffer`, and a plain identifier is not extended
    /// across a `\` namespace separator.
    ///
    /// NOTE(review): presumably set while an interpolated expression is being lexed; the code
    /// that toggles it is not visible in this part of the file, so confirm.
    interpolating: bool,
    /// Already-lexed tokens waiting to be handed out. When not interpolating, `advance` returns
    /// these front-to-back before reading any further input.
    buffer: VecDeque<Token<'arena>>,
}
56
57impl<'input, 'arena> Lexer<'input, 'arena> {
58    /// Creates a new `Lexer` instance.
59    ///
60    /// # Parameters
61    ///
62    /// - `arena`: The arena to use for allocating tokens.
63    /// - `input`: The input source code to tokenize.
64    ///
65    /// # Returns
66    ///
67    /// A new `Lexer` instance that reads from the provided byte slice.
68    pub fn new(arena: &'arena Bump, input: Input<'input>) -> Lexer<'input, 'arena> {
69        Lexer { arena, input, mode: LexerMode::Inline, interpolating: false, buffer: VecDeque::new() }
70    }
71
72    /// Creates a new `Lexer` instance for parsing a script block.
73    ///
74    /// # Parameters
75    ///
76    /// - `arena`: The arena to use for allocating tokens.
77    /// - `input`: The input source code to tokenize.
78    ///
79    /// # Returns
80    ///
81    /// A new `Lexer` instance that reads from the provided byte slice.
82    pub fn scripting(arena: &'arena Bump, input: Input<'input>) -> Lexer<'input, 'arena> {
83        Lexer { arena, input, mode: LexerMode::Script, interpolating: false, buffer: VecDeque::new() }
84    }
85
86    /// Check if the lexer has reached the end of the input.
87    ///
88    /// If this method returns `true`, the lexer will not produce any more tokens.
89    #[must_use]
90    pub fn has_reached_eof(&self) -> bool {
91        self.input.has_reached_eof()
92    }
93
94    /// Get the current position of the lexer in the input source code.
95    #[must_use]
96    pub fn get_position(&self) -> Position {
97        self.input.current_position()
98    }
99
100    /// Tokenizes the next input from the source code.
101    ///
102    /// This method reads from the input and produces the next [`Token`] based on the current [`LexerMode`].
103    /// It handles various lexical elements such as inline text, script code, strings with interpolation,
104    /// comments, and different PHP-specific constructs.
105    ///
106    /// # Returns
107    ///
108    /// - `Some(Ok(Token))` if a token was successfully parsed.
109    /// - `Some(Err(SyntaxError))` if a syntax error occurred while parsing the next token.
110    /// - `None` if the end of the input has been reached.
111    ///
112    /// # Notes
113    ///
114    /// - It efficiently handles tokenization by consuming input based on patterns specific to PHP syntax.
115    /// - The lexer supports complex features like string interpolation and different numeric formats.
116    ///
117    /// # Errors
118    ///
119    /// Returns `Some(Err(SyntaxError))` in cases such as:
120    ///
121    /// - Unrecognized tokens that do not match any known PHP syntax.
122    /// - Unexpected tokens in a given context, such as an unexpected end of string.
123    ///
124    /// # Panics
125    ///
126    /// This method should not panic under normal operation. If it does, it indicates a bug in the lexer implementation.
127    ///
128    /// # See Also
129    ///
130    /// - [`Token`]: Represents a lexical token with its kind, value, and span.
131    /// - [`SyntaxError`]: Represents errors that can occur during lexing.
132    #[inline]
133    pub fn advance(&mut self) -> Option<Result<Token<'arena>, SyntaxError>> {
134        if !self.interpolating
135            && let Some(token) = self.buffer.pop_front()
136        {
137            return Some(Ok(token));
138        }
139
140        if self.input.has_reached_eof() {
141            return None;
142        }
143
144        match self.mode {
145            LexerMode::Inline => {
146                let start = self.input.current_position();
147                if self.input.is_at(b"<?", false) {
148                    let (kind, buffer) = if self.input.is_at(b"<?php", true) {
149                        (TokenKind::OpenTag, self.input.consume(5))
150                    } else if self.input.is_at(b"<?=", false) {
151                        (TokenKind::EchoTag, self.input.consume(3))
152                    } else {
153                        (TokenKind::ShortOpenTag, self.input.consume(2))
154                    };
155
156                    let end = self.input.current_position();
157                    let tag = self.token(kind, buffer, start, end);
158
159                    self.mode = LexerMode::Script;
160
161                    return Some(Ok(tag));
162                }
163
164                if self.input.is_at(b"#!", true) {
165                    let buffer = self.input.consume_through(b'\n');
166                    let end = self.input.current_position();
167
168                    Some(Ok(self.token(TokenKind::InlineShebang, buffer, start, end)))
169                } else {
170                    let buffer = self.input.consume_until(b"<?", false);
171                    let end = self.input.current_position();
172
173                    Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)))
174                }
175            }
176            LexerMode::Script => {
177                let start = self.input.current_position();
178                let whitespaces = self.input.consume_whitespaces();
179                if !whitespaces.is_empty() {
180                    return Some(Ok(self.token(
181                        TokenKind::Whitespace,
182                        whitespaces,
183                        start,
184                        self.input.current_position(),
185                    )));
186                }
187
188                let mut document_label: &[u8] = &[];
189
190                let (token_kind, len) = match self.input.read(3) {
191                    [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
192                    [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
193                    [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
194                    [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
195                    [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
196                    [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
197                    [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
198                    [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
199                    [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
200                    [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
201                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
202
203                        document_label = self.input.peek(3 + whitespaces, label_length);
204
205                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
206                    }
207                    [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
208                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
209
210                        document_label = self.input.peek(4 + whitespaces, label_length);
211
212                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
213                    }
214                    [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
215                        let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
216
217                        document_label = self.input.peek(4 + whitespaces, label_length);
218
219                        (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
220                    }
221                    [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
222                    [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
223                    [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
224                    [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
225                    [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
226                    [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
227                    [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
228                    [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
229                    [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
230                    [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
231                    [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
232                    [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
233                    [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
234                    [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
235                    [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
236                    [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
237                    [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
238                    [b'>', b'>', ..] => (TokenKind::RightShift, 2),
239                    [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
240                    [b':', b':', ..] => (TokenKind::ColonColon, 2),
241                    [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
242                    [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
243                    [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
244                    [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
245                    [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
246                    [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
247                    [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
248                    [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
249                    [b'/', b'/', ..] => {
250                        let mut length = 2;
251                        loop {
252                            match self.input.peek(length, 3) {
253                                [b'\n' | b'\r', ..] => {
254                                    break;
255                                }
256                                [w, b'?', b'>'] if w.is_ascii_whitespace() => {
257                                    break;
258                                }
259                                [b'?', b'>', ..] | [] => {
260                                    break;
261                                }
262                                [_, ..] => {
263                                    length += 1;
264                                }
265                            }
266                        }
267
268                        (TokenKind::SingleLineComment, length)
269                    }
270                    [b'/', b'*', asterisk] => {
271                        let mut length = 2;
272                        let mut is_multiline = false;
273                        let mut terminated = false;
274                        loop {
275                            match self.input.peek(length, 2) {
276                                [b'*', b'/'] => {
277                                    if length == 2 {
278                                        is_multiline = true;
279                                    }
280
281                                    length += 2;
282
283                                    terminated = true;
284                                    break;
285                                }
286                                [_, ..] => {
287                                    length += 1;
288                                }
289                                [] => {
290                                    break;
291                                }
292                            }
293                        }
294
295                        if !terminated {
296                            self.input.consume(length);
297
298                            return Some(Err(SyntaxError::UnexpectedEndOfFile(
299                                self.file_id(),
300                                self.input.current_position(),
301                            )));
302                        }
303
304                        if !is_multiline && asterisk == &b'*' {
305                            (TokenKind::DocBlockComment, length)
306                        } else {
307                            (TokenKind::MultiLineComment, length)
308                        }
309                    }
310                    [b'\\', start_of_identifier!(), ..] => {
311                        let mut length = 2;
312                        let mut last_was_slash = false;
313                        loop {
314                            match self.input.peek(length, 1) {
315                                [start_of_identifier!(), ..] if last_was_slash => {
316                                    length += 1;
317                                    last_was_slash = false;
318                                }
319                                [part_of_identifier!(), ..] if !last_was_slash => {
320                                    length += 1;
321                                }
322                                [b'\\', ..] => {
323                                    if last_was_slash {
324                                        length -= 1;
325
326                                        break;
327                                    }
328
329                                    length += 1;
330                                    last_was_slash = true;
331                                }
332                                _ => {
333                                    break;
334                                }
335                            }
336                        }
337
338                        if last_was_slash {
339                            length -= 1;
340                        }
341
342                        (TokenKind::FullyQualifiedIdentifier, length)
343                    }
344                    [b'$', start_of_identifier!(), ..] => {
345                        let mut length = 2;
346                        while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
347                            length += 1;
348                        }
349
350                        (TokenKind::Variable, length)
351                    }
352                    [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
353                    [b'$', ..] => (TokenKind::Dollar, 1),
354                    [b'@', ..] => (TokenKind::At, 1),
355                    [b'!', ..] => (TokenKind::Bang, 1),
356                    [b'&', ..] => (TokenKind::Ampersand, 1),
357                    [b'?', ..] => (TokenKind::Question, 1),
358                    [b'=', ..] => (TokenKind::Equal, 1),
359                    [b'`', ..] => (TokenKind::Backtick, 1),
360                    [b')', ..] => (TokenKind::RightParenthesis, 1),
361                    [b';', ..] => (TokenKind::Semicolon, 1),
362                    [b'+', ..] => (TokenKind::Plus, 1),
363                    [b'%', ..] => (TokenKind::Percent, 1),
364                    [b'-', ..] => (TokenKind::Minus, 1),
365                    [b'<', ..] => (TokenKind::LessThan, 1),
366                    [b'>', ..] => (TokenKind::GreaterThan, 1),
367                    [b',', ..] => (TokenKind::Comma, 1),
368                    [b'[', ..] => (TokenKind::LeftBracket, 1),
369                    [b']', ..] => (TokenKind::RightBracket, 1),
370                    [b'{', ..] => (TokenKind::LeftBrace, 1),
371                    [b'}', ..] => (TokenKind::RightBrace, 1),
372                    [b':', ..] => (TokenKind::Colon, 1),
373                    [b'~', ..] => (TokenKind::Tilde, 1),
374                    [b'|', ..] => (TokenKind::Pipe, 1),
375                    [b'^', ..] => (TokenKind::Caret, 1),
376                    [b'*', ..] => (TokenKind::Asterisk, 1),
377                    [b'/', ..] => (TokenKind::Slash, 1),
378                    [quote @ b'\'', ..] => read_literal_string(&self.input, *quote),
379                    [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
380                        read_literal_string(&self.input, *quote)
381                    }
382                    [b'"', ..] => (TokenKind::DoubleQuote, 1),
383                    [b'(', ..] => 'parenthesis: {
384                        for (value, kind) in internal::consts::CAST_TYPES {
385                            if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
386                                break 'parenthesis (kind, length);
387                            }
388                        }
389
390                        (TokenKind::LeftParenthesis, 1)
391                    }
392                    [b'#', ..] => {
393                        let mut length = 1;
394                        loop {
395                            match self.input.peek(length, 3) {
396                                [b'\n' | b'\r', ..] => {
397                                    break;
398                                }
399                                [w, b'?', b'>'] if w.is_ascii_whitespace() => {
400                                    break;
401                                }
402                                [b'?', b'>', ..] | [] => {
403                                    break;
404                                }
405                                [_, ..] => {
406                                    length += 1;
407                                }
408                            }
409                        }
410
411                        (TokenKind::HashComment, length)
412                    }
413                    [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
414                    [start_of_identifier!(), ..] => 'identifier: {
415                        let mut length = 1;
416                        let mut ended_with_slash = false;
417                        loop {
418                            match self.input.peek(length, 2) {
419                                [part_of_identifier!(), ..] => {
420                                    length += 1;
421                                }
422                                [b'\\', start_of_identifier!(), ..] => {
423                                    ended_with_slash = true;
424                                    break;
425                                }
426                                // special case for `private(set)`
427                                [b'(', ..] if length == 7 => {
428                                    if self.input.is_at(b"private(set)", true) {
429                                        break 'identifier (TokenKind::PrivateSet, 7 + 5);
430                                    }
431
432                                    break;
433                                }
434                                // special case for `public(set)`
435                                [b'(', ..] if length == 6 => {
436                                    if self.input.is_at(b"public(set)", true) {
437                                        break 'identifier (TokenKind::PublicSet, 6 + 5);
438                                    }
439
440                                    break;
441                                }
442                                // special case for `protected(set)`
443                                [b'(', ..] if length == 9 => {
444                                    if self.input.is_at(b"protected(set)", true) {
445                                        break 'identifier (TokenKind::ProtectedSet, 9 + 5);
446                                    }
447
448                                    break;
449                                }
450                                _ => {
451                                    break;
452                                }
453                            }
454                        }
455
456                        if !ended_with_slash {
457                            for (value, kind) in internal::consts::KEYWORD_TYPES {
458                                if value.len() != length {
459                                    continue;
460                                }
461
462                                if self.input.is_at(value, true) {
463                                    break 'identifier (kind, value.len());
464                                }
465                            }
466                        }
467
468                        let mut slashes = 0;
469                        let mut last_was_slash = false;
470                        loop {
471                            match self.input.peek(length, 1) {
472                                [start_of_identifier!(), ..] if last_was_slash => {
473                                    length += 1;
474                                    last_was_slash = false;
475                                }
476                                [part_of_identifier!(), ..] if !last_was_slash => {
477                                    length += 1;
478                                }
479                                [b'\\', ..] if !self.interpolating => {
480                                    if last_was_slash {
481                                        length -= 1;
482                                        slashes -= 1;
483                                        last_was_slash = false;
484
485                                        break;
486                                    }
487
488                                    length += 1;
489                                    slashes += 1;
490                                    last_was_slash = true;
491                                }
492                                _ => {
493                                    break;
494                                }
495                            }
496                        }
497
498                        if last_was_slash {
499                            length -= 1;
500                            slashes -= 1;
501                        }
502
503                        if slashes > 0 {
504                            (TokenKind::QualifiedIdentifier, length)
505                        } else {
506                            (TokenKind::Identifier, length)
507                        }
508                    }
509                    [b'.', start_of_number!(), ..] => {
510                        let mut length = read_digits_of_base(&self.input, 2, 10);
511                        if let float_exponent!() = self.input.peek(length, 1) {
512                            length += 1;
513                            if let number_sign!() = self.input.peek(length, 1) {
514                                length += 1;
515                            }
516
517                            length = read_digits_of_base(&self.input, length, 10);
518                        }
519
520                        (TokenKind::LiteralFloat, length)
521                    }
522                    [start_of_number!(), ..] => 'number: {
523                        let mut length = 1;
524
525                        let (base, kind): (u8, NumberKind) = match self.input.read(3) {
526                            start_of_binary_number!() => {
527                                length += 1;
528
529                                (2, NumberKind::Integer)
530                            }
531                            start_of_octal_number!() => {
532                                length += 1;
533
534                                (8, NumberKind::Integer)
535                            }
536                            start_of_hexadecimal_number!() => {
537                                length += 1;
538
539                                (16, NumberKind::Integer)
540                            }
541                            start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
542                            start_of_float_number!() => (10, NumberKind::Float),
543                            _ => (10, NumberKind::IntegerOrFloat),
544                        };
545
546                        if kind != NumberKind::Float {
547                            length = read_digits_of_base(&self.input, length, base);
548
549                            if kind == NumberKind::Integer {
550                                break 'number (TokenKind::LiteralInteger, length);
551                            }
552                        }
553
554                        let is_float = matches!(self.input.peek(length, 3), float_separator!());
555
556                        if !is_float {
557                            break 'number (TokenKind::LiteralInteger, length);
558                        }
559
560                        if let [b'.'] = self.input.peek(length, 1) {
561                            length += 1;
562                            length = read_digits_of_base(&self.input, length, 10);
563                        }
564
565                        if let float_exponent!() = self.input.peek(length, 1) {
566                            length += 1;
567                            if let number_sign!() = self.input.peek(length, 1) {
568                                length += 1;
569                            }
570
571                            length = read_digits_of_base(&self.input, length, 10);
572                        }
573
574                        (TokenKind::LiteralFloat, length)
575                    }
576                    [b'.', ..] => (TokenKind::Dot, 1),
577                    [unknown_byte, ..] => {
578                        return Some(Err(SyntaxError::UnrecognizedToken(
579                            self.file_id(),
580                            *unknown_byte,
581                            self.input.current_position(),
582                        )));
583                    }
584                    [] => {
585                        // we check for EOF before entering scripting section,
586                        // so this should be unreachable.
587                        unreachable!()
588                    }
589                };
590
591                self.mode = match token_kind {
592                    TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
593                    TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
594                    TokenKind::CloseTag => LexerMode::Inline,
595                    TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
596                    TokenKind::DocumentStart(document_kind) => LexerMode::DocumentString(
597                        document_kind,
598                        self.arena.alloc_slice_copy(document_label),
599                        Interpolation::None,
600                    ),
601                    _ => LexerMode::Script,
602                };
603
604                let buffer = self.input.consume(len);
605                let end = self.input.current_position();
606
607                Some(Ok(self.token(token_kind, buffer, start, end)))
608            }
609            LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
610                Interpolation::None => {
611                    let start = self.input.current_position();
612
613                    let mut length = 0;
614                    let mut last_was_slash = false;
615                    let mut token_kind = TokenKind::StringPart;
616                    loop {
617                        match self.input.peek(length, 2) {
618                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
619                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
620
621                                self.mode =
622                                    LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
623
624                                break;
625                            }
626                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
627                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
628
629                                self.mode =
630                                    LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
631
632                                break;
633                            }
634                            [b'\\', ..] => {
635                                length += 1;
636
637                                last_was_slash = !last_was_slash;
638                            }
639                            [b'"', ..] if !last_was_slash => {
640                                if length == 0 {
641                                    length += 1;
642                                    token_kind = TokenKind::DoubleQuote;
643
644                                    break;
645                                }
646
647                                break;
648                            }
649                            [_, ..] => {
650                                length += 1;
651                                last_was_slash = false;
652                            }
653                            [] => {
654                                break;
655                            }
656                        }
657                    }
658
659                    let buffer = self.input.consume(length);
660                    let end = self.input.current_position();
661
662                    if TokenKind::DoubleQuote == token_kind {
663                        self.mode = LexerMode::Script;
664                    }
665
666                    Some(Ok(self.token(token_kind, buffer, start, end)))
667                }
668                Interpolation::Until(offset) => {
669                    self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None))
670                }
671            },
672            LexerMode::ShellExecuteString(interpolation) => match &interpolation {
673                Interpolation::None => {
674                    let start = self.input.current_position();
675
676                    let mut length = 0;
677                    let mut last_was_slash = false;
678                    let mut token_kind = TokenKind::StringPart;
679                    loop {
680                        match self.input.peek(length, 2) {
681                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
682                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
683
684                                self.mode =
685                                    LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
686
687                                break;
688                            }
689                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
690                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
691
692                                self.mode =
693                                    LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
694
695                                break;
696                            }
697                            [b'\\', ..] => {
698                                length += 1;
699                                last_was_slash = true;
700                            }
701                            [b'`', ..] if !last_was_slash => {
702                                if length == 0 {
703                                    length += 1;
704                                    token_kind = TokenKind::Backtick;
705
706                                    break;
707                                }
708
709                                break;
710                            }
711                            [_, ..] => {
712                                length += 1;
713                                last_was_slash = false;
714                            }
715                            [] => {
716                                break;
717                            }
718                        }
719                    }
720
721                    let buffer = self.input.consume(length);
722                    let end = self.input.current_position();
723
724                    if TokenKind::Backtick == token_kind {
725                        self.mode = LexerMode::Script;
726                    }
727
728                    Some(Ok(self.token(token_kind, buffer, start, end)))
729                }
730                Interpolation::Until(offset) => {
731                    self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None))
732                }
733            },
734            LexerMode::DocumentString(kind, label, interpolation) => match &kind {
735                DocumentKind::Heredoc => match &interpolation {
736                    Interpolation::None => {
737                        let start = self.input.current_position();
738
739                        let mut length = 0;
740                        let mut last_was_slash = false;
741                        let mut only_whitespaces = true;
742                        let mut token_kind = TokenKind::StringPart;
743                        loop {
744                            match self.input.peek(length, 2) {
745                                [b'\r', b'\n'] => {
746                                    length += 2;
747
748                                    break;
749                                }
750                                [b'\n' | b'\r', ..] => {
751                                    length += 1;
752
753                                    break;
754                                }
755                                [byte, ..] if byte.is_ascii_whitespace() => {
756                                    length += 1;
757                                }
758                                [b'$', start_of_identifier!(), ..] if !last_was_slash => {
759                                    let until_offset =
760                                        read_until_end_of_variable_interpolation(&self.input, length + 2);
761
762                                    self.mode = LexerMode::DocumentString(
763                                        kind,
764                                        label,
765                                        Interpolation::Until(start.offset + until_offset),
766                                    );
767
768                                    break;
769                                }
770                                [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
771                                    let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
772
773                                    self.mode = LexerMode::DocumentString(
774                                        kind,
775                                        label,
776                                        Interpolation::Until(start.offset + until_offset),
777                                    );
778
779                                    break;
780                                }
781                                [b'\\', ..] => {
782                                    length += 1;
783                                    last_was_slash = true;
784                                    only_whitespaces = false;
785                                }
786                                [_, ..] => {
787                                    if only_whitespaces
788                                        && self.input.peek(length, label.len()) == label
789                                        && self
790                                            .input
791                                            .peek(length + label.len(), 1)
792                                            .first()
793                                            .is_none_or(|c| !c.is_ascii_alphanumeric())
794                                    {
795                                        length += label.len();
796                                        token_kind = TokenKind::DocumentEnd;
797
798                                        break;
799                                    }
800
801                                    length += 1;
802                                    last_was_slash = false;
803                                    only_whitespaces = false;
804                                }
805                                [] => {
806                                    break;
807                                }
808                            }
809                        }
810
811                        let buffer = self.input.consume(length);
812                        let end = self.input.current_position();
813
814                        if TokenKind::DocumentEnd == token_kind {
815                            self.mode = LexerMode::Script;
816                        }
817
818                        Some(Ok(self.token(token_kind, buffer, start, end)))
819                    }
820                    Interpolation::Until(offset) => {
821                        self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None))
822                    }
823                },
824                DocumentKind::Nowdoc => {
825                    let start = self.input.current_position();
826
827                    let mut length = 0;
828                    let mut terminated = false;
829                    let mut only_whitespaces = true;
830
831                    loop {
832                        match self.input.peek(length, 2) {
833                            [b'\r', b'\n'] => {
834                                length += 2;
835
836                                break;
837                            }
838                            [b'\n' | b'\r', ..] => {
839                                length += 1;
840
841                                break;
842                            }
843                            [byte, ..] if byte.is_ascii_whitespace() => {
844                                length += 1;
845                            }
846                            [_, ..] => {
847                                if only_whitespaces
848                                    && self.input.peek(length, label.len()) == label
849                                    && self
850                                        .input
851                                        .peek(length + label.len(), 1)
852                                        .first()
853                                        .is_none_or(|c| !c.is_ascii_alphanumeric())
854                                {
855                                    length += label.len();
856                                    terminated = true;
857
858                                    break;
859                                }
860
861                                only_whitespaces = false;
862                                length += 1;
863                            }
864                            [] => {
865                                break;
866                            }
867                        }
868                    }
869
870                    let buffer = self.input.consume(length);
871                    let end = self.input.current_position();
872
873                    if terminated {
874                        self.mode = LexerMode::Script;
875
876                        return Some(Ok(self.token(TokenKind::DocumentEnd, buffer, start, end)));
877                    }
878
879                    Some(Ok(self.token(TokenKind::StringPart, buffer, start, end)))
880                }
881            },
882            LexerMode::Halt(stage) => 'halt: {
883                let start = self.input.current_position();
884                if let HaltStage::End = stage {
885                    let buffer = self.input.consume_remaining();
886                    let end = self.input.current_position();
887
888                    break 'halt Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
889                }
890
891                let whitespaces = self.input.consume_whitespaces();
892                if !whitespaces.is_empty() {
893                    let end = self.input.current_position();
894
895                    break 'halt Some(Ok(self.token(TokenKind::Whitespace, whitespaces, start, end)));
896                }
897
898                match &stage {
899                    HaltStage::LookingForLeftParenthesis => {
900                        if self.input.is_at(b"(", false) {
901                            let buffer = self.input.consume(1);
902                            let end = self.input.current_position();
903
904                            self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
905
906                            Some(Ok(self.token(TokenKind::LeftParenthesis, buffer, start, end)))
907                        } else {
908                            Some(Err(SyntaxError::UnexpectedToken(
909                                self.file_id(),
910                                self.input.read(1)[0],
911                                self.input.current_position(),
912                            )))
913                        }
914                    }
915                    HaltStage::LookingForRightParenthesis => {
916                        if self.input.is_at(b")", false) {
917                            let buffer = self.input.consume(1);
918                            let end = self.input.current_position();
919
920                            self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
921
922                            Some(Ok(self.token(TokenKind::RightParenthesis, buffer, start, end)))
923                        } else {
924                            Some(Err(SyntaxError::UnexpectedToken(
925                                self.file_id(),
926                                self.input.read(1)[0],
927                                self.input.current_position(),
928                            )))
929                        }
930                    }
931                    HaltStage::LookingForTerminator => {
932                        if self.input.is_at(b";", false) {
933                            let buffer = self.input.consume(1);
934                            let end = self.input.current_position();
935
936                            self.mode = LexerMode::Halt(HaltStage::End);
937
938                            Some(Ok(self.token(TokenKind::Semicolon, buffer, start, end)))
939                        } else if self.input.is_at(b"?>", false) {
940                            let buffer = self.input.consume(2);
941                            let end = self.input.current_position();
942
943                            self.mode = LexerMode::Halt(HaltStage::End);
944
945                            Some(Ok(self.token(TokenKind::CloseTag, buffer, start, end)))
946                        } else {
947                            Some(Err(SyntaxError::UnexpectedToken(
948                                self.file_id(),
949                                self.input.read(1)[0],
950                                self.input.current_position(),
951                            )))
952                        }
953                    }
954                    _ => unreachable!(),
955                }
956            }
957        }
958    }
959
960    #[inline]
961    fn token(&mut self, kind: TokenKind, v: &[u8], from: Position, to: Position) -> Token<'arena> {
962        // SAFETY: The input bytes are guaranteed to be valid UTF-8 because:
963        // 1. File contents are validated via simdutf8 during database loading
964        // 2. Invalid UTF-8 is converted lossily before reaching the lexer
965        // 3. All byte slices here are subslices of the validated input
966        let string = unsafe { std::str::from_utf8_unchecked(v) };
967
968        Token { kind, value: self.arena.alloc_str(string), span: Span::new(self.file_id(), from, to) }
969    }
970
971    #[inline]
972    fn interpolation(
973        &mut self,
974        end_offset: u32,
975        post_interpolation_mode: LexerMode<'arena>,
976    ) -> Option<Result<Token<'arena>, SyntaxError>> {
977        self.mode = LexerMode::Script;
978
979        let was_interpolating = self.interpolating;
980        self.interpolating = true;
981
982        loop {
983            let subsequent_token = self.advance()?.ok()?;
984            let is_final_token = subsequent_token.span.has_offset(end_offset);
985
986            self.buffer.push_back(subsequent_token);
987
988            if is_final_token {
989                break;
990            }
991        }
992
993        self.mode = post_interpolation_mode;
994        self.interpolating = was_interpolating;
995
996        self.advance()
997    }
998}
999
1000impl HasFileId for Lexer<'_, '_> {
1001    #[inline]
1002    fn file_id(&self) -> FileId {
1003        self.input.file_id()
1004    }
1005}
1006
1007#[inline]
1008fn matches_start_of_heredoc_document(input: &Input) -> bool {
1009    let total = input.len();
1010    let base = input.current_offset();
1011
1012    // Start after the fixed opener (3 bytes).
1013    let mut length = 3;
1014    // Consume any following whitespace.
1015    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1016        length += 1;
1017    }
1018
1019    // The next byte must be a valid start-of-identifier.
1020    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1021        return false;
1022    }
1023    length += 1; // Include that identifier start.
1024
1025    // Now continue reading identifier characters until a newline is found.
1026    loop {
1027        let pos = base + length;
1028        if pos >= total {
1029            return false; // Unexpected EOF
1030        }
1031
1032        let byte = *input.read_at(pos);
1033        if byte == b'\n' {
1034            return true; // Newline found: valid heredoc opener.
1035        } else if byte == b'\r' {
1036            // Handle CRLF: treat '\r' followed by '\n' as a newline as well.
1037            return pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1038        } else if is_part_of_identifier(input.read_at(pos)) {
1039            length += 1;
1040        } else {
1041            return false; // Unexpected character.
1042        }
1043    }
1044}
1045
1046#[inline]
1047fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1048    let total = input.len();
1049    let base = input.current_offset();
1050
1051    // Start after the fixed opener (3 bytes), then skip any whitespace.
1052    let mut length = 3;
1053    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1054        length += 1;
1055    }
1056
1057    // Next, expect an opening double quote.
1058    if base + length >= total || *input.read_at(base + length) != b'"' {
1059        return false;
1060    }
1061    length += 1;
1062
1063    // The following byte must be a valid start-of-identifier.
1064    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1065        return false;
1066    }
1067    length += 1;
1068
1069    // Now scan the label. For double‑quoted heredoc, a terminating double quote is required.
1070    let mut terminated = false;
1071    loop {
1072        let pos = base + length;
1073        if pos >= total {
1074            return false;
1075        }
1076        let byte = input.read_at(pos);
1077        if *byte == b'\n' {
1078            // End of line: valid only if a closing double quote was encountered.
1079            return terminated;
1080        } else if *byte == b'\r' {
1081            // Handle CRLF sequences.
1082            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1083        } else if !terminated && is_part_of_identifier(byte) {
1084            length += 1;
1085        } else if !terminated && *byte == b'"' {
1086            terminated = true;
1087            length += 1;
1088        } else {
1089            return false;
1090        }
1091    }
1092}
1093
1094#[inline]
1095fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1096    let total = input.len();
1097    let base = input.current_offset();
1098
1099    // Start after the fixed opener (3 bytes) and skip whitespace.
1100    let mut length = 3;
1101    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1102        length += 1;
1103    }
1104
1105    // Now, the next byte must be a single quote.
1106    if base + length >= total || *input.read_at(base + length) != b'\'' {
1107        return false;
1108    }
1109    length += 1;
1110
1111    // The following byte must be a valid start-of-identifier.
1112    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1113        return false;
1114    }
1115    length += 1;
1116
1117    // Read the label until a newline. A terminating single quote is required.
1118    let mut terminated = false;
1119    loop {
1120        let pos = base + length;
1121        if pos >= total {
1122            return false;
1123        }
1124        let byte = *input.read_at(pos);
1125        if byte == b'\n' {
1126            return terminated;
1127        } else if byte == b'\r' {
1128            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1129        } else if !terminated && is_part_of_identifier(&byte) {
1130            length += 1;
1131        } else if !terminated && byte == b'\'' {
1132            terminated = true;
1133            length += 1;
1134        } else {
1135            return false;
1136        }
1137    }
1138}
1139
1140#[inline]
1141fn matches_literal_double_quote_string(input: &Input) -> bool {
1142    let total = input.len();
1143    let base = input.current_offset();
1144
1145    // Start after the initial double-quote (assumed consumed).
1146    let mut pos = base + 1;
1147    loop {
1148        if pos >= total {
1149            // Reached EOF: assume literal is complete.
1150            return true;
1151        }
1152        let byte = *input.read_at(pos);
1153        if byte == b'"' {
1154            // Encounter a closing double quote.
1155            return true;
1156        } else if byte == b'\\' {
1157            // Skip an escape sequence: assume that the backslash and the escaped character form a pair.
1158            pos += 2;
1159            continue;
1160        }
1161
1162        // Check for variable interpolation or complex expression start:
1163        // If two-byte sequences match either "$" followed by a start-of-identifier or "{" and "$", then return false.
1164        if pos + 1 < total {
1165            let next = *input.read_at(pos + 1);
1166            if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1167                return false;
1168            }
1169        }
1170        pos += 1;
1171    }
1172}
1173
1174#[inline]
1175fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1176    let total = input.len();
1177    let base = input.current_offset();
1178
1179    // --- Block 1: Consume Whitespace ---
1180    // Start reading at offset base+3 (the fixed opener length).
1181    let mut pos = base + 3;
1182    let mut whitespaces = 0;
1183    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1184        whitespaces += 1;
1185        pos += 1;
1186    }
1187
1188    // --- Block 2: Calculate Initial Label Offset ---
1189    // The label (or delimiter) starts after:
1190    //   3 bytes + whitespace bytes + an extra offset:
1191    //      if double-quoted: 2 bytes (opening and closing quotes around the label)
1192    //      else: 1 byte.
1193    let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1194
1195    // --- Block 3: Read the Label ---
1196    let mut label_length = 1; // Start with at least one byte for the label.
1197    let mut terminated = false; // For double-quoted heredoc, to track the closing quote.
1198    loop {
1199        let pos = base + length;
1200        // Ensure we haven't run past the input.
1201        if pos >= total {
1202            unreachable!("Unexpected end of input while reading heredoc label");
1203        }
1204
1205        let byte = *input.read_at(pos);
1206        if byte == b'\n' {
1207            // Newline ends the label.
1208            length += 1;
1209            return (length, whitespaces, label_length);
1210        } else if byte == b'\r' {
1211            // Handle CRLF sequences
1212            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1213                length += 2;
1214            } else {
1215                length += 1;
1216            }
1217            return (length, whitespaces, label_length);
1218        } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1219            // For both unquoted and double-quoted (before the closing quote) heredoc,
1220            // a valid identifier character is part of the label.
1221            length += 1;
1222            label_length += 1;
1223        } else if double_quoted && !terminated && byte == b'"' {
1224            // In a double-quoted heredoc, a double quote terminates the label.
1225            length += 1;
1226            terminated = true;
1227        } else {
1228            unreachable!("Unexpected character encountered in heredoc label");
1229        }
1230    }
1231}
1232
1233#[inline]
1234fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1235    let total = input.len();
1236    let base = input.current_offset();
1237
1238    // --- Block 1: Consume Whitespace ---
1239    let mut pos = base + 3;
1240    let mut whitespaces = 0;
1241    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1242        whitespaces += 1;
1243        pos += 1;
1244    }
1245
1246    // --- Block 2: Calculate Initial Label Offset ---
1247    // For nowdoc, the fixed extra offset is always 2.
1248    let mut length = 3 + whitespaces + 2;
1249
1250    // --- Block 3: Read the Label ---
1251    let mut label_length = 1;
1252    let mut terminated = false;
1253    loop {
1254        let pos = base + length;
1255        if pos >= total {
1256            unreachable!("Unexpected end of input while reading nowdoc label");
1257        }
1258        let byte = *input.read_at(pos);
1259
1260        if byte == b'\n' {
1261            // A newline indicates the end of the label.
1262            length += 1;
1263            return (length, whitespaces, label_length);
1264        } else if byte == b'\r' {
1265            // Handle CRLF sequences
1266            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1267                length += 2;
1268            } else {
1269                length += 1;
1270            }
1271            return (length, whitespaces, label_length);
1272        } else if is_part_of_identifier(&byte) && !terminated {
1273            // For nowdoc, identifier characters contribute to the label until terminated.
1274            length += 1;
1275            label_length += 1;
1276        } else if !terminated && byte == b'\'' {
1277            // A single quote terminates the nowdoc label.
1278            length += 1;
1279            terminated = true;
1280        } else {
1281            unreachable!("Unexpected character encountered in nowdoc label");
1282        }
1283    }
1284}
1285
1286#[inline]
1287fn read_literal_string(input: &Input, quote: u8) -> (TokenKind, usize) {
1288    let total = input.len();
1289    let start = input.current_offset();
1290    let mut length = 1; // We assume the opening quote is already consumed.
1291    let mut last_was_backslash = false;
1292    let mut partial = false;
1293
1294    loop {
1295        let pos = start + length;
1296        if pos >= total {
1297            // Reached EOF before closing quote.
1298            partial = true;
1299            break;
1300        }
1301
1302        let byte = input.read_at(pos);
1303        if *byte == b'\\' {
1304            // Toggle the backslash flag.
1305            last_was_backslash = !last_was_backslash;
1306            length += 1;
1307        } else {
1308            // If we see the closing quote and the previous byte was not an escape.
1309            if *byte == quote && !last_was_backslash {
1310                length += 1; // Include the closing quote.
1311                break;
1312            }
1313            length += 1;
1314            last_was_backslash = false;
1315        }
1316    }
1317
1318    if partial { (TokenKind::PartialLiteralString, length) } else { (TokenKind::LiteralString, length) }
1319}
1320
1321#[inline]
1322fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> u32 {
1323    let total = input.len();
1324    let base = input.current_offset();
1325    // `offset` is relative to the current position.
1326    let mut offset = from;
1327
1328    loop {
1329        let abs = base + offset;
1330        if abs >= total {
1331            // End of input.
1332            break;
1333        }
1334
1335        // Pattern 1: If the current byte is part of an identifier, simply advance.
1336        if is_part_of_identifier(input.read_at(abs)) {
1337            offset += 1;
1338            continue;
1339        }
1340
1341        // Pattern 2: If the current byte is a '[' then we enter a bracketed interpolation.
1342        if *input.read_at(abs) == b'[' {
1343            offset += 1;
1344            let mut nesting = 0;
1345            loop {
1346                let abs_inner = base + offset;
1347                if abs_inner >= total {
1348                    break;
1349                }
1350                let b = input.read_at(abs_inner);
1351                if *b == b']' {
1352                    offset += 1;
1353                    if nesting == 0 {
1354                        break;
1355                    }
1356
1357                    nesting -= 1;
1358                } else if *b == b'[' {
1359                    offset += 1;
1360                    nesting += 1;
1361                } else if b.is_ascii_whitespace() {
1362                    // Do not include whitespace.
1363                    break;
1364                } else {
1365                    offset += 1;
1366                }
1367            }
1368            // When bracketed interpolation is processed, exit the loop.
1369            break;
1370        }
1371
1372        // Pattern 3: Check for "->" followed by a valid identifier start.
1373        if base + offset + 2 < total
1374            && *input.read_at(abs) == b'-'
1375            && *input.read_at(base + offset + 1) == b'>'
1376            && is_start_of_identifier(input.read_at(base + offset + 2))
1377        {
1378            offset += 3;
1379            // Consume any following identifier characters.
1380            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1381                offset += 1;
1382            }
1383            break;
1384        }
1385
1386        // Pattern 4: Check for "?->" followed by a valid identifier start.
1387        if base + offset + 3 < total
1388            && *input.read_at(abs) == b'?'
1389            && *input.read_at(base + offset + 1) == b'-'
1390            && *input.read_at(base + offset + 2) == b'>'
1391            && is_start_of_identifier(input.read_at(base + offset + 3))
1392        {
1393            offset += 4;
1394            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1395                offset += 1;
1396            }
1397            break;
1398        }
1399
1400        // None of the expected patterns matched: exit the loop.
1401        break;
1402    }
1403
1404    offset as u32
1405}
1406
1407#[inline]
1408fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> u32 {
1409    let total = input.len();
1410    let base = input.current_offset();
1411    let mut offset = from;
1412    let mut nesting = 0;
1413
1414    loop {
1415        let abs = base + offset;
1416        if abs >= total {
1417            break;
1418        }
1419        match input.read_at(abs) {
1420            b'}' => {
1421                offset += 1;
1422                if nesting == 0 {
1423                    break;
1424                }
1425
1426                nesting -= 1;
1427            }
1428            b'{' => {
1429                offset += 1;
1430                nesting += 1;
1431            }
1432            _ => {
1433                offset += 1;
1434            }
1435        }
1436    }
1437
1438    offset as u32
1439}
mago_syntax/lexer/mod.rs

mago_syntax/lexer/
mod.rs