mago_syntax/lexer/
mod.rs

1use std::collections::VecDeque;
2use std::fmt::Debug;
3
4use bumpalo::Bump;
5use mago_database::file::FileId;
6use mago_database::file::HasFileId;
7use mago_span::Position;
8use mago_span::Span;
9
10use mago_syntax_core::input::Input;
11use mago_syntax_core::utils::is_part_of_identifier;
12use mago_syntax_core::utils::is_start_of_identifier;
13use mago_syntax_core::utils::read_digits_of_base;
14use mago_syntax_core::*;
15
16use crate::error::SyntaxError;
17use crate::lexer::internal::mode::HaltStage;
18use crate::lexer::internal::mode::Interpolation;
19use crate::lexer::internal::mode::LexerMode;
20use crate::lexer::internal::utils::NumberKind;
21use crate::token::DocumentKind;
22use crate::token::Token;
23use crate::token::TokenKind;
24
25mod internal;
26
27/// The `Lexer` struct is responsible for tokenizing input source code into discrete tokens
28/// based on PHP language syntax. It is designed to work with PHP code from version 7.0 up to 8.4.
29///
30/// The lexer reads through the provided input and processes it accordingly.
31///
32/// It identifies PHP-specific tokens, including operators, keywords, comments, strings, and other syntax elements,
33/// and produces a sequence of [`Token`] objects that are used in further stages of compilation or interpretation.
34///
35/// The lexer is designed to be used in a streaming fashion, where it reads the input source code in chunks
36/// and produces tokens incrementally. This allows for efficient processing of large source files and
37/// minimizes memory usage.
38#[derive(Debug)]
39pub struct Lexer<'input, 'arena> {
40    arena: &'arena Bump,
41    input: Input<'input>,
42    mode: LexerMode<'arena>,
43    interpolating: bool,
44    buffer: VecDeque<Token<'arena>>,
45}
46
47impl<'input, 'arena> Lexer<'input, 'arena> {
48    /// Creates a new `Lexer` instance.
49    ///
50    /// # Parameters
51    ///
52    /// - `arena`: The arena to use for allocating tokens.
53    /// - `input`: The input source code to tokenize.
54    ///
55    /// # Returns
56    ///
57    /// A new `Lexer` instance that reads from the provided byte slice.
58    pub fn new(arena: &'arena Bump, input: Input<'input>) -> Lexer<'input, 'arena> {
59        Lexer { arena, input, mode: LexerMode::Inline, interpolating: false, buffer: VecDeque::new() }
60    }
61
62    /// Creates a new `Lexer` instance for parsing a script block.
63    ///
64    /// # Parameters
65    ///
66    /// - `arena`: The arena to use for allocating tokens.
67    /// - `input`: The input source code to tokenize.
68    ///
69    /// # Returns
70    ///
71    /// A new `Lexer` instance that reads from the provided byte slice.
72    pub fn scripting(arena: &'arena Bump, input: Input<'input>) -> Lexer<'input, 'arena> {
73        Lexer { arena, input, mode: LexerMode::Script, interpolating: false, buffer: VecDeque::new() }
74    }
75
76    /// Check if the lexer has reached the end of the input.
77    ///
78    /// If this method returns `true`, the lexer will not produce any more tokens.
79    pub fn has_reached_eof(&self) -> bool {
80        self.input.has_reached_eof()
81    }
82
83    /// Get the current position of the lexer in the input source code.
84    pub fn get_position(&self) -> Position {
85        self.input.current_position()
86    }
87
88    /// Tokenizes the next input from the source code.
89    ///
90    /// This method reads from the input and produces the next [`Token`] based on the current [`LexerMode`].
91    /// It handles various lexical elements such as inline text, script code, strings with interpolation,
92    /// comments, and different PHP-specific constructs.
93    ///
94    /// # Returns
95    ///
96    /// - `Some(Ok(Token))` if a token was successfully parsed.
97    /// - `Some(Err(SyntaxError))` if a syntax error occurred while parsing the next token.
98    /// - `None` if the end of the input has been reached.
99    ///
100    /// # Notes
101    ///
102    /// - It efficiently handles tokenization by consuming input based on patterns specific to PHP syntax.
103    /// - The lexer supports complex features like string interpolation and different numeric formats.
104    ///
105    /// # Errors
106    ///
107    /// Returns `Some(Err(SyntaxError))` in cases such as:
108    ///
109    /// - Unrecognized tokens that do not match any known PHP syntax.
110    /// - Unexpected tokens in a given context, such as an unexpected end of string.
111    ///
112    /// # Panics
113    ///
114    /// This method should not panic under normal operation. If it does, it indicates a bug in the lexer implementation.
115    ///
116    /// # See Also
117    ///
118    /// - [`Token`]: Represents a lexical token with its kind, value, and span.
119    /// - [`SyntaxError`]: Represents errors that can occur during lexing.
120    #[inline]
121    pub fn advance(&mut self) -> Option<Result<Token<'arena>, SyntaxError>> {
122        if !self.interpolating
123            && let Some(token) = self.buffer.pop_front()
124        {
125            return Some(Ok(token));
126        }
127
128        if self.input.has_reached_eof() {
129            return None;
130        }
131
132        match self.mode {
133            LexerMode::Inline => {
134                let start = self.input.current_position();
135                if self.input.is_at(b"<?", false) {
136                    let (kind, buffer) = if self.input.is_at(b"<?php", true) {
137                        (TokenKind::OpenTag, self.input.consume(5))
138                    } else if self.input.is_at(b"<?=", false) {
139                        (TokenKind::EchoTag, self.input.consume(3))
140                    } else {
141                        (TokenKind::ShortOpenTag, self.input.consume(2))
142                    };
143
144                    let end = self.input.current_position();
145                    let tag = self.token(kind, buffer, start, end);
146
147                    self.mode = LexerMode::Script;
148
149                    return tag;
150                }
151
152                if self.input.is_at(b"#!", true) {
153                    let buffer = self.input.consume_through(b'\n');
154                    let end = self.input.current_position();
155
156                    self.token(TokenKind::InlineShebang, buffer, start, end)
157                } else {
158                    let buffer = self.input.consume_until(b"<?", false);
159                    let end = self.input.current_position();
160
161                    self.token(TokenKind::InlineText, buffer, start, end)
162                }
163            }
164            LexerMode::Script => {
165                let whitespaces = self.input.consume_whitespaces();
166                if !whitespaces.is_empty() {
167                    let start = self.input.current_position();
168                    let buffer = whitespaces;
169                    let end = self.input.current_position();
170
171                    return self.token(TokenKind::Whitespace, buffer, start, end);
172                }
173
174                let mut document_label: &[u8] = &[];
175
176                let (token_kind, len) = match self.input.read(3) {
177                    [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
178                    [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
179                    [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
180                    [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
181                    [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
182                    [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
183                    [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
184                    [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
185                    [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
186                    [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
187                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
188
189                        document_label = self.input.peek(3 + whitespaces, label_length);
190
191                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
192                    }
193                    [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
194                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
195
196                        document_label = self.input.peek(4 + whitespaces, label_length);
197
198                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
199                    }
200                    [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
201                        let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
202
203                        document_label = self.input.peek(4 + whitespaces, label_length);
204
205                        (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
206                    }
207                    [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
208                    [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
209                    [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
210                    [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
211                    [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
212                    [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
213                    [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
214                    [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
215                    [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
216                    [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
217                    [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
218                    [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
219                    [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
220                    [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
221                    [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
222                    [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
223                    [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
224                    [b'>', b'>', ..] => (TokenKind::RightShift, 2),
225                    [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
226                    [b':', b':', ..] => (TokenKind::ColonColon, 2),
227                    [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
228                    [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
229                    [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
230                    [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
231                    [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
232                    [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
233                    [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
234                    [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
235                    [b'/', b'/', ..] => {
236                        let mut length = 2;
237                        loop {
238                            match self.input.peek(length, 3) {
239                                [b'\n' | b'\r', ..] => {
240                                    break;
241                                }
242                                [w, b'?', b'>'] if w.is_ascii_whitespace() => {
243                                    break;
244                                }
245                                [b'?', b'>', ..] | [] => {
246                                    break;
247                                }
248                                [_, ..] => {
249                                    length += 1;
250                                }
251                            }
252                        }
253
254                        (TokenKind::SingleLineComment, length)
255                    }
256                    [b'/', b'*', asterisk] => {
257                        let mut length = 2;
258                        let mut is_multiline = false;
259                        let mut terminated = false;
260                        loop {
261                            match self.input.peek(length, 2) {
262                                [b'*', b'/'] => {
263                                    if length == 2 {
264                                        is_multiline = true;
265                                    }
266
267                                    length += 2;
268
269                                    terminated = true;
270                                    break;
271                                }
272                                [_, ..] => {
273                                    length += 1;
274                                }
275                                [] => {
276                                    break;
277                                }
278                            }
279                        }
280
281                        if !terminated {
282                            self.input.consume(length);
283
284                            return Some(Err(SyntaxError::UnexpectedEndOfFile(
285                                self.file_id(),
286                                self.input.current_position(),
287                            )));
288                        }
289
290                        if !is_multiline && asterisk == &b'*' {
291                            (TokenKind::DocBlockComment, length)
292                        } else {
293                            (TokenKind::MultiLineComment, length)
294                        }
295                    }
296                    [b'\\', start_of_identifier!(), ..] => {
297                        let mut length = 2;
298                        let mut last_was_slash = false;
299                        loop {
300                            match self.input.peek(length, 1) {
301                                [start_of_identifier!(), ..] if last_was_slash => {
302                                    length += 1;
303                                    last_was_slash = false;
304                                }
305                                [part_of_identifier!(), ..] if !last_was_slash => {
306                                    length += 1;
307                                }
308                                [b'\\', ..] => {
309                                    if last_was_slash {
310                                        length -= 1;
311
312                                        break;
313                                    }
314
315                                    length += 1;
316                                    last_was_slash = true;
317                                }
318                                _ => {
319                                    break;
320                                }
321                            }
322                        }
323
324                        if last_was_slash {
325                            length -= 1;
326                        }
327
328                        (TokenKind::FullyQualifiedIdentifier, length)
329                    }
330                    [b'$', start_of_identifier!(), ..] => {
331                        let mut length = 2;
332                        while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
333                            length += 1;
334                        }
335
336                        (TokenKind::Variable, length)
337                    }
338                    [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
339                    [b'$', ..] => (TokenKind::Dollar, 1),
340                    [b'@', ..] => (TokenKind::At, 1),
341                    [b'!', ..] => (TokenKind::Bang, 1),
342                    [b'&', ..] => (TokenKind::Ampersand, 1),
343                    [b'?', ..] => (TokenKind::Question, 1),
344                    [b'=', ..] => (TokenKind::Equal, 1),
345                    [b'`', ..] => (TokenKind::Backtick, 1),
346                    [b')', ..] => (TokenKind::RightParenthesis, 1),
347                    [b';', ..] => (TokenKind::Semicolon, 1),
348                    [b'+', ..] => (TokenKind::Plus, 1),
349                    [b'%', ..] => (TokenKind::Percent, 1),
350                    [b'-', ..] => (TokenKind::Minus, 1),
351                    [b'<', ..] => (TokenKind::LessThan, 1),
352                    [b'>', ..] => (TokenKind::GreaterThan, 1),
353                    [b',', ..] => (TokenKind::Comma, 1),
354                    [b'[', ..] => (TokenKind::LeftBracket, 1),
355                    [b']', ..] => (TokenKind::RightBracket, 1),
356                    [b'{', ..] => (TokenKind::LeftBrace, 1),
357                    [b'}', ..] => (TokenKind::RightBrace, 1),
358                    [b':', ..] => (TokenKind::Colon, 1),
359                    [b'~', ..] => (TokenKind::Tilde, 1),
360                    [b'|', ..] => (TokenKind::Pipe, 1),
361                    [b'^', ..] => (TokenKind::Caret, 1),
362                    [b'*', ..] => (TokenKind::Asterisk, 1),
363                    [b'/', ..] => (TokenKind::Slash, 1),
364                    [quote @ b'\'', ..] => read_literal_string(&self.input, quote),
365                    [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
366                        read_literal_string(&self.input, quote)
367                    }
368                    [b'"', ..] => (TokenKind::DoubleQuote, 1),
369                    [b'(', ..] => 'parenthesis: {
370                        for (value, kind) in internal::consts::CAST_TYPES {
371                            if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
372                                break 'parenthesis (kind, length);
373                            }
374                        }
375
376                        (TokenKind::LeftParenthesis, 1)
377                    }
378                    [b'#', ..] => {
379                        let mut length = 1;
380                        loop {
381                            match self.input.peek(length, 3) {
382                                [b'\n' | b'\r', ..] => {
383                                    break;
384                                }
385                                [w, b'?', b'>'] if w.is_ascii_whitespace() => {
386                                    break;
387                                }
388                                [b'?', b'>', ..] | [] => {
389                                    break;
390                                }
391                                [_, ..] => {
392                                    length += 1;
393                                }
394                            }
395                        }
396
397                        (TokenKind::HashComment, length)
398                    }
399                    [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
400                    [start_of_identifier!(), ..] => 'identifier: {
401                        let mut length = 1;
402                        let mut ended_with_slash = false;
403                        loop {
404                            match self.input.peek(length, 2) {
405                                [part_of_identifier!(), ..] => {
406                                    length += 1;
407                                }
408                                [b'\\', start_of_identifier!(), ..] => {
409                                    ended_with_slash = true;
410                                    break;
411                                }
412                                // special case for `private(set)`
413                                [b'(', ..] if length == 7 => {
414                                    if self.input.is_at(b"private(set)", true) {
415                                        break 'identifier (TokenKind::PrivateSet, 7 + 5);
416                                    }
417
418                                    break;
419                                }
420                                // special case for `public(set)`
421                                [b'(', ..] if length == 6 => {
422                                    if self.input.is_at(b"public(set)", true) {
423                                        break 'identifier (TokenKind::PublicSet, 6 + 5);
424                                    }
425
426                                    break;
427                                }
428                                // special case for `protected(set)`
429                                [b'(', ..] if length == 9 => {
430                                    if self.input.is_at(b"protected(set)", true) {
431                                        break 'identifier (TokenKind::ProtectedSet, 9 + 5);
432                                    }
433
434                                    break;
435                                }
436                                _ => {
437                                    break;
438                                }
439                            }
440                        }
441
442                        if !ended_with_slash {
443                            for (value, kind) in internal::consts::KEYWORD_TYPES {
444                                if value.len() != length {
445                                    continue;
446                                }
447
448                                if self.input.is_at(value, true) {
449                                    break 'identifier (kind, value.len());
450                                }
451                            }
452                        }
453
454                        let mut slashes = 0;
455                        let mut last_was_slash = false;
456                        loop {
457                            match self.input.peek(length, 1) {
458                                [start_of_identifier!(), ..] if last_was_slash => {
459                                    length += 1;
460                                    last_was_slash = false;
461                                }
462                                [part_of_identifier!(), ..] if !last_was_slash => {
463                                    length += 1;
464                                }
465                                [b'\\', ..] if !self.interpolating => {
466                                    if !last_was_slash {
467                                        length += 1;
468                                        slashes += 1;
469                                        last_was_slash = true;
470                                    } else {
471                                        length -= 1;
472                                        slashes -= 1;
473                                        last_was_slash = false;
474
475                                        break;
476                                    }
477                                }
478                                _ => {
479                                    break;
480                                }
481                            }
482                        }
483
484                        if last_was_slash {
485                            length -= 1;
486                            slashes -= 1;
487                        }
488
489                        if slashes > 0 {
490                            (TokenKind::QualifiedIdentifier, length)
491                        } else {
492                            (TokenKind::Identifier, length)
493                        }
494                    }
495                    [b'.', start_of_number!(), ..] => {
496                        let mut length = read_digits_of_base(&self.input, 2, 10);
497                        if let float_exponent!() = self.input.peek(length, 1) {
498                            length += 1;
499                            if let number_sign!() = self.input.peek(length, 1) {
500                                length += 1;
501                            }
502
503                            length = read_digits_of_base(&self.input, length, 10);
504                        }
505
506                        (TokenKind::LiteralFloat, length)
507                    }
508                    [start_of_number!(), ..] => 'number: {
509                        let mut length = 1;
510
511                        let (base, kind): (u8, NumberKind) = match self.input.read(3) {
512                            start_of_binary_number!() => {
513                                length += 1;
514
515                                (2, NumberKind::Integer)
516                            }
517                            start_of_octal_number!() => {
518                                length += 1;
519
520                                (8, NumberKind::Integer)
521                            }
522                            start_of_hexadecimal_number!() => {
523                                length += 1;
524
525                                (16, NumberKind::Integer)
526                            }
527                            start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
528                            start_of_float_number!() => (10, NumberKind::Float),
529                            _ => (10, NumberKind::IntegerOrFloat),
530                        };
531
532                        if kind != NumberKind::Float {
533                            length = read_digits_of_base(&self.input, length, base);
534
535                            if kind == NumberKind::Integer {
536                                break 'number (TokenKind::LiteralInteger, length);
537                            }
538                        }
539
540                        let is_float = matches!(self.input.peek(length, 3), float_separator!());
541
542                        if !is_float {
543                            break 'number (TokenKind::LiteralInteger, length);
544                        }
545
546                        if let [b'.'] = self.input.peek(length, 1) {
547                            length += 1;
548                            length = read_digits_of_base(&self.input, length, 10);
549                        }
550
551                        if let float_exponent!() = self.input.peek(length, 1) {
552                            length += 1;
553                            if let number_sign!() = self.input.peek(length, 1) {
554                                length += 1;
555                            }
556
557                            length = read_digits_of_base(&self.input, length, 10);
558                        }
559
560                        (TokenKind::LiteralFloat, length)
561                    }
562                    [b'.', ..] => (TokenKind::Dot, 1),
563                    [unknown_byte, ..] => {
564                        return Some(Err(SyntaxError::UnrecognizedToken(
565                            self.file_id(),
566                            *unknown_byte,
567                            self.input.current_position(),
568                        )));
569                    }
570                    [] => {
571                        // we check for EOF before entering scripting section,
572                        // so this should be unreachable.
573                        unreachable!()
574                    }
575                };
576
577                self.mode = match token_kind {
578                    TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
579                    TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
580                    TokenKind::CloseTag => LexerMode::Inline,
581                    TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
582                    TokenKind::DocumentStart(document_kind) => LexerMode::DocumentString(
583                        document_kind,
584                        self.arena.alloc_slice_copy(document_label),
585                        Interpolation::None,
586                    ),
587                    _ => LexerMode::Script,
588                };
589
590                let start = self.input.current_position();
591                let buffer = self.input.consume(len);
592                let end = self.input.current_position();
593
594                self.token(token_kind, buffer, start, end)
595            }
596            LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
597                Interpolation::None => {
598                    let start = self.input.current_position();
599
600                    let mut length = 0;
601                    let mut last_was_slash = false;
602                    let mut token_kind = TokenKind::StringPart;
603                    loop {
604                        match self.input.peek(length, 2) {
605                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
606                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
607
608                                self.mode =
609                                    LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
610
611                                break;
612                            }
613                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
614                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
615
616                                self.mode =
617                                    LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
618
619                                break;
620                            }
621                            [b'\\', ..] => {
622                                length += 1;
623
624                                last_was_slash = !last_was_slash;
625                            }
626                            [b'"', ..] if !last_was_slash => {
627                                if length == 0 {
628                                    length += 1;
629                                    token_kind = TokenKind::DoubleQuote;
630
631                                    break;
632                                }
633
634                                break;
635                            }
636                            [_, ..] => {
637                                length += 1;
638                                last_was_slash = false;
639                            }
640                            [] => {
641                                break;
642                            }
643                        }
644                    }
645
646                    let buffer = self.input.consume(length);
647                    let end = self.input.current_position();
648
649                    if TokenKind::DoubleQuote == token_kind {
650                        self.mode = LexerMode::Script;
651                    }
652
653                    self.token(token_kind, buffer, start, end)
654                }
655                Interpolation::Until(offset) => {
656                    self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None))
657                }
658            },
659            LexerMode::ShellExecuteString(interpolation) => match &interpolation {
660                Interpolation::None => {
661                    let start = self.input.current_position();
662
663                    let mut length = 0;
664                    let mut last_was_slash = false;
665                    let mut token_kind = TokenKind::StringPart;
666                    loop {
667                        match self.input.peek(length, 2) {
668                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
669                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
670
671                                self.mode =
672                                    LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
673
674                                break;
675                            }
676                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
677                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
678
679                                self.mode =
680                                    LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
681
682                                break;
683                            }
684                            [b'\\', ..] => {
685                                length += 1;
686                                last_was_slash = true;
687                            }
688                            [b'`', ..] if !last_was_slash => {
689                                if length == 0 {
690                                    length += 1;
691                                    token_kind = TokenKind::Backtick;
692
693                                    break;
694                                }
695
696                                break;
697                            }
698                            [_, ..] => {
699                                length += 1;
700                                last_was_slash = false;
701                            }
702                            [] => {
703                                break;
704                            }
705                        }
706                    }
707
708                    let buffer = self.input.consume(length);
709                    let end = self.input.current_position();
710
711                    if TokenKind::Backtick == token_kind {
712                        self.mode = LexerMode::Script;
713                    }
714
715                    self.token(token_kind, buffer, start, end)
716                }
717                Interpolation::Until(offset) => {
718                    self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None))
719                }
720            },
721            LexerMode::DocumentString(kind, label, interpolation) => match &kind {
722                DocumentKind::Heredoc => match &interpolation {
723                    Interpolation::None => {
724                        let start = self.input.current_position();
725
726                        let mut length = 0;
727                        let mut last_was_slash = false;
728                        let mut only_whitespaces = true;
729                        let mut token_kind = TokenKind::StringPart;
730                        loop {
731                            match self.input.peek(length, 2) {
732                                [b'\r', b'\n'] => {
733                                    length += 2;
734
735                                    break;
736                                }
737                                [b'\n', ..] | [b'\r', ..] => {
738                                    length += 1;
739
740                                    break;
741                                }
742                                [byte, ..] if byte.is_ascii_whitespace() => {
743                                    length += 1;
744                                }
745                                [b'$', start_of_identifier!(), ..] if !last_was_slash => {
746                                    let until_offset =
747                                        read_until_end_of_variable_interpolation(&self.input, length + 2);
748
749                                    self.mode = LexerMode::DocumentString(
750                                        kind,
751                                        label,
752                                        Interpolation::Until(start.offset + until_offset),
753                                    );
754
755                                    break;
756                                }
757                                [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
758                                    let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
759
760                                    self.mode = LexerMode::DocumentString(
761                                        kind,
762                                        label,
763                                        Interpolation::Until(start.offset + until_offset),
764                                    );
765
766                                    break;
767                                }
768                                [b'\\', ..] => {
769                                    length += 1;
770                                    last_was_slash = true;
771                                    only_whitespaces = false;
772                                }
773                                [_, ..] => {
774                                    if only_whitespaces
775                                        && self.input.peek(length, label.len()) == label
776                                        && self
777                                            .input
778                                            .peek(length + label.len(), 1)
779                                            .first()
780                                            .is_none_or(|c| !c.is_ascii_alphanumeric())
781                                    {
782                                        length += label.len();
783                                        token_kind = TokenKind::DocumentEnd;
784
785                                        break;
786                                    }
787
788                                    length += 1;
789                                    last_was_slash = false;
790                                    only_whitespaces = false;
791                                }
792                                [] => {
793                                    break;
794                                }
795                            }
796                        }
797
798                        let buffer = self.input.consume(length);
799                        let end = self.input.current_position();
800
801                        if TokenKind::DocumentEnd == token_kind {
802                            self.mode = LexerMode::Script;
803                        }
804
805                        self.token(token_kind, buffer, start, end)
806                    }
807                    Interpolation::Until(offset) => {
808                        self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None))
809                    }
810                },
811                DocumentKind::Nowdoc => {
812                    let start = self.input.current_position();
813
814                    let mut length = 0;
815                    let mut terminated = false;
816                    let mut only_whitespaces = true;
817
818                    loop {
819                        match self.input.peek(length, 2) {
820                            [b'\r', b'\n'] => {
821                                length += 2;
822
823                                break;
824                            }
825                            [b'\n', ..] | [b'\r', ..] => {
826                                length += 1;
827
828                                break;
829                            }
830                            [byte, ..] if byte.is_ascii_whitespace() => {
831                                length += 1;
832                            }
833                            [_, ..] => {
834                                if only_whitespaces
835                                    && self.input.peek(length, label.len()) == label
836                                    && self
837                                        .input
838                                        .peek(length + label.len(), 1)
839                                        .first()
840                                        .is_none_or(|c| !c.is_ascii_alphanumeric())
841                                {
842                                    length += label.len();
843                                    terminated = true;
844
845                                    break;
846                                }
847
848                                only_whitespaces = false;
849                                length += 1;
850                            }
851                            [] => {
852                                break;
853                            }
854                        }
855                    }
856
857                    let buffer = self.input.consume(length);
858                    let end = self.input.current_position();
859
860                    if terminated {
861                        self.mode = LexerMode::Script;
862
863                        return self.token(TokenKind::DocumentEnd, buffer, start, end);
864                    }
865
866                    self.token(TokenKind::StringPart, buffer, start, end)
867                }
868            },
869            LexerMode::Halt(stage) => 'halt: {
870                let start = self.input.current_position();
871                if let HaltStage::End = stage {
872                    let buffer = self.input.consume_remaining();
873                    let end = self.input.current_position();
874
875                    break 'halt self.token(TokenKind::InlineText, buffer, start, end);
876                }
877
878                let whitespaces = self.input.consume_whitespaces();
879                if !whitespaces.is_empty() {
880                    let end = self.input.current_position();
881
882                    break 'halt self.token(TokenKind::Whitespace, whitespaces, start, end);
883                }
884
885                match &stage {
886                    HaltStage::LookingForLeftParenthesis => {
887                        if self.input.is_at(b"(", false) {
888                            let buffer = self.input.consume(1);
889                            let end = self.input.current_position();
890
891                            self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
892
893                            self.token(TokenKind::LeftParenthesis, buffer, start, end)
894                        } else {
895                            Some(Err(SyntaxError::UnexpectedToken(
896                                self.file_id(),
897                                self.input.read(1)[0],
898                                self.input.current_position(),
899                            )))
900                        }
901                    }
902                    HaltStage::LookingForRightParenthesis => {
903                        if self.input.is_at(b")", false) {
904                            let buffer = self.input.consume(1);
905                            let end = self.input.current_position();
906
907                            self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
908
909                            self.token(TokenKind::RightParenthesis, buffer, start, end)
910                        } else {
911                            Some(Err(SyntaxError::UnexpectedToken(
912                                self.file_id(),
913                                self.input.read(1)[0],
914                                self.input.current_position(),
915                            )))
916                        }
917                    }
918                    HaltStage::LookingForTerminator => {
919                        if self.input.is_at(b";", false) {
920                            let buffer = self.input.consume(1);
921                            let end = self.input.current_position();
922
923                            self.mode = LexerMode::Halt(HaltStage::End);
924
925                            self.token(TokenKind::Semicolon, buffer, start, end)
926                        } else if self.input.is_at(b"?>", false) {
927                            let buffer = self.input.consume(2);
928                            let end = self.input.current_position();
929
930                            self.mode = LexerMode::Halt(HaltStage::End);
931
932                            self.token(TokenKind::CloseTag, buffer, start, end)
933                        } else {
934                            Some(Err(SyntaxError::UnexpectedToken(
935                                self.file_id(),
936                                self.input.read(1)[0],
937                                self.input.current_position(),
938                            )))
939                        }
940                    }
941                    _ => unreachable!(),
942                }
943            }
944        }
945    }
946
947    #[inline]
948    fn token(
949        &mut self,
950        kind: TokenKind,
951        v: &[u8],
952        from: Position,
953        to: Position,
954    ) -> Option<Result<Token<'arena>, SyntaxError>> {
955        let string = String::from_utf8_lossy(v);
956
957        Some(Ok(Token { kind, value: self.arena.alloc_str(&string), span: Span::new(self.file_id(), from, to) }))
958    }
959
960    #[inline]
961    fn interpolation(
962        &mut self,
963        end_offset: u32,
964        post_interpolation_mode: LexerMode<'arena>,
965    ) -> Option<Result<Token<'arena>, SyntaxError>> {
966        self.mode = LexerMode::Script;
967
968        let was_interpolating = self.interpolating;
969        self.interpolating = true;
970
971        loop {
972            let subsequent_token = self.advance()?.ok()?;
973            let is_final_token = subsequent_token.span.has_offset(end_offset);
974
975            self.buffer.push_back(subsequent_token);
976
977            if is_final_token {
978                break;
979            }
980        }
981
982        self.mode = post_interpolation_mode;
983        self.interpolating = was_interpolating;
984
985        self.advance()
986    }
987}
988
impl HasFileId for Lexer<'_, '_> {
    /// Returns the file identifier reported by the lexer's underlying input.
    #[inline]
    fn file_id(&self) -> FileId {
        self.input.file_id()
    }
}
995
996#[inline]
997fn matches_start_of_heredoc_document(input: &Input) -> bool {
998    let total = input.len();
999    let base = input.current_offset();
1000
1001    // Start after the fixed opener (3 bytes).
1002    let mut length = 3;
1003    // Consume any following whitespace.
1004    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1005        length += 1;
1006    }
1007
1008    // The next byte must be a valid start-of-identifier.
1009    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1010        return false;
1011    }
1012    length += 1; // Include that identifier start.
1013
1014    // Now continue reading identifier characters until a newline is found.
1015    loop {
1016        let pos = base + length;
1017        if pos >= total {
1018            return false; // Unexpected EOF
1019        }
1020
1021        let byte = *input.read_at(pos);
1022        if byte == b'\n' {
1023            return true; // Newline found: valid heredoc opener.
1024        } else if byte == b'\r' {
1025            // Handle CRLF: treat '\r' followed by '\n' as a newline as well.
1026            return pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1027        } else if is_part_of_identifier(input.read_at(pos)) {
1028            length += 1;
1029        } else {
1030            return false; // Unexpected character.
1031        }
1032    }
1033}
1034
1035#[inline]
1036fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1037    let total = input.len();
1038    let base = input.current_offset();
1039
1040    // Start after the fixed opener (3 bytes), then skip any whitespace.
1041    let mut length = 3;
1042    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1043        length += 1;
1044    }
1045
1046    // Next, expect an opening double quote.
1047    if base + length >= total || *input.read_at(base + length) != b'"' {
1048        return false;
1049    }
1050    length += 1;
1051
1052    // The following byte must be a valid start-of-identifier.
1053    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1054        return false;
1055    }
1056    length += 1;
1057
1058    // Now scan the label. For double‑quoted heredoc, a terminating double quote is required.
1059    let mut terminated = false;
1060    loop {
1061        let pos = base + length;
1062        if pos >= total {
1063            return false;
1064        }
1065        let byte = input.read_at(pos);
1066        if *byte == b'\n' {
1067            // End of line: valid only if a closing double quote was encountered.
1068            return terminated;
1069        } else if *byte == b'\r' {
1070            // Handle CRLF sequences.
1071            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1072        } else if !terminated && is_part_of_identifier(byte) {
1073            length += 1;
1074        } else if !terminated && *byte == b'"' {
1075            terminated = true;
1076            length += 1;
1077        } else {
1078            return false;
1079        }
1080    }
1081}
1082
1083#[inline]
1084fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1085    let total = input.len();
1086    let base = input.current_offset();
1087
1088    // Start after the fixed opener (3 bytes) and skip whitespace.
1089    let mut length = 3;
1090    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1091        length += 1;
1092    }
1093
1094    // Now, the next byte must be a single quote.
1095    if base + length >= total || *input.read_at(base + length) != b'\'' {
1096        return false;
1097    }
1098    length += 1;
1099
1100    // The following byte must be a valid start-of-identifier.
1101    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1102        return false;
1103    }
1104    length += 1;
1105
1106    // Read the label until a newline. A terminating single quote is required.
1107    let mut terminated = false;
1108    loop {
1109        let pos = base + length;
1110        if pos >= total {
1111            return false;
1112        }
1113        let byte = *input.read_at(pos);
1114        if byte == b'\n' {
1115            return terminated;
1116        } else if byte == b'\r' {
1117            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1118        } else if !terminated && is_part_of_identifier(&byte) {
1119            length += 1;
1120        } else if !terminated && byte == b'\'' {
1121            terminated = true;
1122            length += 1;
1123        } else {
1124            return false;
1125        }
1126    }
1127}
1128
1129#[inline]
1130fn matches_literal_double_quote_string(input: &Input) -> bool {
1131    let total = input.len();
1132    let base = input.current_offset();
1133
1134    // Start after the initial double-quote (assumed consumed).
1135    let mut pos = base + 1;
1136    loop {
1137        if pos >= total {
1138            // Reached EOF: assume literal is complete.
1139            return true;
1140        }
1141        let byte = *input.read_at(pos);
1142        if byte == b'"' {
1143            // Encounter a closing double quote.
1144            return true;
1145        } else if byte == b'\\' {
1146            // Skip an escape sequence: assume that the backslash and the escaped character form a pair.
1147            pos += 2;
1148            continue;
1149        } else {
1150            // Check for variable interpolation or complex expression start:
1151            // If two-byte sequences match either "$" followed by a start-of-identifier or "{" and "$", then return false.
1152            if pos + 1 < total {
1153                let next = *input.read_at(pos + 1);
1154                if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1155                    return false;
1156                }
1157            }
1158            pos += 1;
1159        }
1160    }
1161}
1162
1163#[inline]
1164fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1165    let total = input.len();
1166    let base = input.current_offset();
1167
1168    // --- Block 1: Consume Whitespace ---
1169    // Start reading at offset base+3 (the fixed opener length).
1170    let mut pos = base + 3;
1171    let mut whitespaces = 0;
1172    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1173        whitespaces += 1;
1174        pos += 1;
1175    }
1176
1177    // --- Block 2: Calculate Initial Label Offset ---
1178    // The label (or delimiter) starts after:
1179    //   3 bytes + whitespace bytes + an extra offset:
1180    //      if double-quoted: 2 bytes (opening and closing quotes around the label)
1181    //      else: 1 byte.
1182    let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1183
1184    // --- Block 3: Read the Label ---
1185    let mut label_length = 1; // Start with at least one byte for the label.
1186    let mut terminated = false; // For double-quoted heredoc, to track the closing quote.
1187    loop {
1188        let pos = base + length;
1189        // Ensure we haven't run past the input.
1190        if pos >= total {
1191            unreachable!("Unexpected end of input while reading heredoc label");
1192        }
1193
1194        let byte = *input.read_at(pos);
1195        if byte == b'\n' {
1196            // Newline ends the label.
1197            length += 1;
1198            return (length, whitespaces, label_length);
1199        } else if byte == b'\r' {
1200            // Handle CRLF sequences
1201            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1202                length += 2;
1203            } else {
1204                length += 1;
1205            }
1206            return (length, whitespaces, label_length);
1207        } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1208            // For both unquoted and double-quoted (before the closing quote) heredoc,
1209            // a valid identifier character is part of the label.
1210            length += 1;
1211            label_length += 1;
1212        } else if double_quoted && !terminated && byte == b'"' {
1213            // In a double-quoted heredoc, a double quote terminates the label.
1214            length += 1;
1215            terminated = true;
1216        } else {
1217            unreachable!("Unexpected character encountered in heredoc label");
1218        }
1219    }
1220}
1221
1222#[inline]
1223fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1224    let total = input.len();
1225    let base = input.current_offset();
1226
1227    // --- Block 1: Consume Whitespace ---
1228    let mut pos = base + 3;
1229    let mut whitespaces = 0;
1230    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1231        whitespaces += 1;
1232        pos += 1;
1233    }
1234
1235    // --- Block 2: Calculate Initial Label Offset ---
1236    // For nowdoc, the fixed extra offset is always 2.
1237    let mut length = 3 + whitespaces + 2;
1238
1239    // --- Block 3: Read the Label ---
1240    let mut label_length = 1;
1241    let mut terminated = false;
1242    loop {
1243        let pos = base + length;
1244        if pos >= total {
1245            unreachable!("Unexpected end of input while reading nowdoc label");
1246        }
1247        let byte = *input.read_at(pos);
1248
1249        if byte == b'\n' {
1250            // A newline indicates the end of the label.
1251            length += 1;
1252            return (length, whitespaces, label_length);
1253        } else if byte == b'\r' {
1254            // Handle CRLF sequences
1255            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1256                length += 2;
1257            } else {
1258                length += 1;
1259            }
1260            return (length, whitespaces, label_length);
1261        } else if is_part_of_identifier(&byte) && !terminated {
1262            // For nowdoc, identifier characters contribute to the label until terminated.
1263            length += 1;
1264            label_length += 1;
1265        } else if !terminated && byte == b'\'' {
1266            // A single quote terminates the nowdoc label.
1267            length += 1;
1268            terminated = true;
1269        } else {
1270            unreachable!("Unexpected character encountered in nowdoc label");
1271        }
1272    }
1273}
1274
1275#[inline]
1276fn read_literal_string(input: &Input, quote: &u8) -> (TokenKind, usize) {
1277    let total = input.len();
1278    let start = input.current_offset();
1279    let mut length = 1; // We assume the opening quote is already consumed.
1280    let mut last_was_backslash = false;
1281    let mut partial = false;
1282
1283    loop {
1284        let pos = start + length;
1285        if pos >= total {
1286            // Reached EOF before closing quote.
1287            partial = true;
1288            break;
1289        }
1290
1291        let byte = input.read_at(pos);
1292        if *byte == b'\\' {
1293            // Toggle the backslash flag.
1294            last_was_backslash = !last_was_backslash;
1295            length += 1;
1296        } else {
1297            // If we see the closing quote and the previous byte was not an escape.
1298            if *byte == *quote && !last_was_backslash {
1299                length += 1; // Include the closing quote.
1300                break;
1301            }
1302            length += 1;
1303            last_was_backslash = false;
1304        }
1305    }
1306
1307    if partial { (TokenKind::PartialLiteralString, length) } else { (TokenKind::LiteralString, length) }
1308}
1309
1310#[inline]
1311fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> u32 {
1312    let total = input.len();
1313    let base = input.current_offset();
1314    // `offset` is relative to the current position.
1315    let mut offset = from;
1316
1317    loop {
1318        let abs = base + offset;
1319        if abs >= total {
1320            // End of input.
1321            break;
1322        }
1323
1324        // Pattern 1: If the current byte is part of an identifier, simply advance.
1325        if is_part_of_identifier(input.read_at(abs)) {
1326            offset += 1;
1327            continue;
1328        }
1329
1330        // Pattern 2: If the current byte is a '[' then we enter a bracketed interpolation.
1331        if *input.read_at(abs) == b'[' {
1332            offset += 1;
1333            let mut nesting = 0;
1334            loop {
1335                let abs_inner = base + offset;
1336                if abs_inner >= total {
1337                    break;
1338                }
1339                let b = input.read_at(abs_inner);
1340                if *b == b']' {
1341                    offset += 1;
1342                    if nesting == 0 {
1343                        break;
1344                    } else {
1345                        nesting -= 1;
1346                    }
1347                } else if *b == b'[' {
1348                    offset += 1;
1349                    nesting += 1;
1350                } else if b.is_ascii_whitespace() {
1351                    // Do not include whitespace.
1352                    break;
1353                } else {
1354                    offset += 1;
1355                }
1356            }
1357            // When bracketed interpolation is processed, exit the loop.
1358            break;
1359        }
1360
1361        // Pattern 3: Check for "->" followed by a valid identifier start.
1362        if base + offset + 2 < total
1363            && *input.read_at(abs) == b'-'
1364            && *input.read_at(base + offset + 1) == b'>'
1365            && is_start_of_identifier(input.read_at(base + offset + 2))
1366        {
1367            offset += 3;
1368            // Consume any following identifier characters.
1369            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1370                offset += 1;
1371            }
1372            break;
1373        }
1374
1375        // Pattern 4: Check for "?->" followed by a valid identifier start.
1376        if base + offset + 3 < total
1377            && *input.read_at(abs) == b'?'
1378            && *input.read_at(base + offset + 1) == b'-'
1379            && *input.read_at(base + offset + 2) == b'>'
1380            && is_start_of_identifier(input.read_at(base + offset + 3))
1381        {
1382            offset += 4;
1383            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1384                offset += 1;
1385            }
1386            break;
1387        }
1388
1389        // None of the expected patterns matched: exit the loop.
1390        break;
1391    }
1392
1393    offset as u32
1394}
1395
1396#[inline]
1397fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> u32 {
1398    let total = input.len();
1399    let base = input.current_offset();
1400    let mut offset = from;
1401    let mut nesting = 0;
1402
1403    loop {
1404        let abs = base + offset;
1405        if abs >= total {
1406            break;
1407        }
1408        match input.read_at(abs) {
1409            b'}' => {
1410                offset += 1;
1411                if nesting == 0 {
1412                    break;
1413                } else {
1414                    nesting -= 1;
1415                }
1416            }
1417            b'{' => {
1418                offset += 1;
1419                nesting += 1;
1420            }
1421            _ => {
1422                offset += 1;
1423            }
1424        }
1425    }
1426
1427    offset as u32
1428}
mago_syntax/lexer/mod.rs

mago_syntax/lexer/
mod.rs