mago_syntax/lexer/
mod.rs

1use std::collections::VecDeque;
2use std::fmt::Debug;
3
4use bumpalo::Bump;
5use mago_database::file::FileId;
6use mago_database::file::HasFileId;
7use mago_span::Position;
8use mago_span::Span;
9
10use mago_syntax_core::input::Input;
11use mago_syntax_core::utils::is_part_of_identifier;
12use mago_syntax_core::utils::is_start_of_identifier;
13use mago_syntax_core::utils::read_digits_of_base;
14use mago_syntax_core::*;
15
16use crate::error::SyntaxError;
17use crate::lexer::internal::mode::HaltStage;
18use crate::lexer::internal::mode::Interpolation;
19use crate::lexer::internal::mode::LexerMode;
20use crate::lexer::internal::utils::NumberKind;
21use crate::token::DocumentKind;
22use crate::token::Token;
23use crate::token::TokenKind;
24
25mod internal;
26
/// The `Lexer` struct is responsible for tokenizing input source code into discrete tokens
/// based on PHP language syntax. It is designed to work with PHP code from version 7.0 up to 8.4.
///
/// The lexer reads through the provided input and processes it accordingly.
///
/// It identifies PHP-specific tokens, including operators, keywords, comments, strings, and other syntax elements,
/// and produces a sequence of [`Token`] objects that are used in further stages of compilation or interpretation.
///
/// The lexer is designed to be used in a streaming fashion, where it reads the input source code in chunks
/// and produces tokens incrementally. This allows for efficient processing of large source files and
/// minimizes memory usage.
#[derive(Debug)]
pub struct Lexer<'input, 'arena> {
    /// Bump arena used for lexer allocations, e.g. copying a heredoc/nowdoc label into the lexer mode.
    arena: &'arena Bump,
    /// The source being tokenized; also the source of truth for the current position and EOF state.
    input: Input<'input>,
    /// Current lexing state; selects which branch of `advance` handles the next token.
    mode: LexerMode<'arena>,
    /// While `true`, `advance` does not drain `buffer`, and a backslash after an identifier
    /// is not consumed as part of a qualified name.
    interpolating: bool,
    /// Tokens queued for delivery; `advance` returns these first when not interpolating.
    buffer: VecDeque<Token<'arena>>,
}
46
47impl<'input, 'arena> Lexer<'input, 'arena> {
48    /// Creates a new `Lexer` instance.
49    ///
50    /// # Parameters
51    ///
52    /// - `arena`: The arena to use for allocating tokens.
53    /// - `input`: The input source code to tokenize.
54    ///
55    /// # Returns
56    ///
57    /// A new `Lexer` instance that reads from the provided byte slice.
58    pub fn new(arena: &'arena Bump, input: Input<'input>) -> Lexer<'input, 'arena> {
59        Lexer { arena, input, mode: LexerMode::Inline, interpolating: false, buffer: VecDeque::new() }
60    }
61
62    /// Creates a new `Lexer` instance for parsing a script block.
63    ///
64    /// # Parameters
65    ///
66    /// - `arena`: The arena to use for allocating tokens.
67    /// - `input`: The input source code to tokenize.
68    ///
69    /// # Returns
70    ///
71    /// A new `Lexer` instance that reads from the provided byte slice.
72    pub fn scripting(arena: &'arena Bump, input: Input<'input>) -> Lexer<'input, 'arena> {
73        Lexer { arena, input, mode: LexerMode::Script, interpolating: false, buffer: VecDeque::new() }
74    }
75
    /// Checks whether the underlying input has been fully consumed.
    ///
    /// This only reflects the state of the input; it does not look at the internal token buffer.
    /// `advance` returns `None` once the input is exhausted, but it hands out any buffered token
    /// first (when not interpolating).
    ///
    /// NOTE(review): if the buffer can still hold tokens once the input is exhausted, `true` here
    /// does not strictly mean "no more tokens" — confirm against the code that fills the buffer.
    pub fn has_reached_eof(&self) -> bool {
        self.input.has_reached_eof()
    }
82
    /// Returns the current position of the lexer within the input source code.
    ///
    /// This is the position reported by the underlying input, i.e. where the next
    /// read from the input will start.
    pub fn get_position(&self) -> Position {
        self.input.current_position()
    }
87
88    /// Tokenizes the next input from the source code.
89    ///
90    /// This method reads from the input and produces the next [`Token`] based on the current [`LexerMode`].
91    /// It handles various lexical elements such as inline text, script code, strings with interpolation,
92    /// comments, and different PHP-specific constructs.
93    ///
94    /// # Returns
95    ///
96    /// - `Some(Ok(Token))` if a token was successfully parsed.
97    /// - `Some(Err(SyntaxError))` if a syntax error occurred while parsing the next token.
98    /// - `None` if the end of the input has been reached.
99    ///
100    /// # Notes
101    ///
102    /// - It efficiently handles tokenization by consuming input based on patterns specific to PHP syntax.
103    /// - The lexer supports complex features like string interpolation and different numeric formats.
104    ///
105    /// # Errors
106    ///
107    /// Returns `Some(Err(SyntaxError))` in cases such as:
108    ///
109    /// - Unrecognized tokens that do not match any known PHP syntax.
110    /// - Unexpected tokens in a given context, such as an unexpected end of string.
111    ///
112    /// # Panics
113    ///
114    /// This method should not panic under normal operation. If it does, it indicates a bug in the lexer implementation.
115    ///
116    /// # See Also
117    ///
118    /// - [`Token`]: Represents a lexical token with its kind, value, and span.
119    /// - [`SyntaxError`]: Represents errors that can occur during lexing.
120    #[inline]
121    pub fn advance(&mut self) -> Option<Result<Token<'arena>, SyntaxError>> {
122        if !self.interpolating
123            && let Some(token) = self.buffer.pop_front()
124        {
125            return Some(Ok(token));
126        }
127
128        if self.input.has_reached_eof() {
129            return None;
130        }
131
132        match self.mode {
133            LexerMode::Inline => {
134                let start = self.input.current_position();
135                if self.input.is_at(b"<?", false) {
136                    let (kind, buffer) = if self.input.is_at(b"<?php", true) {
137                        (TokenKind::OpenTag, self.input.consume(5))
138                    } else if self.input.is_at(b"<?=", false) {
139                        (TokenKind::EchoTag, self.input.consume(3))
140                    } else {
141                        (TokenKind::ShortOpenTag, self.input.consume(2))
142                    };
143
144                    let end = self.input.current_position();
145                    let tag = self.token(kind, buffer, start, end);
146
147                    self.mode = LexerMode::Script;
148
149                    return tag;
150                }
151
152                if self.input.is_at(b"#!", true) {
153                    let buffer = self.input.consume_through(b'\n');
154                    let end = self.input.current_position();
155
156                    self.token(TokenKind::InlineShebang, buffer, start, end)
157                } else {
158                    let buffer = self.input.consume_until(b"<?", false);
159                    let end = self.input.current_position();
160
161                    self.token(TokenKind::InlineText, buffer, start, end)
162                }
163            }
164            LexerMode::Script => {
165                let start = self.input.current_position();
166                let whitespaces = self.input.consume_whitespaces();
167                if !whitespaces.is_empty() {
168                    return self.token(TokenKind::Whitespace, whitespaces, start, self.input.current_position());
169                }
170
171                let mut document_label: &[u8] = &[];
172
173                let (token_kind, len) = match self.input.read(3) {
174                    [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
175                    [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
176                    [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
177                    [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
178                    [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
179                    [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
180                    [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
181                    [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
182                    [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
183                    [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
184                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
185
186                        document_label = self.input.peek(3 + whitespaces, label_length);
187
188                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
189                    }
190                    [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
191                        let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
192
193                        document_label = self.input.peek(4 + whitespaces, label_length);
194
195                        (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
196                    }
197                    [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
198                        let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
199
200                        document_label = self.input.peek(4 + whitespaces, label_length);
201
202                        (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
203                    }
204                    [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
205                    [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
206                    [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
207                    [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
208                    [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
209                    [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
210                    [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
211                    [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
212                    [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
213                    [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
214                    [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
215                    [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
216                    [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
217                    [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
218                    [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
219                    [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
220                    [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
221                    [b'>', b'>', ..] => (TokenKind::RightShift, 2),
222                    [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
223                    [b':', b':', ..] => (TokenKind::ColonColon, 2),
224                    [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
225                    [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
226                    [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
227                    [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
228                    [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
229                    [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
230                    [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
231                    [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
232                    [b'/', b'/', ..] => {
233                        let mut length = 2;
234                        loop {
235                            match self.input.peek(length, 3) {
236                                [b'\n' | b'\r', ..] => {
237                                    break;
238                                }
239                                [w, b'?', b'>'] if w.is_ascii_whitespace() => {
240                                    break;
241                                }
242                                [b'?', b'>', ..] | [] => {
243                                    break;
244                                }
245                                [_, ..] => {
246                                    length += 1;
247                                }
248                            }
249                        }
250
251                        (TokenKind::SingleLineComment, length)
252                    }
253                    [b'/', b'*', asterisk] => {
254                        let mut length = 2;
255                        let mut is_multiline = false;
256                        let mut terminated = false;
257                        loop {
258                            match self.input.peek(length, 2) {
259                                [b'*', b'/'] => {
260                                    if length == 2 {
261                                        is_multiline = true;
262                                    }
263
264                                    length += 2;
265
266                                    terminated = true;
267                                    break;
268                                }
269                                [_, ..] => {
270                                    length += 1;
271                                }
272                                [] => {
273                                    break;
274                                }
275                            }
276                        }
277
278                        if !terminated {
279                            self.input.consume(length);
280
281                            return Some(Err(SyntaxError::UnexpectedEndOfFile(
282                                self.file_id(),
283                                self.input.current_position(),
284                            )));
285                        }
286
287                        if !is_multiline && asterisk == &b'*' {
288                            (TokenKind::DocBlockComment, length)
289                        } else {
290                            (TokenKind::MultiLineComment, length)
291                        }
292                    }
293                    [b'\\', start_of_identifier!(), ..] => {
294                        let mut length = 2;
295                        let mut last_was_slash = false;
296                        loop {
297                            match self.input.peek(length, 1) {
298                                [start_of_identifier!(), ..] if last_was_slash => {
299                                    length += 1;
300                                    last_was_slash = false;
301                                }
302                                [part_of_identifier!(), ..] if !last_was_slash => {
303                                    length += 1;
304                                }
305                                [b'\\', ..] => {
306                                    if last_was_slash {
307                                        length -= 1;
308
309                                        break;
310                                    }
311
312                                    length += 1;
313                                    last_was_slash = true;
314                                }
315                                _ => {
316                                    break;
317                                }
318                            }
319                        }
320
321                        if last_was_slash {
322                            length -= 1;
323                        }
324
325                        (TokenKind::FullyQualifiedIdentifier, length)
326                    }
327                    [b'$', start_of_identifier!(), ..] => {
328                        let mut length = 2;
329                        while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
330                            length += 1;
331                        }
332
333                        (TokenKind::Variable, length)
334                    }
335                    [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
336                    [b'$', ..] => (TokenKind::Dollar, 1),
337                    [b'@', ..] => (TokenKind::At, 1),
338                    [b'!', ..] => (TokenKind::Bang, 1),
339                    [b'&', ..] => (TokenKind::Ampersand, 1),
340                    [b'?', ..] => (TokenKind::Question, 1),
341                    [b'=', ..] => (TokenKind::Equal, 1),
342                    [b'`', ..] => (TokenKind::Backtick, 1),
343                    [b')', ..] => (TokenKind::RightParenthesis, 1),
344                    [b';', ..] => (TokenKind::Semicolon, 1),
345                    [b'+', ..] => (TokenKind::Plus, 1),
346                    [b'%', ..] => (TokenKind::Percent, 1),
347                    [b'-', ..] => (TokenKind::Minus, 1),
348                    [b'<', ..] => (TokenKind::LessThan, 1),
349                    [b'>', ..] => (TokenKind::GreaterThan, 1),
350                    [b',', ..] => (TokenKind::Comma, 1),
351                    [b'[', ..] => (TokenKind::LeftBracket, 1),
352                    [b']', ..] => (TokenKind::RightBracket, 1),
353                    [b'{', ..] => (TokenKind::LeftBrace, 1),
354                    [b'}', ..] => (TokenKind::RightBrace, 1),
355                    [b':', ..] => (TokenKind::Colon, 1),
356                    [b'~', ..] => (TokenKind::Tilde, 1),
357                    [b'|', ..] => (TokenKind::Pipe, 1),
358                    [b'^', ..] => (TokenKind::Caret, 1),
359                    [b'*', ..] => (TokenKind::Asterisk, 1),
360                    [b'/', ..] => (TokenKind::Slash, 1),
361                    [quote @ b'\'', ..] => read_literal_string(&self.input, quote),
362                    [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
363                        read_literal_string(&self.input, quote)
364                    }
365                    [b'"', ..] => (TokenKind::DoubleQuote, 1),
366                    [b'(', ..] => 'parenthesis: {
367                        for (value, kind) in internal::consts::CAST_TYPES {
368                            if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
369                                break 'parenthesis (kind, length);
370                            }
371                        }
372
373                        (TokenKind::LeftParenthesis, 1)
374                    }
375                    [b'#', ..] => {
376                        let mut length = 1;
377                        loop {
378                            match self.input.peek(length, 3) {
379                                [b'\n' | b'\r', ..] => {
380                                    break;
381                                }
382                                [w, b'?', b'>'] if w.is_ascii_whitespace() => {
383                                    break;
384                                }
385                                [b'?', b'>', ..] | [] => {
386                                    break;
387                                }
388                                [_, ..] => {
389                                    length += 1;
390                                }
391                            }
392                        }
393
394                        (TokenKind::HashComment, length)
395                    }
396                    [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
397                    [start_of_identifier!(), ..] => 'identifier: {
398                        let mut length = 1;
399                        let mut ended_with_slash = false;
400                        loop {
401                            match self.input.peek(length, 2) {
402                                [part_of_identifier!(), ..] => {
403                                    length += 1;
404                                }
405                                [b'\\', start_of_identifier!(), ..] => {
406                                    ended_with_slash = true;
407                                    break;
408                                }
409                                // special case for `private(set)`
410                                [b'(', ..] if length == 7 => {
411                                    if self.input.is_at(b"private(set)", true) {
412                                        break 'identifier (TokenKind::PrivateSet, 7 + 5);
413                                    }
414
415                                    break;
416                                }
417                                // special case for `public(set)`
418                                [b'(', ..] if length == 6 => {
419                                    if self.input.is_at(b"public(set)", true) {
420                                        break 'identifier (TokenKind::PublicSet, 6 + 5);
421                                    }
422
423                                    break;
424                                }
425                                // special case for `protected(set)`
426                                [b'(', ..] if length == 9 => {
427                                    if self.input.is_at(b"protected(set)", true) {
428                                        break 'identifier (TokenKind::ProtectedSet, 9 + 5);
429                                    }
430
431                                    break;
432                                }
433                                _ => {
434                                    break;
435                                }
436                            }
437                        }
438
439                        if !ended_with_slash {
440                            for (value, kind) in internal::consts::KEYWORD_TYPES {
441                                if value.len() != length {
442                                    continue;
443                                }
444
445                                if self.input.is_at(value, true) {
446                                    break 'identifier (kind, value.len());
447                                }
448                            }
449                        }
450
451                        let mut slashes = 0;
452                        let mut last_was_slash = false;
453                        loop {
454                            match self.input.peek(length, 1) {
455                                [start_of_identifier!(), ..] if last_was_slash => {
456                                    length += 1;
457                                    last_was_slash = false;
458                                }
459                                [part_of_identifier!(), ..] if !last_was_slash => {
460                                    length += 1;
461                                }
462                                [b'\\', ..] if !self.interpolating => {
463                                    if !last_was_slash {
464                                        length += 1;
465                                        slashes += 1;
466                                        last_was_slash = true;
467                                    } else {
468                                        length -= 1;
469                                        slashes -= 1;
470                                        last_was_slash = false;
471
472                                        break;
473                                    }
474                                }
475                                _ => {
476                                    break;
477                                }
478                            }
479                        }
480
481                        if last_was_slash {
482                            length -= 1;
483                            slashes -= 1;
484                        }
485
486                        if slashes > 0 {
487                            (TokenKind::QualifiedIdentifier, length)
488                        } else {
489                            (TokenKind::Identifier, length)
490                        }
491                    }
492                    [b'.', start_of_number!(), ..] => {
493                        let mut length = read_digits_of_base(&self.input, 2, 10);
494                        if let float_exponent!() = self.input.peek(length, 1) {
495                            length += 1;
496                            if let number_sign!() = self.input.peek(length, 1) {
497                                length += 1;
498                            }
499
500                            length = read_digits_of_base(&self.input, length, 10);
501                        }
502
503                        (TokenKind::LiteralFloat, length)
504                    }
505                    [start_of_number!(), ..] => 'number: {
506                        let mut length = 1;
507
508                        let (base, kind): (u8, NumberKind) = match self.input.read(3) {
509                            start_of_binary_number!() => {
510                                length += 1;
511
512                                (2, NumberKind::Integer)
513                            }
514                            start_of_octal_number!() => {
515                                length += 1;
516
517                                (8, NumberKind::Integer)
518                            }
519                            start_of_hexadecimal_number!() => {
520                                length += 1;
521
522                                (16, NumberKind::Integer)
523                            }
524                            start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
525                            start_of_float_number!() => (10, NumberKind::Float),
526                            _ => (10, NumberKind::IntegerOrFloat),
527                        };
528
529                        if kind != NumberKind::Float {
530                            length = read_digits_of_base(&self.input, length, base);
531
532                            if kind == NumberKind::Integer {
533                                break 'number (TokenKind::LiteralInteger, length);
534                            }
535                        }
536
537                        let is_float = matches!(self.input.peek(length, 3), float_separator!());
538
539                        if !is_float {
540                            break 'number (TokenKind::LiteralInteger, length);
541                        }
542
543                        if let [b'.'] = self.input.peek(length, 1) {
544                            length += 1;
545                            length = read_digits_of_base(&self.input, length, 10);
546                        }
547
548                        if let float_exponent!() = self.input.peek(length, 1) {
549                            length += 1;
550                            if let number_sign!() = self.input.peek(length, 1) {
551                                length += 1;
552                            }
553
554                            length = read_digits_of_base(&self.input, length, 10);
555                        }
556
557                        (TokenKind::LiteralFloat, length)
558                    }
559                    [b'.', ..] => (TokenKind::Dot, 1),
560                    [unknown_byte, ..] => {
561                        return Some(Err(SyntaxError::UnrecognizedToken(
562                            self.file_id(),
563                            *unknown_byte,
564                            self.input.current_position(),
565                        )));
566                    }
567                    [] => {
568                        // we check for EOF before entering scripting section,
569                        // so this should be unreachable.
570                        unreachable!()
571                    }
572                };
573
574                self.mode = match token_kind {
575                    TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
576                    TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
577                    TokenKind::CloseTag => LexerMode::Inline,
578                    TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
579                    TokenKind::DocumentStart(document_kind) => LexerMode::DocumentString(
580                        document_kind,
581                        self.arena.alloc_slice_copy(document_label),
582                        Interpolation::None,
583                    ),
584                    _ => LexerMode::Script,
585                };
586
587                let buffer = self.input.consume(len);
588                let end = self.input.current_position();
589
590                self.token(token_kind, buffer, start, end)
591            }
592            LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
593                Interpolation::None => {
594                    let start = self.input.current_position();
595
596                    let mut length = 0;
597                    let mut last_was_slash = false;
598                    let mut token_kind = TokenKind::StringPart;
599                    loop {
600                        match self.input.peek(length, 2) {
601                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
602                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
603
604                                self.mode =
605                                    LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
606
607                                break;
608                            }
609                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
610                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
611
612                                self.mode =
613                                    LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
614
615                                break;
616                            }
617                            [b'\\', ..] => {
618                                length += 1;
619
620                                last_was_slash = !last_was_slash;
621                            }
622                            [b'"', ..] if !last_was_slash => {
623                                if length == 0 {
624                                    length += 1;
625                                    token_kind = TokenKind::DoubleQuote;
626
627                                    break;
628                                }
629
630                                break;
631                            }
632                            [_, ..] => {
633                                length += 1;
634                                last_was_slash = false;
635                            }
636                            [] => {
637                                break;
638                            }
639                        }
640                    }
641
642                    let buffer = self.input.consume(length);
643                    let end = self.input.current_position();
644
645                    if TokenKind::DoubleQuote == token_kind {
646                        self.mode = LexerMode::Script;
647                    }
648
649                    self.token(token_kind, buffer, start, end)
650                }
651                Interpolation::Until(offset) => {
652                    self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None))
653                }
654            },
655            LexerMode::ShellExecuteString(interpolation) => match &interpolation {
656                Interpolation::None => {
657                    let start = self.input.current_position();
658
659                    let mut length = 0;
660                    let mut last_was_slash = false;
661                    let mut token_kind = TokenKind::StringPart;
662                    loop {
663                        match self.input.peek(length, 2) {
664                            [b'$', start_of_identifier!(), ..] if !last_was_slash => {
665                                let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
666
667                                self.mode =
668                                    LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
669
670                                break;
671                            }
672                            [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
673                                let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
674
675                                self.mode =
676                                    LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
677
678                                break;
679                            }
680                            [b'\\', ..] => {
681                                length += 1;
682                                last_was_slash = true;
683                            }
684                            [b'`', ..] if !last_was_slash => {
685                                if length == 0 {
686                                    length += 1;
687                                    token_kind = TokenKind::Backtick;
688
689                                    break;
690                                }
691
692                                break;
693                            }
694                            [_, ..] => {
695                                length += 1;
696                                last_was_slash = false;
697                            }
698                            [] => {
699                                break;
700                            }
701                        }
702                    }
703
704                    let buffer = self.input.consume(length);
705                    let end = self.input.current_position();
706
707                    if TokenKind::Backtick == token_kind {
708                        self.mode = LexerMode::Script;
709                    }
710
711                    self.token(token_kind, buffer, start, end)
712                }
713                Interpolation::Until(offset) => {
714                    self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None))
715                }
716            },
717            LexerMode::DocumentString(kind, label, interpolation) => match &kind {
718                DocumentKind::Heredoc => match &interpolation {
719                    Interpolation::None => {
720                        let start = self.input.current_position();
721
722                        let mut length = 0;
723                        let mut last_was_slash = false;
724                        let mut only_whitespaces = true;
725                        let mut token_kind = TokenKind::StringPart;
726                        loop {
727                            match self.input.peek(length, 2) {
728                                [b'\r', b'\n'] => {
729                                    length += 2;
730
731                                    break;
732                                }
733                                [b'\n', ..] | [b'\r', ..] => {
734                                    length += 1;
735
736                                    break;
737                                }
738                                [byte, ..] if byte.is_ascii_whitespace() => {
739                                    length += 1;
740                                }
741                                [b'$', start_of_identifier!(), ..] if !last_was_slash => {
742                                    let until_offset =
743                                        read_until_end_of_variable_interpolation(&self.input, length + 2);
744
745                                    self.mode = LexerMode::DocumentString(
746                                        kind,
747                                        label,
748                                        Interpolation::Until(start.offset + until_offset),
749                                    );
750
751                                    break;
752                                }
753                                [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
754                                    let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
755
756                                    self.mode = LexerMode::DocumentString(
757                                        kind,
758                                        label,
759                                        Interpolation::Until(start.offset + until_offset),
760                                    );
761
762                                    break;
763                                }
764                                [b'\\', ..] => {
765                                    length += 1;
766                                    last_was_slash = true;
767                                    only_whitespaces = false;
768                                }
769                                [_, ..] => {
770                                    if only_whitespaces
771                                        && self.input.peek(length, label.len()) == label
772                                        && self
773                                            .input
774                                            .peek(length + label.len(), 1)
775                                            .first()
776                                            .is_none_or(|c| !c.is_ascii_alphanumeric())
777                                    {
778                                        length += label.len();
779                                        token_kind = TokenKind::DocumentEnd;
780
781                                        break;
782                                    }
783
784                                    length += 1;
785                                    last_was_slash = false;
786                                    only_whitespaces = false;
787                                }
788                                [] => {
789                                    break;
790                                }
791                            }
792                        }
793
794                        let buffer = self.input.consume(length);
795                        let end = self.input.current_position();
796
797                        if TokenKind::DocumentEnd == token_kind {
798                            self.mode = LexerMode::Script;
799                        }
800
801                        self.token(token_kind, buffer, start, end)
802                    }
803                    Interpolation::Until(offset) => {
804                        self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None))
805                    }
806                },
807                DocumentKind::Nowdoc => {
808                    let start = self.input.current_position();
809
810                    let mut length = 0;
811                    let mut terminated = false;
812                    let mut only_whitespaces = true;
813
814                    loop {
815                        match self.input.peek(length, 2) {
816                            [b'\r', b'\n'] => {
817                                length += 2;
818
819                                break;
820                            }
821                            [b'\n', ..] | [b'\r', ..] => {
822                                length += 1;
823
824                                break;
825                            }
826                            [byte, ..] if byte.is_ascii_whitespace() => {
827                                length += 1;
828                            }
829                            [_, ..] => {
830                                if only_whitespaces
831                                    && self.input.peek(length, label.len()) == label
832                                    && self
833                                        .input
834                                        .peek(length + label.len(), 1)
835                                        .first()
836                                        .is_none_or(|c| !c.is_ascii_alphanumeric())
837                                {
838                                    length += label.len();
839                                    terminated = true;
840
841                                    break;
842                                }
843
844                                only_whitespaces = false;
845                                length += 1;
846                            }
847                            [] => {
848                                break;
849                            }
850                        }
851                    }
852
853                    let buffer = self.input.consume(length);
854                    let end = self.input.current_position();
855
856                    if terminated {
857                        self.mode = LexerMode::Script;
858
859                        return self.token(TokenKind::DocumentEnd, buffer, start, end);
860                    }
861
862                    self.token(TokenKind::StringPart, buffer, start, end)
863                }
864            },
865            LexerMode::Halt(stage) => 'halt: {
866                let start = self.input.current_position();
867                if let HaltStage::End = stage {
868                    let buffer = self.input.consume_remaining();
869                    let end = self.input.current_position();
870
871                    break 'halt self.token(TokenKind::InlineText, buffer, start, end);
872                }
873
874                let whitespaces = self.input.consume_whitespaces();
875                if !whitespaces.is_empty() {
876                    let end = self.input.current_position();
877
878                    break 'halt self.token(TokenKind::Whitespace, whitespaces, start, end);
879                }
880
881                match &stage {
882                    HaltStage::LookingForLeftParenthesis => {
883                        if self.input.is_at(b"(", false) {
884                            let buffer = self.input.consume(1);
885                            let end = self.input.current_position();
886
887                            self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
888
889                            self.token(TokenKind::LeftParenthesis, buffer, start, end)
890                        } else {
891                            Some(Err(SyntaxError::UnexpectedToken(
892                                self.file_id(),
893                                self.input.read(1)[0],
894                                self.input.current_position(),
895                            )))
896                        }
897                    }
898                    HaltStage::LookingForRightParenthesis => {
899                        if self.input.is_at(b")", false) {
900                            let buffer = self.input.consume(1);
901                            let end = self.input.current_position();
902
903                            self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
904
905                            self.token(TokenKind::RightParenthesis, buffer, start, end)
906                        } else {
907                            Some(Err(SyntaxError::UnexpectedToken(
908                                self.file_id(),
909                                self.input.read(1)[0],
910                                self.input.current_position(),
911                            )))
912                        }
913                    }
914                    HaltStage::LookingForTerminator => {
915                        if self.input.is_at(b";", false) {
916                            let buffer = self.input.consume(1);
917                            let end = self.input.current_position();
918
919                            self.mode = LexerMode::Halt(HaltStage::End);
920
921                            self.token(TokenKind::Semicolon, buffer, start, end)
922                        } else if self.input.is_at(b"?>", false) {
923                            let buffer = self.input.consume(2);
924                            let end = self.input.current_position();
925
926                            self.mode = LexerMode::Halt(HaltStage::End);
927
928                            self.token(TokenKind::CloseTag, buffer, start, end)
929                        } else {
930                            Some(Err(SyntaxError::UnexpectedToken(
931                                self.file_id(),
932                                self.input.read(1)[0],
933                                self.input.current_position(),
934                            )))
935                        }
936                    }
937                    _ => unreachable!(),
938                }
939            }
940        }
941    }
942
943    #[inline]
944    fn token(
945        &mut self,
946        kind: TokenKind,
947        v: &[u8],
948        from: Position,
949        to: Position,
950    ) -> Option<Result<Token<'arena>, SyntaxError>> {
951        let string = String::from_utf8_lossy(v);
952
953        Some(Ok(Token { kind, value: self.arena.alloc_str(&string), span: Span::new(self.file_id(), from, to) }))
954    }
955
956    #[inline]
957    fn interpolation(
958        &mut self,
959        end_offset: u32,
960        post_interpolation_mode: LexerMode<'arena>,
961    ) -> Option<Result<Token<'arena>, SyntaxError>> {
962        self.mode = LexerMode::Script;
963
964        let was_interpolating = self.interpolating;
965        self.interpolating = true;
966
967        loop {
968            let subsequent_token = self.advance()?.ok()?;
969            let is_final_token = subsequent_token.span.has_offset(end_offset);
970
971            self.buffer.push_back(subsequent_token);
972
973            if is_final_token {
974                break;
975            }
976        }
977
978        self.mode = post_interpolation_mode;
979        self.interpolating = was_interpolating;
980
981        self.advance()
982    }
983}
984
985impl HasFileId for Lexer<'_, '_> {
986    #[inline]
987    fn file_id(&self) -> FileId {
988        self.input.file_id()
989    }
990}
991
992#[inline]
993fn matches_start_of_heredoc_document(input: &Input) -> bool {
994    let total = input.len();
995    let base = input.current_offset();
996
997    // Start after the fixed opener (3 bytes).
998    let mut length = 3;
999    // Consume any following whitespace.
1000    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1001        length += 1;
1002    }
1003
1004    // The next byte must be a valid start-of-identifier.
1005    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1006        return false;
1007    }
1008    length += 1; // Include that identifier start.
1009
1010    // Now continue reading identifier characters until a newline is found.
1011    loop {
1012        let pos = base + length;
1013        if pos >= total {
1014            return false; // Unexpected EOF
1015        }
1016
1017        let byte = *input.read_at(pos);
1018        if byte == b'\n' {
1019            return true; // Newline found: valid heredoc opener.
1020        } else if byte == b'\r' {
1021            // Handle CRLF: treat '\r' followed by '\n' as a newline as well.
1022            return pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1023        } else if is_part_of_identifier(input.read_at(pos)) {
1024            length += 1;
1025        } else {
1026            return false; // Unexpected character.
1027        }
1028    }
1029}
1030
1031#[inline]
1032fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1033    let total = input.len();
1034    let base = input.current_offset();
1035
1036    // Start after the fixed opener (3 bytes), then skip any whitespace.
1037    let mut length = 3;
1038    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1039        length += 1;
1040    }
1041
1042    // Next, expect an opening double quote.
1043    if base + length >= total || *input.read_at(base + length) != b'"' {
1044        return false;
1045    }
1046    length += 1;
1047
1048    // The following byte must be a valid start-of-identifier.
1049    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1050        return false;
1051    }
1052    length += 1;
1053
1054    // Now scan the label. For double‑quoted heredoc, a terminating double quote is required.
1055    let mut terminated = false;
1056    loop {
1057        let pos = base + length;
1058        if pos >= total {
1059            return false;
1060        }
1061        let byte = input.read_at(pos);
1062        if *byte == b'\n' {
1063            // End of line: valid only if a closing double quote was encountered.
1064            return terminated;
1065        } else if *byte == b'\r' {
1066            // Handle CRLF sequences.
1067            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1068        } else if !terminated && is_part_of_identifier(byte) {
1069            length += 1;
1070        } else if !terminated && *byte == b'"' {
1071            terminated = true;
1072            length += 1;
1073        } else {
1074            return false;
1075        }
1076    }
1077}
1078
1079#[inline]
1080fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1081    let total = input.len();
1082    let base = input.current_offset();
1083
1084    // Start after the fixed opener (3 bytes) and skip whitespace.
1085    let mut length = 3;
1086    while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1087        length += 1;
1088    }
1089
1090    // Now, the next byte must be a single quote.
1091    if base + length >= total || *input.read_at(base + length) != b'\'' {
1092        return false;
1093    }
1094    length += 1;
1095
1096    // The following byte must be a valid start-of-identifier.
1097    if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1098        return false;
1099    }
1100    length += 1;
1101
1102    // Read the label until a newline. A terminating single quote is required.
1103    let mut terminated = false;
1104    loop {
1105        let pos = base + length;
1106        if pos >= total {
1107            return false;
1108        }
1109        let byte = *input.read_at(pos);
1110        if byte == b'\n' {
1111            return terminated;
1112        } else if byte == b'\r' {
1113            return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1114        } else if !terminated && is_part_of_identifier(&byte) {
1115            length += 1;
1116        } else if !terminated && byte == b'\'' {
1117            terminated = true;
1118            length += 1;
1119        } else {
1120            return false;
1121        }
1122    }
1123}
1124
1125#[inline]
1126fn matches_literal_double_quote_string(input: &Input) -> bool {
1127    let total = input.len();
1128    let base = input.current_offset();
1129
1130    // Start after the initial double-quote (assumed consumed).
1131    let mut pos = base + 1;
1132    loop {
1133        if pos >= total {
1134            // Reached EOF: assume literal is complete.
1135            return true;
1136        }
1137        let byte = *input.read_at(pos);
1138        if byte == b'"' {
1139            // Encounter a closing double quote.
1140            return true;
1141        } else if byte == b'\\' {
1142            // Skip an escape sequence: assume that the backslash and the escaped character form a pair.
1143            pos += 2;
1144            continue;
1145        } else {
1146            // Check for variable interpolation or complex expression start:
1147            // If two-byte sequences match either "$" followed by a start-of-identifier or "{" and "$", then return false.
1148            if pos + 1 < total {
1149                let next = *input.read_at(pos + 1);
1150                if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1151                    return false;
1152                }
1153            }
1154            pos += 1;
1155        }
1156    }
1157}
1158
1159#[inline]
1160fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1161    let total = input.len();
1162    let base = input.current_offset();
1163
1164    // --- Block 1: Consume Whitespace ---
1165    // Start reading at offset base+3 (the fixed opener length).
1166    let mut pos = base + 3;
1167    let mut whitespaces = 0;
1168    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1169        whitespaces += 1;
1170        pos += 1;
1171    }
1172
1173    // --- Block 2: Calculate Initial Label Offset ---
1174    // The label (or delimiter) starts after:
1175    //   3 bytes + whitespace bytes + an extra offset:
1176    //      if double-quoted: 2 bytes (opening and closing quotes around the label)
1177    //      else: 1 byte.
1178    let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1179
1180    // --- Block 3: Read the Label ---
1181    let mut label_length = 1; // Start with at least one byte for the label.
1182    let mut terminated = false; // For double-quoted heredoc, to track the closing quote.
1183    loop {
1184        let pos = base + length;
1185        // Ensure we haven't run past the input.
1186        if pos >= total {
1187            unreachable!("Unexpected end of input while reading heredoc label");
1188        }
1189
1190        let byte = *input.read_at(pos);
1191        if byte == b'\n' {
1192            // Newline ends the label.
1193            length += 1;
1194            return (length, whitespaces, label_length);
1195        } else if byte == b'\r' {
1196            // Handle CRLF sequences
1197            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1198                length += 2;
1199            } else {
1200                length += 1;
1201            }
1202            return (length, whitespaces, label_length);
1203        } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1204            // For both unquoted and double-quoted (before the closing quote) heredoc,
1205            // a valid identifier character is part of the label.
1206            length += 1;
1207            label_length += 1;
1208        } else if double_quoted && !terminated && byte == b'"' {
1209            // In a double-quoted heredoc, a double quote terminates the label.
1210            length += 1;
1211            terminated = true;
1212        } else {
1213            unreachable!("Unexpected character encountered in heredoc label");
1214        }
1215    }
1216}
1217
1218#[inline]
1219fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1220    let total = input.len();
1221    let base = input.current_offset();
1222
1223    // --- Block 1: Consume Whitespace ---
1224    let mut pos = base + 3;
1225    let mut whitespaces = 0;
1226    while pos < total && input.read_at(pos).is_ascii_whitespace() {
1227        whitespaces += 1;
1228        pos += 1;
1229    }
1230
1231    // --- Block 2: Calculate Initial Label Offset ---
1232    // For nowdoc, the fixed extra offset is always 2.
1233    let mut length = 3 + whitespaces + 2;
1234
1235    // --- Block 3: Read the Label ---
1236    let mut label_length = 1;
1237    let mut terminated = false;
1238    loop {
1239        let pos = base + length;
1240        if pos >= total {
1241            unreachable!("Unexpected end of input while reading nowdoc label");
1242        }
1243        let byte = *input.read_at(pos);
1244
1245        if byte == b'\n' {
1246            // A newline indicates the end of the label.
1247            length += 1;
1248            return (length, whitespaces, label_length);
1249        } else if byte == b'\r' {
1250            // Handle CRLF sequences
1251            if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1252                length += 2;
1253            } else {
1254                length += 1;
1255            }
1256            return (length, whitespaces, label_length);
1257        } else if is_part_of_identifier(&byte) && !terminated {
1258            // For nowdoc, identifier characters contribute to the label until terminated.
1259            length += 1;
1260            label_length += 1;
1261        } else if !terminated && byte == b'\'' {
1262            // A single quote terminates the nowdoc label.
1263            length += 1;
1264            terminated = true;
1265        } else {
1266            unreachable!("Unexpected character encountered in nowdoc label");
1267        }
1268    }
1269}
1270
1271#[inline]
1272fn read_literal_string(input: &Input, quote: &u8) -> (TokenKind, usize) {
1273    let total = input.len();
1274    let start = input.current_offset();
1275    let mut length = 1; // We assume the opening quote is already consumed.
1276    let mut last_was_backslash = false;
1277    let mut partial = false;
1278
1279    loop {
1280        let pos = start + length;
1281        if pos >= total {
1282            // Reached EOF before closing quote.
1283            partial = true;
1284            break;
1285        }
1286
1287        let byte = input.read_at(pos);
1288        if *byte == b'\\' {
1289            // Toggle the backslash flag.
1290            last_was_backslash = !last_was_backslash;
1291            length += 1;
1292        } else {
1293            // If we see the closing quote and the previous byte was not an escape.
1294            if *byte == *quote && !last_was_backslash {
1295                length += 1; // Include the closing quote.
1296                break;
1297            }
1298            length += 1;
1299            last_was_backslash = false;
1300        }
1301    }
1302
1303    if partial { (TokenKind::PartialLiteralString, length) } else { (TokenKind::LiteralString, length) }
1304}
1305
1306#[inline]
1307fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> u32 {
1308    let total = input.len();
1309    let base = input.current_offset();
1310    // `offset` is relative to the current position.
1311    let mut offset = from;
1312
1313    loop {
1314        let abs = base + offset;
1315        if abs >= total {
1316            // End of input.
1317            break;
1318        }
1319
1320        // Pattern 1: If the current byte is part of an identifier, simply advance.
1321        if is_part_of_identifier(input.read_at(abs)) {
1322            offset += 1;
1323            continue;
1324        }
1325
1326        // Pattern 2: If the current byte is a '[' then we enter a bracketed interpolation.
1327        if *input.read_at(abs) == b'[' {
1328            offset += 1;
1329            let mut nesting = 0;
1330            loop {
1331                let abs_inner = base + offset;
1332                if abs_inner >= total {
1333                    break;
1334                }
1335                let b = input.read_at(abs_inner);
1336                if *b == b']' {
1337                    offset += 1;
1338                    if nesting == 0 {
1339                        break;
1340                    } else {
1341                        nesting -= 1;
1342                    }
1343                } else if *b == b'[' {
1344                    offset += 1;
1345                    nesting += 1;
1346                } else if b.is_ascii_whitespace() {
1347                    // Do not include whitespace.
1348                    break;
1349                } else {
1350                    offset += 1;
1351                }
1352            }
1353            // When bracketed interpolation is processed, exit the loop.
1354            break;
1355        }
1356
1357        // Pattern 3: Check for "->" followed by a valid identifier start.
1358        if base + offset + 2 < total
1359            && *input.read_at(abs) == b'-'
1360            && *input.read_at(base + offset + 1) == b'>'
1361            && is_start_of_identifier(input.read_at(base + offset + 2))
1362        {
1363            offset += 3;
1364            // Consume any following identifier characters.
1365            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1366                offset += 1;
1367            }
1368            break;
1369        }
1370
1371        // Pattern 4: Check for "?->" followed by a valid identifier start.
1372        if base + offset + 3 < total
1373            && *input.read_at(abs) == b'?'
1374            && *input.read_at(base + offset + 1) == b'-'
1375            && *input.read_at(base + offset + 2) == b'>'
1376            && is_start_of_identifier(input.read_at(base + offset + 3))
1377        {
1378            offset += 4;
1379            while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1380                offset += 1;
1381            }
1382            break;
1383        }
1384
1385        // None of the expected patterns matched: exit the loop.
1386        break;
1387    }
1388
1389    offset as u32
1390}
1391
1392#[inline]
1393fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> u32 {
1394    let total = input.len();
1395    let base = input.current_offset();
1396    let mut offset = from;
1397    let mut nesting = 0;
1398
1399    loop {
1400        let abs = base + offset;
1401        if abs >= total {
1402            break;
1403        }
1404        match input.read_at(abs) {
1405            b'}' => {
1406                offset += 1;
1407                if nesting == 0 {
1408                    break;
1409                } else {
1410                    nesting -= 1;
1411                }
1412            }
1413            b'{' => {
1414                offset += 1;
1415                nesting += 1;
1416            }
1417            _ => {
1418                offset += 1;
1419            }
1420        }
1421    }
1422
1423    offset as u32
1424}
mago_syntax/lexer/mod.rs

mago_syntax/lexer/
mod.rs