1use std::collections::VecDeque;
2use std::fmt::Debug;
3
4use bumpalo::Bump;
5use mago_database::file::FileId;
6use mago_database::file::HasFileId;
7use mago_span::Position;
8use mago_span::Span;
9
10use mago_syntax_core::float_exponent;
11use mago_syntax_core::float_separator;
12use mago_syntax_core::input::Input;
13use mago_syntax_core::number_sign;
14use mago_syntax_core::part_of_identifier;
15use mago_syntax_core::start_of_binary_number;
16use mago_syntax_core::start_of_float_number;
17use mago_syntax_core::start_of_hexadecimal_number;
18use mago_syntax_core::start_of_identifier;
19use mago_syntax_core::start_of_number;
20use mago_syntax_core::start_of_octal_number;
21use mago_syntax_core::start_of_octal_or_float_number;
22use mago_syntax_core::utils::is_part_of_identifier;
23use mago_syntax_core::utils::is_start_of_identifier;
24use mago_syntax_core::utils::read_digits_of_base;
25
26use crate::error::SyntaxError;
27use crate::lexer::internal::mode::HaltStage;
28use crate::lexer::internal::mode::Interpolation;
29use crate::lexer::internal::mode::LexerMode;
30use crate::lexer::internal::utils::NumberKind;
31use crate::token::DocumentKind;
32use crate::token::Token;
33use crate::token::TokenKind;
34
35mod internal;
36
/// A mode-driven lexer that turns an [`Input`] into a stream of [`Token`]s.
///
/// Tokens are produced one at a time by [`Lexer::advance`]. Token values and
/// heredoc/nowdoc labels are copied into the arena, so they carry the
/// `'arena` lifetime rather than borrowing from the input.
#[derive(Debug)]
pub struct Lexer<'input, 'arena> {
    /// Arena that token values and document labels are allocated in.
    arena: &'arena Bump,
    /// The input being tokenized.
    input: Input<'input>,
    /// The current lexing mode; it decides how the next bytes are interpreted.
    mode: LexerMode<'arena>,
    /// `true` while `interpolation` is pre-lexing an interpolated expression.
    /// While set, `advance` does not hand out tokens from `buffer`.
    interpolating: bool,
    /// Tokens pre-lexed by `interpolation`, handed out by `advance` in FIFO order.
    buffer: VecDeque<Token<'arena>>,
}
56
57impl<'input, 'arena> Lexer<'input, 'arena> {
58 pub fn new(arena: &'arena Bump, input: Input<'input>) -> Lexer<'input, 'arena> {
69 Lexer { arena, input, mode: LexerMode::Inline, interpolating: false, buffer: VecDeque::new() }
70 }
71
72 pub fn scripting(arena: &'arena Bump, input: Input<'input>) -> Lexer<'input, 'arena> {
83 Lexer { arena, input, mode: LexerMode::Script, interpolating: false, buffer: VecDeque::new() }
84 }
85
86 #[must_use]
90 pub fn has_reached_eof(&self) -> bool {
91 self.input.has_reached_eof()
92 }
93
94 #[must_use]
96 pub fn get_position(&self) -> Position {
97 self.input.current_position()
98 }
99
100 #[inline]
133 pub fn advance(&mut self) -> Option<Result<Token<'arena>, SyntaxError>> {
134 if !self.interpolating
135 && let Some(token) = self.buffer.pop_front()
136 {
137 return Some(Ok(token));
138 }
139
140 if self.input.has_reached_eof() {
141 return None;
142 }
143
144 match self.mode {
145 LexerMode::Inline => {
146 let start = self.input.current_position();
147 if self.input.is_at(b"<?", false) {
148 let (kind, buffer) = if self.input.is_at(b"<?php", true) {
149 (TokenKind::OpenTag, self.input.consume(5))
150 } else if self.input.is_at(b"<?=", false) {
151 (TokenKind::EchoTag, self.input.consume(3))
152 } else {
153 (TokenKind::ShortOpenTag, self.input.consume(2))
154 };
155
156 let end = self.input.current_position();
157 let tag = self.token(kind, buffer, start, end);
158
159 self.mode = LexerMode::Script;
160
161 return Some(Ok(tag));
162 }
163
164 if self.input.is_at(b"#!", true) {
165 let buffer = self.input.consume_through(b'\n');
166 let end = self.input.current_position();
167
168 Some(Ok(self.token(TokenKind::InlineShebang, buffer, start, end)))
169 } else {
170 let buffer = self.input.consume_until(b"<?", false);
171 let end = self.input.current_position();
172
173 Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)))
174 }
175 }
176 LexerMode::Script => {
177 let start = self.input.current_position();
178 let whitespaces = self.input.consume_whitespaces();
179 if !whitespaces.is_empty() {
180 return Some(Ok(self.token(
181 TokenKind::Whitespace,
182 whitespaces,
183 start,
184 self.input.current_position(),
185 )));
186 }
187
188 let mut document_label: &[u8] = &[];
189
190 let (token_kind, len) = match self.input.read(3) {
191 [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
192 [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
193 [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
194 [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
195 [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
196 [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
197 [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
198 [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
199 [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
200 [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
201 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
202
203 document_label = self.input.peek(3 + whitespaces, label_length);
204
205 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
206 }
207 [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
208 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
209
210 document_label = self.input.peek(4 + whitespaces, label_length);
211
212 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
213 }
214 [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
215 let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
216
217 document_label = self.input.peek(4 + whitespaces, label_length);
218
219 (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
220 }
221 [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
222 [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
223 [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
224 [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
225 [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
226 [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
227 [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
228 [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
229 [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
230 [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
231 [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
232 [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
233 [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
234 [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
235 [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
236 [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
237 [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
238 [b'>', b'>', ..] => (TokenKind::RightShift, 2),
239 [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
240 [b':', b':', ..] => (TokenKind::ColonColon, 2),
241 [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
242 [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
243 [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
244 [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
245 [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
246 [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
247 [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
248 [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
249 [b'/', b'/', ..] => {
250 let mut length = 2;
251 loop {
252 match self.input.peek(length, 3) {
253 [b'\n' | b'\r', ..] => {
254 break;
255 }
256 [w, b'?', b'>'] if w.is_ascii_whitespace() => {
257 break;
258 }
259 [b'?', b'>', ..] | [] => {
260 break;
261 }
262 [_, ..] => {
263 length += 1;
264 }
265 }
266 }
267
268 (TokenKind::SingleLineComment, length)
269 }
270 [b'/', b'*', asterisk] => {
271 let mut length = 2;
272 let mut is_multiline = false;
273 let mut terminated = false;
274 loop {
275 match self.input.peek(length, 2) {
276 [b'*', b'/'] => {
277 if length == 2 {
278 is_multiline = true;
279 }
280
281 length += 2;
282
283 terminated = true;
284 break;
285 }
286 [_, ..] => {
287 length += 1;
288 }
289 [] => {
290 break;
291 }
292 }
293 }
294
295 if !terminated {
296 self.input.consume(length);
297
298 return Some(Err(SyntaxError::UnexpectedEndOfFile(
299 self.file_id(),
300 self.input.current_position(),
301 )));
302 }
303
304 if !is_multiline && asterisk == &b'*' {
305 (TokenKind::DocBlockComment, length)
306 } else {
307 (TokenKind::MultiLineComment, length)
308 }
309 }
310 [b'\\', start_of_identifier!(), ..] => {
311 let mut length = 2;
312 let mut last_was_slash = false;
313 loop {
314 match self.input.peek(length, 1) {
315 [start_of_identifier!(), ..] if last_was_slash => {
316 length += 1;
317 last_was_slash = false;
318 }
319 [part_of_identifier!(), ..] if !last_was_slash => {
320 length += 1;
321 }
322 [b'\\', ..] => {
323 if last_was_slash {
324 length -= 1;
325
326 break;
327 }
328
329 length += 1;
330 last_was_slash = true;
331 }
332 _ => {
333 break;
334 }
335 }
336 }
337
338 if last_was_slash {
339 length -= 1;
340 }
341
342 (TokenKind::FullyQualifiedIdentifier, length)
343 }
344 [b'$', start_of_identifier!(), ..] => {
345 let mut length = 2;
346 while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
347 length += 1;
348 }
349
350 (TokenKind::Variable, length)
351 }
352 [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
353 [b'$', ..] => (TokenKind::Dollar, 1),
354 [b'@', ..] => (TokenKind::At, 1),
355 [b'!', ..] => (TokenKind::Bang, 1),
356 [b'&', ..] => (TokenKind::Ampersand, 1),
357 [b'?', ..] => (TokenKind::Question, 1),
358 [b'=', ..] => (TokenKind::Equal, 1),
359 [b'`', ..] => (TokenKind::Backtick, 1),
360 [b')', ..] => (TokenKind::RightParenthesis, 1),
361 [b';', ..] => (TokenKind::Semicolon, 1),
362 [b'+', ..] => (TokenKind::Plus, 1),
363 [b'%', ..] => (TokenKind::Percent, 1),
364 [b'-', ..] => (TokenKind::Minus, 1),
365 [b'<', ..] => (TokenKind::LessThan, 1),
366 [b'>', ..] => (TokenKind::GreaterThan, 1),
367 [b',', ..] => (TokenKind::Comma, 1),
368 [b'[', ..] => (TokenKind::LeftBracket, 1),
369 [b']', ..] => (TokenKind::RightBracket, 1),
370 [b'{', ..] => (TokenKind::LeftBrace, 1),
371 [b'}', ..] => (TokenKind::RightBrace, 1),
372 [b':', ..] => (TokenKind::Colon, 1),
373 [b'~', ..] => (TokenKind::Tilde, 1),
374 [b'|', ..] => (TokenKind::Pipe, 1),
375 [b'^', ..] => (TokenKind::Caret, 1),
376 [b'*', ..] => (TokenKind::Asterisk, 1),
377 [b'/', ..] => (TokenKind::Slash, 1),
378 [quote @ b'\'', ..] => read_literal_string(&self.input, *quote),
379 [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
380 read_literal_string(&self.input, *quote)
381 }
382 [b'"', ..] => (TokenKind::DoubleQuote, 1),
383 [b'(', ..] => 'parenthesis: {
384 for (value, kind) in internal::consts::CAST_TYPES {
385 if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
386 break 'parenthesis (kind, length);
387 }
388 }
389
390 (TokenKind::LeftParenthesis, 1)
391 }
392 [b'#', ..] => {
393 let mut length = 1;
394 loop {
395 match self.input.peek(length, 3) {
396 [b'\n' | b'\r', ..] => {
397 break;
398 }
399 [w, b'?', b'>'] if w.is_ascii_whitespace() => {
400 break;
401 }
402 [b'?', b'>', ..] | [] => {
403 break;
404 }
405 [_, ..] => {
406 length += 1;
407 }
408 }
409 }
410
411 (TokenKind::HashComment, length)
412 }
413 [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
414 [start_of_identifier!(), ..] => 'identifier: {
415 let mut length = 1;
416 let mut ended_with_slash = false;
417 loop {
418 match self.input.peek(length, 2) {
419 [part_of_identifier!(), ..] => {
420 length += 1;
421 }
422 [b'\\', start_of_identifier!(), ..] => {
423 ended_with_slash = true;
424 break;
425 }
426 [b'(', ..] if length == 7 => {
428 if self.input.is_at(b"private(set)", true) {
429 break 'identifier (TokenKind::PrivateSet, 7 + 5);
430 }
431
432 break;
433 }
434 [b'(', ..] if length == 6 => {
436 if self.input.is_at(b"public(set)", true) {
437 break 'identifier (TokenKind::PublicSet, 6 + 5);
438 }
439
440 break;
441 }
442 [b'(', ..] if length == 9 => {
444 if self.input.is_at(b"protected(set)", true) {
445 break 'identifier (TokenKind::ProtectedSet, 9 + 5);
446 }
447
448 break;
449 }
450 _ => {
451 break;
452 }
453 }
454 }
455
456 if !ended_with_slash {
457 for (value, kind) in internal::consts::KEYWORD_TYPES {
458 if value.len() != length {
459 continue;
460 }
461
462 if self.input.is_at(value, true) {
463 break 'identifier (kind, value.len());
464 }
465 }
466 }
467
468 let mut slashes = 0;
469 let mut last_was_slash = false;
470 loop {
471 match self.input.peek(length, 1) {
472 [start_of_identifier!(), ..] if last_was_slash => {
473 length += 1;
474 last_was_slash = false;
475 }
476 [part_of_identifier!(), ..] if !last_was_slash => {
477 length += 1;
478 }
479 [b'\\', ..] if !self.interpolating => {
480 if last_was_slash {
481 length -= 1;
482 slashes -= 1;
483 last_was_slash = false;
484
485 break;
486 }
487
488 length += 1;
489 slashes += 1;
490 last_was_slash = true;
491 }
492 _ => {
493 break;
494 }
495 }
496 }
497
498 if last_was_slash {
499 length -= 1;
500 slashes -= 1;
501 }
502
503 if slashes > 0 {
504 (TokenKind::QualifiedIdentifier, length)
505 } else {
506 (TokenKind::Identifier, length)
507 }
508 }
509 [b'.', start_of_number!(), ..] => {
510 let mut length = read_digits_of_base(&self.input, 2, 10);
511 if let float_exponent!() = self.input.peek(length, 1) {
512 length += 1;
513 if let number_sign!() = self.input.peek(length, 1) {
514 length += 1;
515 }
516
517 length = read_digits_of_base(&self.input, length, 10);
518 }
519
520 (TokenKind::LiteralFloat, length)
521 }
522 [start_of_number!(), ..] => 'number: {
523 let mut length = 1;
524
525 let (base, kind): (u8, NumberKind) = match self.input.read(3) {
526 start_of_binary_number!() => {
527 length += 1;
528
529 (2, NumberKind::Integer)
530 }
531 start_of_octal_number!() => {
532 length += 1;
533
534 (8, NumberKind::Integer)
535 }
536 start_of_hexadecimal_number!() => {
537 length += 1;
538
539 (16, NumberKind::Integer)
540 }
541 start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
542 start_of_float_number!() => (10, NumberKind::Float),
543 _ => (10, NumberKind::IntegerOrFloat),
544 };
545
546 if kind != NumberKind::Float {
547 length = read_digits_of_base(&self.input, length, base);
548
549 if kind == NumberKind::Integer {
550 break 'number (TokenKind::LiteralInteger, length);
551 }
552 }
553
554 let is_float = matches!(self.input.peek(length, 3), float_separator!());
555
556 if !is_float {
557 break 'number (TokenKind::LiteralInteger, length);
558 }
559
560 if let [b'.'] = self.input.peek(length, 1) {
561 length += 1;
562 length = read_digits_of_base(&self.input, length, 10);
563 }
564
565 if let float_exponent!() = self.input.peek(length, 1) {
566 length += 1;
567 if let number_sign!() = self.input.peek(length, 1) {
568 length += 1;
569 }
570
571 length = read_digits_of_base(&self.input, length, 10);
572 }
573
574 (TokenKind::LiteralFloat, length)
575 }
576 [b'.', ..] => (TokenKind::Dot, 1),
577 [unknown_byte, ..] => {
578 return Some(Err(SyntaxError::UnrecognizedToken(
579 self.file_id(),
580 *unknown_byte,
581 self.input.current_position(),
582 )));
583 }
584 [] => {
585 unreachable!()
588 }
589 };
590
591 self.mode = match token_kind {
592 TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
593 TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
594 TokenKind::CloseTag => LexerMode::Inline,
595 TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
596 TokenKind::DocumentStart(document_kind) => LexerMode::DocumentString(
597 document_kind,
598 self.arena.alloc_slice_copy(document_label),
599 Interpolation::None,
600 ),
601 _ => LexerMode::Script,
602 };
603
604 let buffer = self.input.consume(len);
605 let end = self.input.current_position();
606
607 Some(Ok(self.token(token_kind, buffer, start, end)))
608 }
609 LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
610 Interpolation::None => {
611 let start = self.input.current_position();
612
613 let mut length = 0;
614 let mut last_was_slash = false;
615 let mut token_kind = TokenKind::StringPart;
616 loop {
617 match self.input.peek(length, 2) {
618 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
619 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
620
621 self.mode =
622 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
623
624 break;
625 }
626 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
627 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
628
629 self.mode =
630 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
631
632 break;
633 }
634 [b'\\', ..] => {
635 length += 1;
636
637 last_was_slash = !last_was_slash;
638 }
639 [b'"', ..] if !last_was_slash => {
640 if length == 0 {
641 length += 1;
642 token_kind = TokenKind::DoubleQuote;
643
644 break;
645 }
646
647 break;
648 }
649 [_, ..] => {
650 length += 1;
651 last_was_slash = false;
652 }
653 [] => {
654 break;
655 }
656 }
657 }
658
659 let buffer = self.input.consume(length);
660 let end = self.input.current_position();
661
662 if TokenKind::DoubleQuote == token_kind {
663 self.mode = LexerMode::Script;
664 }
665
666 Some(Ok(self.token(token_kind, buffer, start, end)))
667 }
668 Interpolation::Until(offset) => {
669 self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None))
670 }
671 },
672 LexerMode::ShellExecuteString(interpolation) => match &interpolation {
673 Interpolation::None => {
674 let start = self.input.current_position();
675
676 let mut length = 0;
677 let mut last_was_slash = false;
678 let mut token_kind = TokenKind::StringPart;
679 loop {
680 match self.input.peek(length, 2) {
681 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
682 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
683
684 self.mode =
685 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
686
687 break;
688 }
689 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
690 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
691
692 self.mode =
693 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
694
695 break;
696 }
697 [b'\\', ..] => {
698 length += 1;
699 last_was_slash = true;
700 }
701 [b'`', ..] if !last_was_slash => {
702 if length == 0 {
703 length += 1;
704 token_kind = TokenKind::Backtick;
705
706 break;
707 }
708
709 break;
710 }
711 [_, ..] => {
712 length += 1;
713 last_was_slash = false;
714 }
715 [] => {
716 break;
717 }
718 }
719 }
720
721 let buffer = self.input.consume(length);
722 let end = self.input.current_position();
723
724 if TokenKind::Backtick == token_kind {
725 self.mode = LexerMode::Script;
726 }
727
728 Some(Ok(self.token(token_kind, buffer, start, end)))
729 }
730 Interpolation::Until(offset) => {
731 self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None))
732 }
733 },
734 LexerMode::DocumentString(kind, label, interpolation) => match &kind {
735 DocumentKind::Heredoc => match &interpolation {
736 Interpolation::None => {
737 let start = self.input.current_position();
738
739 let mut length = 0;
740 let mut last_was_slash = false;
741 let mut only_whitespaces = true;
742 let mut token_kind = TokenKind::StringPart;
743 loop {
744 match self.input.peek(length, 2) {
745 [b'\r', b'\n'] => {
746 length += 2;
747
748 break;
749 }
750 [b'\n' | b'\r', ..] => {
751 length += 1;
752
753 break;
754 }
755 [byte, ..] if byte.is_ascii_whitespace() => {
756 length += 1;
757 }
758 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
759 let until_offset =
760 read_until_end_of_variable_interpolation(&self.input, length + 2);
761
762 self.mode = LexerMode::DocumentString(
763 kind,
764 label,
765 Interpolation::Until(start.offset + until_offset),
766 );
767
768 break;
769 }
770 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
771 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
772
773 self.mode = LexerMode::DocumentString(
774 kind,
775 label,
776 Interpolation::Until(start.offset + until_offset),
777 );
778
779 break;
780 }
781 [b'\\', ..] => {
782 length += 1;
783 last_was_slash = true;
784 only_whitespaces = false;
785 }
786 [_, ..] => {
787 if only_whitespaces
788 && self.input.peek(length, label.len()) == label
789 && self
790 .input
791 .peek(length + label.len(), 1)
792 .first()
793 .is_none_or(|c| !c.is_ascii_alphanumeric())
794 {
795 length += label.len();
796 token_kind = TokenKind::DocumentEnd;
797
798 break;
799 }
800
801 length += 1;
802 last_was_slash = false;
803 only_whitespaces = false;
804 }
805 [] => {
806 break;
807 }
808 }
809 }
810
811 let buffer = self.input.consume(length);
812 let end = self.input.current_position();
813
814 if TokenKind::DocumentEnd == token_kind {
815 self.mode = LexerMode::Script;
816 }
817
818 Some(Ok(self.token(token_kind, buffer, start, end)))
819 }
820 Interpolation::Until(offset) => {
821 self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None))
822 }
823 },
824 DocumentKind::Nowdoc => {
825 let start = self.input.current_position();
826
827 let mut length = 0;
828 let mut terminated = false;
829 let mut only_whitespaces = true;
830
831 loop {
832 match self.input.peek(length, 2) {
833 [b'\r', b'\n'] => {
834 length += 2;
835
836 break;
837 }
838 [b'\n' | b'\r', ..] => {
839 length += 1;
840
841 break;
842 }
843 [byte, ..] if byte.is_ascii_whitespace() => {
844 length += 1;
845 }
846 [_, ..] => {
847 if only_whitespaces
848 && self.input.peek(length, label.len()) == label
849 && self
850 .input
851 .peek(length + label.len(), 1)
852 .first()
853 .is_none_or(|c| !c.is_ascii_alphanumeric())
854 {
855 length += label.len();
856 terminated = true;
857
858 break;
859 }
860
861 only_whitespaces = false;
862 length += 1;
863 }
864 [] => {
865 break;
866 }
867 }
868 }
869
870 let buffer = self.input.consume(length);
871 let end = self.input.current_position();
872
873 if terminated {
874 self.mode = LexerMode::Script;
875
876 return Some(Ok(self.token(TokenKind::DocumentEnd, buffer, start, end)));
877 }
878
879 Some(Ok(self.token(TokenKind::StringPart, buffer, start, end)))
880 }
881 },
882 LexerMode::Halt(stage) => 'halt: {
883 let start = self.input.current_position();
884 if let HaltStage::End = stage {
885 let buffer = self.input.consume_remaining();
886 let end = self.input.current_position();
887
888 break 'halt Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
889 }
890
891 let whitespaces = self.input.consume_whitespaces();
892 if !whitespaces.is_empty() {
893 let end = self.input.current_position();
894
895 break 'halt Some(Ok(self.token(TokenKind::Whitespace, whitespaces, start, end)));
896 }
897
898 match &stage {
899 HaltStage::LookingForLeftParenthesis => {
900 if self.input.is_at(b"(", false) {
901 let buffer = self.input.consume(1);
902 let end = self.input.current_position();
903
904 self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
905
906 Some(Ok(self.token(TokenKind::LeftParenthesis, buffer, start, end)))
907 } else {
908 Some(Err(SyntaxError::UnexpectedToken(
909 self.file_id(),
910 self.input.read(1)[0],
911 self.input.current_position(),
912 )))
913 }
914 }
915 HaltStage::LookingForRightParenthesis => {
916 if self.input.is_at(b")", false) {
917 let buffer = self.input.consume(1);
918 let end = self.input.current_position();
919
920 self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
921
922 Some(Ok(self.token(TokenKind::RightParenthesis, buffer, start, end)))
923 } else {
924 Some(Err(SyntaxError::UnexpectedToken(
925 self.file_id(),
926 self.input.read(1)[0],
927 self.input.current_position(),
928 )))
929 }
930 }
931 HaltStage::LookingForTerminator => {
932 if self.input.is_at(b";", false) {
933 let buffer = self.input.consume(1);
934 let end = self.input.current_position();
935
936 self.mode = LexerMode::Halt(HaltStage::End);
937
938 Some(Ok(self.token(TokenKind::Semicolon, buffer, start, end)))
939 } else if self.input.is_at(b"?>", false) {
940 let buffer = self.input.consume(2);
941 let end = self.input.current_position();
942
943 self.mode = LexerMode::Halt(HaltStage::End);
944
945 Some(Ok(self.token(TokenKind::CloseTag, buffer, start, end)))
946 } else {
947 Some(Err(SyntaxError::UnexpectedToken(
948 self.file_id(),
949 self.input.read(1)[0],
950 self.input.current_position(),
951 )))
952 }
953 }
954 _ => unreachable!(),
955 }
956 }
957 }
958 }
959
960 #[inline]
961 fn token(&mut self, kind: TokenKind, v: &[u8], from: Position, to: Position) -> Token<'arena> {
962 let string = unsafe { std::str::from_utf8_unchecked(v) };
967
968 Token { kind, value: self.arena.alloc_str(string), span: Span::new(self.file_id(), from, to) }
969 }
970
971 #[inline]
972 fn interpolation(
973 &mut self,
974 end_offset: u32,
975 post_interpolation_mode: LexerMode<'arena>,
976 ) -> Option<Result<Token<'arena>, SyntaxError>> {
977 self.mode = LexerMode::Script;
978
979 let was_interpolating = self.interpolating;
980 self.interpolating = true;
981
982 loop {
983 let subsequent_token = self.advance()?.ok()?;
984 let is_final_token = subsequent_token.span.has_offset(end_offset);
985
986 self.buffer.push_back(subsequent_token);
987
988 if is_final_token {
989 break;
990 }
991 }
992
993 self.mode = post_interpolation_mode;
994 self.interpolating = was_interpolating;
995
996 self.advance()
997 }
998}
999
1000impl HasFileId for Lexer<'_, '_> {
1001 #[inline]
1002 fn file_id(&self) -> FileId {
1003 self.input.file_id()
1004 }
1005}
1006
1007#[inline]
1008fn matches_start_of_heredoc_document(input: &Input) -> bool {
1009 let total = input.len();
1010 let base = input.current_offset();
1011
1012 let mut length = 3;
1014 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1016 length += 1;
1017 }
1018
1019 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1021 return false;
1022 }
1023 length += 1; loop {
1027 let pos = base + length;
1028 if pos >= total {
1029 return false; }
1031
1032 let byte = *input.read_at(pos);
1033 if byte == b'\n' {
1034 return true; } else if byte == b'\r' {
1036 return pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1038 } else if is_part_of_identifier(input.read_at(pos)) {
1039 length += 1;
1040 } else {
1041 return false; }
1043 }
1044}
1045
1046#[inline]
1047fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1048 let total = input.len();
1049 let base = input.current_offset();
1050
1051 let mut length = 3;
1053 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1054 length += 1;
1055 }
1056
1057 if base + length >= total || *input.read_at(base + length) != b'"' {
1059 return false;
1060 }
1061 length += 1;
1062
1063 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1065 return false;
1066 }
1067 length += 1;
1068
1069 let mut terminated = false;
1071 loop {
1072 let pos = base + length;
1073 if pos >= total {
1074 return false;
1075 }
1076 let byte = input.read_at(pos);
1077 if *byte == b'\n' {
1078 return terminated;
1080 } else if *byte == b'\r' {
1081 return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1083 } else if !terminated && is_part_of_identifier(byte) {
1084 length += 1;
1085 } else if !terminated && *byte == b'"' {
1086 terminated = true;
1087 length += 1;
1088 } else {
1089 return false;
1090 }
1091 }
1092}
1093
1094#[inline]
1095fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1096 let total = input.len();
1097 let base = input.current_offset();
1098
1099 let mut length = 3;
1101 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1102 length += 1;
1103 }
1104
1105 if base + length >= total || *input.read_at(base + length) != b'\'' {
1107 return false;
1108 }
1109 length += 1;
1110
1111 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1113 return false;
1114 }
1115 length += 1;
1116
1117 let mut terminated = false;
1119 loop {
1120 let pos = base + length;
1121 if pos >= total {
1122 return false;
1123 }
1124 let byte = *input.read_at(pos);
1125 if byte == b'\n' {
1126 return terminated;
1127 } else if byte == b'\r' {
1128 return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1129 } else if !terminated && is_part_of_identifier(&byte) {
1130 length += 1;
1131 } else if !terminated && byte == b'\'' {
1132 terminated = true;
1133 length += 1;
1134 } else {
1135 return false;
1136 }
1137 }
1138}
1139
1140#[inline]
1141fn matches_literal_double_quote_string(input: &Input) -> bool {
1142 let total = input.len();
1143 let base = input.current_offset();
1144
1145 let mut pos = base + 1;
1147 loop {
1148 if pos >= total {
1149 return true;
1151 }
1152 let byte = *input.read_at(pos);
1153 if byte == b'"' {
1154 return true;
1156 } else if byte == b'\\' {
1157 pos += 2;
1159 continue;
1160 }
1161
1162 if pos + 1 < total {
1165 let next = *input.read_at(pos + 1);
1166 if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1167 return false;
1168 }
1169 }
1170 pos += 1;
1171 }
1172}
1173
1174#[inline]
1175fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1176 let total = input.len();
1177 let base = input.current_offset();
1178
1179 let mut pos = base + 3;
1182 let mut whitespaces = 0;
1183 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1184 whitespaces += 1;
1185 pos += 1;
1186 }
1187
1188 let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1194
1195 let mut label_length = 1; let mut terminated = false; loop {
1199 let pos = base + length;
1200 if pos >= total {
1202 unreachable!("Unexpected end of input while reading heredoc label");
1203 }
1204
1205 let byte = *input.read_at(pos);
1206 if byte == b'\n' {
1207 length += 1;
1209 return (length, whitespaces, label_length);
1210 } else if byte == b'\r' {
1211 if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1213 length += 2;
1214 } else {
1215 length += 1;
1216 }
1217 return (length, whitespaces, label_length);
1218 } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1219 length += 1;
1222 label_length += 1;
1223 } else if double_quoted && !terminated && byte == b'"' {
1224 length += 1;
1226 terminated = true;
1227 } else {
1228 unreachable!("Unexpected character encountered in heredoc label");
1229 }
1230 }
1231}
1232
1233#[inline]
1234fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1235 let total = input.len();
1236 let base = input.current_offset();
1237
1238 let mut pos = base + 3;
1240 let mut whitespaces = 0;
1241 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1242 whitespaces += 1;
1243 pos += 1;
1244 }
1245
1246 let mut length = 3 + whitespaces + 2;
1249
1250 let mut label_length = 1;
1252 let mut terminated = false;
1253 loop {
1254 let pos = base + length;
1255 if pos >= total {
1256 unreachable!("Unexpected end of input while reading nowdoc label");
1257 }
1258 let byte = *input.read_at(pos);
1259
1260 if byte == b'\n' {
1261 length += 1;
1263 return (length, whitespaces, label_length);
1264 } else if byte == b'\r' {
1265 if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1267 length += 2;
1268 } else {
1269 length += 1;
1270 }
1271 return (length, whitespaces, label_length);
1272 } else if is_part_of_identifier(&byte) && !terminated {
1273 length += 1;
1275 label_length += 1;
1276 } else if !terminated && byte == b'\'' {
1277 length += 1;
1279 terminated = true;
1280 } else {
1281 unreachable!("Unexpected character encountered in nowdoc label");
1282 }
1283 }
1284}
1285
1286#[inline]
1287fn read_literal_string(input: &Input, quote: u8) -> (TokenKind, usize) {
1288 let total = input.len();
1289 let start = input.current_offset();
1290 let mut length = 1; let mut last_was_backslash = false;
1292 let mut partial = false;
1293
1294 loop {
1295 let pos = start + length;
1296 if pos >= total {
1297 partial = true;
1299 break;
1300 }
1301
1302 let byte = input.read_at(pos);
1303 if *byte == b'\\' {
1304 last_was_backslash = !last_was_backslash;
1306 length += 1;
1307 } else {
1308 if *byte == quote && !last_was_backslash {
1310 length += 1; break;
1312 }
1313 length += 1;
1314 last_was_backslash = false;
1315 }
1316 }
1317
1318 if partial { (TokenKind::PartialLiteralString, length) } else { (TokenKind::LiteralString, length) }
1319}
1320
1321#[inline]
1322fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> u32 {
1323 let total = input.len();
1324 let base = input.current_offset();
1325 let mut offset = from;
1327
1328 loop {
1329 let abs = base + offset;
1330 if abs >= total {
1331 break;
1333 }
1334
1335 if is_part_of_identifier(input.read_at(abs)) {
1337 offset += 1;
1338 continue;
1339 }
1340
1341 if *input.read_at(abs) == b'[' {
1343 offset += 1;
1344 let mut nesting = 0;
1345 loop {
1346 let abs_inner = base + offset;
1347 if abs_inner >= total {
1348 break;
1349 }
1350 let b = input.read_at(abs_inner);
1351 if *b == b']' {
1352 offset += 1;
1353 if nesting == 0 {
1354 break;
1355 }
1356
1357 nesting -= 1;
1358 } else if *b == b'[' {
1359 offset += 1;
1360 nesting += 1;
1361 } else if b.is_ascii_whitespace() {
1362 break;
1364 } else {
1365 offset += 1;
1366 }
1367 }
1368 break;
1370 }
1371
1372 if base + offset + 2 < total
1374 && *input.read_at(abs) == b'-'
1375 && *input.read_at(base + offset + 1) == b'>'
1376 && is_start_of_identifier(input.read_at(base + offset + 2))
1377 {
1378 offset += 3;
1379 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1381 offset += 1;
1382 }
1383 break;
1384 }
1385
1386 if base + offset + 3 < total
1388 && *input.read_at(abs) == b'?'
1389 && *input.read_at(base + offset + 1) == b'-'
1390 && *input.read_at(base + offset + 2) == b'>'
1391 && is_start_of_identifier(input.read_at(base + offset + 3))
1392 {
1393 offset += 4;
1394 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1395 offset += 1;
1396 }
1397 break;
1398 }
1399
1400 break;
1402 }
1403
1404 offset as u32
1405}
1406
1407#[inline]
1408fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> u32 {
1409 let total = input.len();
1410 let base = input.current_offset();
1411 let mut offset = from;
1412 let mut nesting = 0;
1413
1414 loop {
1415 let abs = base + offset;
1416 if abs >= total {
1417 break;
1418 }
1419 match input.read_at(abs) {
1420 b'}' => {
1421 offset += 1;
1422 if nesting == 0 {
1423 break;
1424 }
1425
1426 nesting -= 1;
1427 }
1428 b'{' => {
1429 offset += 1;
1430 nesting += 1;
1431 }
1432 _ => {
1433 offset += 1;
1434 }
1435 }
1436 }
1437
1438 offset as u32
1439}