1use std::collections::VecDeque;
2use std::fmt::Debug;
3
4use bumpalo::Bump;
5use mago_database::file::FileId;
6use mago_database::file::HasFileId;
7use mago_span::Position;
8use mago_span::Span;
9
10use mago_syntax_core::input::Input;
11use mago_syntax_core::utils::is_part_of_identifier;
12use mago_syntax_core::utils::is_start_of_identifier;
13use mago_syntax_core::utils::read_digits_of_base;
14use mago_syntax_core::*;
15
16use crate::error::SyntaxError;
17use crate::lexer::internal::mode::HaltStage;
18use crate::lexer::internal::mode::Interpolation;
19use crate::lexer::internal::mode::LexerMode;
20use crate::lexer::internal::utils::NumberKind;
21use crate::token::DocumentKind;
22use crate::token::Token;
23use crate::token::TokenKind;
24
25mod internal;
26
/// A mode-driven lexer that turns an [`Input`] into [`Token`]s whose values are
/// allocated in a bump arena.
#[derive(Debug)]
pub struct Lexer<'input, 'arena> {
    /// Arena used to allocate token values and heredoc/nowdoc labels.
    arena: &'arena Bump,
    /// The input being consumed.
    input: Input<'input>,
    /// Current lexing mode; selects which branch of `advance` handles the next token.
    mode: LexerMode<'arena>,
    /// `true` while `interpolation` is lexing an interpolated expression ahead of time;
    /// while set, `advance` does not replay tokens from `buffer`.
    interpolating: bool,
    /// Tokens lexed ahead of time by `interpolation`, replayed front-to-back by `advance`.
    buffer: VecDeque<Token<'arena>>,
}
46
47impl<'input, 'arena> Lexer<'input, 'arena> {
48 pub fn new(arena: &'arena Bump, input: Input<'input>) -> Lexer<'input, 'arena> {
59 Lexer { arena, input, mode: LexerMode::Inline, interpolating: false, buffer: VecDeque::new() }
60 }
61
62 pub fn scripting(arena: &'arena Bump, input: Input<'input>) -> Lexer<'input, 'arena> {
73 Lexer { arena, input, mode: LexerMode::Script, interpolating: false, buffer: VecDeque::new() }
74 }
75
    /// Returns whether the underlying input reports that it has reached its end.
    ///
    /// This reflects the input only; it does not look at tokens that may still be
    /// waiting in the lexer's internal buffer.
    pub fn has_reached_eof(&self) -> bool {
        self.input.has_reached_eof()
    }
82
    /// Returns the current position of the underlying input.
    pub fn get_position(&self) -> Position {
        self.input.current_position()
    }
87
88 #[inline]
121 pub fn advance(&mut self) -> Option<Result<Token<'arena>, SyntaxError>> {
122 if !self.interpolating
123 && let Some(token) = self.buffer.pop_front()
124 {
125 return Some(Ok(token));
126 }
127
128 if self.input.has_reached_eof() {
129 return None;
130 }
131
132 match self.mode {
133 LexerMode::Inline => {
134 let start = self.input.current_position();
135 if self.input.is_at(b"<?", false) {
136 let (kind, buffer) = if self.input.is_at(b"<?php", true) {
137 (TokenKind::OpenTag, self.input.consume(5))
138 } else if self.input.is_at(b"<?=", false) {
139 (TokenKind::EchoTag, self.input.consume(3))
140 } else {
141 (TokenKind::ShortOpenTag, self.input.consume(2))
142 };
143
144 let end = self.input.current_position();
145 let tag = self.token(kind, buffer, start, end);
146
147 self.mode = LexerMode::Script;
148
149 return tag;
150 }
151
152 if self.input.is_at(b"#!", true) {
153 let buffer = self.input.consume_through(b'\n');
154 let end = self.input.current_position();
155
156 self.token(TokenKind::InlineShebang, buffer, start, end)
157 } else {
158 let buffer = self.input.consume_until(b"<?", false);
159 let end = self.input.current_position();
160
161 self.token(TokenKind::InlineText, buffer, start, end)
162 }
163 }
164 LexerMode::Script => {
165 let start = self.input.current_position();
166 let whitespaces = self.input.consume_whitespaces();
167 if !whitespaces.is_empty() {
168 return self.token(TokenKind::Whitespace, whitespaces, start, self.input.current_position());
169 }
170
171 let mut document_label: &[u8] = &[];
172
173 let (token_kind, len) = match self.input.read(3) {
174 [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
175 [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
176 [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
177 [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
178 [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
179 [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
180 [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
181 [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
182 [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
183 [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
184 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
185
186 document_label = self.input.peek(3 + whitespaces, label_length);
187
188 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
189 }
190 [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
191 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
192
193 document_label = self.input.peek(4 + whitespaces, label_length);
194
195 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
196 }
197 [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
198 let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
199
200 document_label = self.input.peek(4 + whitespaces, label_length);
201
202 (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
203 }
204 [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
205 [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
206 [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
207 [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
208 [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
209 [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
210 [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
211 [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
212 [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
213 [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
214 [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
215 [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
216 [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
217 [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
218 [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
219 [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
220 [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
221 [b'>', b'>', ..] => (TokenKind::RightShift, 2),
222 [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
223 [b':', b':', ..] => (TokenKind::ColonColon, 2),
224 [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
225 [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
226 [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
227 [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
228 [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
229 [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
230 [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
231 [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
232 [b'/', b'/', ..] => {
233 let mut length = 2;
234 loop {
235 match self.input.peek(length, 3) {
236 [b'\n' | b'\r', ..] => {
237 break;
238 }
239 [w, b'?', b'>'] if w.is_ascii_whitespace() => {
240 break;
241 }
242 [b'?', b'>', ..] | [] => {
243 break;
244 }
245 [_, ..] => {
246 length += 1;
247 }
248 }
249 }
250
251 (TokenKind::SingleLineComment, length)
252 }
253 [b'/', b'*', asterisk] => {
254 let mut length = 2;
255 let mut is_multiline = false;
256 let mut terminated = false;
257 loop {
258 match self.input.peek(length, 2) {
259 [b'*', b'/'] => {
260 if length == 2 {
261 is_multiline = true;
262 }
263
264 length += 2;
265
266 terminated = true;
267 break;
268 }
269 [_, ..] => {
270 length += 1;
271 }
272 [] => {
273 break;
274 }
275 }
276 }
277
278 if !terminated {
279 self.input.consume(length);
280
281 return Some(Err(SyntaxError::UnexpectedEndOfFile(
282 self.file_id(),
283 self.input.current_position(),
284 )));
285 }
286
287 if !is_multiline && asterisk == &b'*' {
288 (TokenKind::DocBlockComment, length)
289 } else {
290 (TokenKind::MultiLineComment, length)
291 }
292 }
293 [b'\\', start_of_identifier!(), ..] => {
294 let mut length = 2;
295 let mut last_was_slash = false;
296 loop {
297 match self.input.peek(length, 1) {
298 [start_of_identifier!(), ..] if last_was_slash => {
299 length += 1;
300 last_was_slash = false;
301 }
302 [part_of_identifier!(), ..] if !last_was_slash => {
303 length += 1;
304 }
305 [b'\\', ..] => {
306 if last_was_slash {
307 length -= 1;
308
309 break;
310 }
311
312 length += 1;
313 last_was_slash = true;
314 }
315 _ => {
316 break;
317 }
318 }
319 }
320
321 if last_was_slash {
322 length -= 1;
323 }
324
325 (TokenKind::FullyQualifiedIdentifier, length)
326 }
327 [b'$', start_of_identifier!(), ..] => {
328 let mut length = 2;
329 while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
330 length += 1;
331 }
332
333 (TokenKind::Variable, length)
334 }
335 [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
336 [b'$', ..] => (TokenKind::Dollar, 1),
337 [b'@', ..] => (TokenKind::At, 1),
338 [b'!', ..] => (TokenKind::Bang, 1),
339 [b'&', ..] => (TokenKind::Ampersand, 1),
340 [b'?', ..] => (TokenKind::Question, 1),
341 [b'=', ..] => (TokenKind::Equal, 1),
342 [b'`', ..] => (TokenKind::Backtick, 1),
343 [b')', ..] => (TokenKind::RightParenthesis, 1),
344 [b';', ..] => (TokenKind::Semicolon, 1),
345 [b'+', ..] => (TokenKind::Plus, 1),
346 [b'%', ..] => (TokenKind::Percent, 1),
347 [b'-', ..] => (TokenKind::Minus, 1),
348 [b'<', ..] => (TokenKind::LessThan, 1),
349 [b'>', ..] => (TokenKind::GreaterThan, 1),
350 [b',', ..] => (TokenKind::Comma, 1),
351 [b'[', ..] => (TokenKind::LeftBracket, 1),
352 [b']', ..] => (TokenKind::RightBracket, 1),
353 [b'{', ..] => (TokenKind::LeftBrace, 1),
354 [b'}', ..] => (TokenKind::RightBrace, 1),
355 [b':', ..] => (TokenKind::Colon, 1),
356 [b'~', ..] => (TokenKind::Tilde, 1),
357 [b'|', ..] => (TokenKind::Pipe, 1),
358 [b'^', ..] => (TokenKind::Caret, 1),
359 [b'*', ..] => (TokenKind::Asterisk, 1),
360 [b'/', ..] => (TokenKind::Slash, 1),
361 [quote @ b'\'', ..] => read_literal_string(&self.input, quote),
362 [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
363 read_literal_string(&self.input, quote)
364 }
365 [b'"', ..] => (TokenKind::DoubleQuote, 1),
366 [b'(', ..] => 'parenthesis: {
367 for (value, kind) in internal::consts::CAST_TYPES {
368 if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
369 break 'parenthesis (kind, length);
370 }
371 }
372
373 (TokenKind::LeftParenthesis, 1)
374 }
375 [b'#', ..] => {
376 let mut length = 1;
377 loop {
378 match self.input.peek(length, 3) {
379 [b'\n' | b'\r', ..] => {
380 break;
381 }
382 [w, b'?', b'>'] if w.is_ascii_whitespace() => {
383 break;
384 }
385 [b'?', b'>', ..] | [] => {
386 break;
387 }
388 [_, ..] => {
389 length += 1;
390 }
391 }
392 }
393
394 (TokenKind::HashComment, length)
395 }
396 [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
397 [start_of_identifier!(), ..] => 'identifier: {
398 let mut length = 1;
399 let mut ended_with_slash = false;
400 loop {
401 match self.input.peek(length, 2) {
402 [part_of_identifier!(), ..] => {
403 length += 1;
404 }
405 [b'\\', start_of_identifier!(), ..] => {
406 ended_with_slash = true;
407 break;
408 }
409 [b'(', ..] if length == 7 => {
411 if self.input.is_at(b"private(set)", true) {
412 break 'identifier (TokenKind::PrivateSet, 7 + 5);
413 }
414
415 break;
416 }
417 [b'(', ..] if length == 6 => {
419 if self.input.is_at(b"public(set)", true) {
420 break 'identifier (TokenKind::PublicSet, 6 + 5);
421 }
422
423 break;
424 }
425 [b'(', ..] if length == 9 => {
427 if self.input.is_at(b"protected(set)", true) {
428 break 'identifier (TokenKind::ProtectedSet, 9 + 5);
429 }
430
431 break;
432 }
433 _ => {
434 break;
435 }
436 }
437 }
438
439 if !ended_with_slash {
440 for (value, kind) in internal::consts::KEYWORD_TYPES {
441 if value.len() != length {
442 continue;
443 }
444
445 if self.input.is_at(value, true) {
446 break 'identifier (kind, value.len());
447 }
448 }
449 }
450
451 let mut slashes = 0;
452 let mut last_was_slash = false;
453 loop {
454 match self.input.peek(length, 1) {
455 [start_of_identifier!(), ..] if last_was_slash => {
456 length += 1;
457 last_was_slash = false;
458 }
459 [part_of_identifier!(), ..] if !last_was_slash => {
460 length += 1;
461 }
462 [b'\\', ..] if !self.interpolating => {
463 if !last_was_slash {
464 length += 1;
465 slashes += 1;
466 last_was_slash = true;
467 } else {
468 length -= 1;
469 slashes -= 1;
470 last_was_slash = false;
471
472 break;
473 }
474 }
475 _ => {
476 break;
477 }
478 }
479 }
480
481 if last_was_slash {
482 length -= 1;
483 slashes -= 1;
484 }
485
486 if slashes > 0 {
487 (TokenKind::QualifiedIdentifier, length)
488 } else {
489 (TokenKind::Identifier, length)
490 }
491 }
492 [b'.', start_of_number!(), ..] => {
493 let mut length = read_digits_of_base(&self.input, 2, 10);
494 if let float_exponent!() = self.input.peek(length, 1) {
495 length += 1;
496 if let number_sign!() = self.input.peek(length, 1) {
497 length += 1;
498 }
499
500 length = read_digits_of_base(&self.input, length, 10);
501 }
502
503 (TokenKind::LiteralFloat, length)
504 }
505 [start_of_number!(), ..] => 'number: {
506 let mut length = 1;
507
508 let (base, kind): (u8, NumberKind) = match self.input.read(3) {
509 start_of_binary_number!() => {
510 length += 1;
511
512 (2, NumberKind::Integer)
513 }
514 start_of_octal_number!() => {
515 length += 1;
516
517 (8, NumberKind::Integer)
518 }
519 start_of_hexadecimal_number!() => {
520 length += 1;
521
522 (16, NumberKind::Integer)
523 }
524 start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
525 start_of_float_number!() => (10, NumberKind::Float),
526 _ => (10, NumberKind::IntegerOrFloat),
527 };
528
529 if kind != NumberKind::Float {
530 length = read_digits_of_base(&self.input, length, base);
531
532 if kind == NumberKind::Integer {
533 break 'number (TokenKind::LiteralInteger, length);
534 }
535 }
536
537 let is_float = matches!(self.input.peek(length, 3), float_separator!());
538
539 if !is_float {
540 break 'number (TokenKind::LiteralInteger, length);
541 }
542
543 if let [b'.'] = self.input.peek(length, 1) {
544 length += 1;
545 length = read_digits_of_base(&self.input, length, 10);
546 }
547
548 if let float_exponent!() = self.input.peek(length, 1) {
549 length += 1;
550 if let number_sign!() = self.input.peek(length, 1) {
551 length += 1;
552 }
553
554 length = read_digits_of_base(&self.input, length, 10);
555 }
556
557 (TokenKind::LiteralFloat, length)
558 }
559 [b'.', ..] => (TokenKind::Dot, 1),
560 [unknown_byte, ..] => {
561 return Some(Err(SyntaxError::UnrecognizedToken(
562 self.file_id(),
563 *unknown_byte,
564 self.input.current_position(),
565 )));
566 }
567 [] => {
568 unreachable!()
571 }
572 };
573
574 self.mode = match token_kind {
575 TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
576 TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
577 TokenKind::CloseTag => LexerMode::Inline,
578 TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
579 TokenKind::DocumentStart(document_kind) => LexerMode::DocumentString(
580 document_kind,
581 self.arena.alloc_slice_copy(document_label),
582 Interpolation::None,
583 ),
584 _ => LexerMode::Script,
585 };
586
587 let buffer = self.input.consume(len);
588 let end = self.input.current_position();
589
590 self.token(token_kind, buffer, start, end)
591 }
592 LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
593 Interpolation::None => {
594 let start = self.input.current_position();
595
596 let mut length = 0;
597 let mut last_was_slash = false;
598 let mut token_kind = TokenKind::StringPart;
599 loop {
600 match self.input.peek(length, 2) {
601 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
602 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
603
604 self.mode =
605 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
606
607 break;
608 }
609 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
610 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
611
612 self.mode =
613 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
614
615 break;
616 }
617 [b'\\', ..] => {
618 length += 1;
619
620 last_was_slash = !last_was_slash;
621 }
622 [b'"', ..] if !last_was_slash => {
623 if length == 0 {
624 length += 1;
625 token_kind = TokenKind::DoubleQuote;
626
627 break;
628 }
629
630 break;
631 }
632 [_, ..] => {
633 length += 1;
634 last_was_slash = false;
635 }
636 [] => {
637 break;
638 }
639 }
640 }
641
642 let buffer = self.input.consume(length);
643 let end = self.input.current_position();
644
645 if TokenKind::DoubleQuote == token_kind {
646 self.mode = LexerMode::Script;
647 }
648
649 self.token(token_kind, buffer, start, end)
650 }
651 Interpolation::Until(offset) => {
652 self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None))
653 }
654 },
655 LexerMode::ShellExecuteString(interpolation) => match &interpolation {
656 Interpolation::None => {
657 let start = self.input.current_position();
658
659 let mut length = 0;
660 let mut last_was_slash = false;
661 let mut token_kind = TokenKind::StringPart;
662 loop {
663 match self.input.peek(length, 2) {
664 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
665 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
666
667 self.mode =
668 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
669
670 break;
671 }
672 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
673 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
674
675 self.mode =
676 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
677
678 break;
679 }
680 [b'\\', ..] => {
681 length += 1;
682 last_was_slash = true;
683 }
684 [b'`', ..] if !last_was_slash => {
685 if length == 0 {
686 length += 1;
687 token_kind = TokenKind::Backtick;
688
689 break;
690 }
691
692 break;
693 }
694 [_, ..] => {
695 length += 1;
696 last_was_slash = false;
697 }
698 [] => {
699 break;
700 }
701 }
702 }
703
704 let buffer = self.input.consume(length);
705 let end = self.input.current_position();
706
707 if TokenKind::Backtick == token_kind {
708 self.mode = LexerMode::Script;
709 }
710
711 self.token(token_kind, buffer, start, end)
712 }
713 Interpolation::Until(offset) => {
714 self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None))
715 }
716 },
717 LexerMode::DocumentString(kind, label, interpolation) => match &kind {
718 DocumentKind::Heredoc => match &interpolation {
719 Interpolation::None => {
720 let start = self.input.current_position();
721
722 let mut length = 0;
723 let mut last_was_slash = false;
724 let mut only_whitespaces = true;
725 let mut token_kind = TokenKind::StringPart;
726 loop {
727 match self.input.peek(length, 2) {
728 [b'\r', b'\n'] => {
729 length += 2;
730
731 break;
732 }
733 [b'\n', ..] | [b'\r', ..] => {
734 length += 1;
735
736 break;
737 }
738 [byte, ..] if byte.is_ascii_whitespace() => {
739 length += 1;
740 }
741 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
742 let until_offset =
743 read_until_end_of_variable_interpolation(&self.input, length + 2);
744
745 self.mode = LexerMode::DocumentString(
746 kind,
747 label,
748 Interpolation::Until(start.offset + until_offset),
749 );
750
751 break;
752 }
753 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
754 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
755
756 self.mode = LexerMode::DocumentString(
757 kind,
758 label,
759 Interpolation::Until(start.offset + until_offset),
760 );
761
762 break;
763 }
764 [b'\\', ..] => {
765 length += 1;
766 last_was_slash = true;
767 only_whitespaces = false;
768 }
769 [_, ..] => {
770 if only_whitespaces
771 && self.input.peek(length, label.len()) == label
772 && self
773 .input
774 .peek(length + label.len(), 1)
775 .first()
776 .is_none_or(|c| !c.is_ascii_alphanumeric())
777 {
778 length += label.len();
779 token_kind = TokenKind::DocumentEnd;
780
781 break;
782 }
783
784 length += 1;
785 last_was_slash = false;
786 only_whitespaces = false;
787 }
788 [] => {
789 break;
790 }
791 }
792 }
793
794 let buffer = self.input.consume(length);
795 let end = self.input.current_position();
796
797 if TokenKind::DocumentEnd == token_kind {
798 self.mode = LexerMode::Script;
799 }
800
801 self.token(token_kind, buffer, start, end)
802 }
803 Interpolation::Until(offset) => {
804 self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None))
805 }
806 },
807 DocumentKind::Nowdoc => {
808 let start = self.input.current_position();
809
810 let mut length = 0;
811 let mut terminated = false;
812 let mut only_whitespaces = true;
813
814 loop {
815 match self.input.peek(length, 2) {
816 [b'\r', b'\n'] => {
817 length += 2;
818
819 break;
820 }
821 [b'\n', ..] | [b'\r', ..] => {
822 length += 1;
823
824 break;
825 }
826 [byte, ..] if byte.is_ascii_whitespace() => {
827 length += 1;
828 }
829 [_, ..] => {
830 if only_whitespaces
831 && self.input.peek(length, label.len()) == label
832 && self
833 .input
834 .peek(length + label.len(), 1)
835 .first()
836 .is_none_or(|c| !c.is_ascii_alphanumeric())
837 {
838 length += label.len();
839 terminated = true;
840
841 break;
842 }
843
844 only_whitespaces = false;
845 length += 1;
846 }
847 [] => {
848 break;
849 }
850 }
851 }
852
853 let buffer = self.input.consume(length);
854 let end = self.input.current_position();
855
856 if terminated {
857 self.mode = LexerMode::Script;
858
859 return self.token(TokenKind::DocumentEnd, buffer, start, end);
860 }
861
862 self.token(TokenKind::StringPart, buffer, start, end)
863 }
864 },
865 LexerMode::Halt(stage) => 'halt: {
866 let start = self.input.current_position();
867 if let HaltStage::End = stage {
868 let buffer = self.input.consume_remaining();
869 let end = self.input.current_position();
870
871 break 'halt self.token(TokenKind::InlineText, buffer, start, end);
872 }
873
874 let whitespaces = self.input.consume_whitespaces();
875 if !whitespaces.is_empty() {
876 let end = self.input.current_position();
877
878 break 'halt self.token(TokenKind::Whitespace, whitespaces, start, end);
879 }
880
881 match &stage {
882 HaltStage::LookingForLeftParenthesis => {
883 if self.input.is_at(b"(", false) {
884 let buffer = self.input.consume(1);
885 let end = self.input.current_position();
886
887 self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
888
889 self.token(TokenKind::LeftParenthesis, buffer, start, end)
890 } else {
891 Some(Err(SyntaxError::UnexpectedToken(
892 self.file_id(),
893 self.input.read(1)[0],
894 self.input.current_position(),
895 )))
896 }
897 }
898 HaltStage::LookingForRightParenthesis => {
899 if self.input.is_at(b")", false) {
900 let buffer = self.input.consume(1);
901 let end = self.input.current_position();
902
903 self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
904
905 self.token(TokenKind::RightParenthesis, buffer, start, end)
906 } else {
907 Some(Err(SyntaxError::UnexpectedToken(
908 self.file_id(),
909 self.input.read(1)[0],
910 self.input.current_position(),
911 )))
912 }
913 }
914 HaltStage::LookingForTerminator => {
915 if self.input.is_at(b";", false) {
916 let buffer = self.input.consume(1);
917 let end = self.input.current_position();
918
919 self.mode = LexerMode::Halt(HaltStage::End);
920
921 self.token(TokenKind::Semicolon, buffer, start, end)
922 } else if self.input.is_at(b"?>", false) {
923 let buffer = self.input.consume(2);
924 let end = self.input.current_position();
925
926 self.mode = LexerMode::Halt(HaltStage::End);
927
928 self.token(TokenKind::CloseTag, buffer, start, end)
929 } else {
930 Some(Err(SyntaxError::UnexpectedToken(
931 self.file_id(),
932 self.input.read(1)[0],
933 self.input.current_position(),
934 )))
935 }
936 }
937 _ => unreachable!(),
938 }
939 }
940 }
941 }
942
943 #[inline]
944 fn token(
945 &mut self,
946 kind: TokenKind,
947 v: &[u8],
948 from: Position,
949 to: Position,
950 ) -> Option<Result<Token<'arena>, SyntaxError>> {
951 let string = String::from_utf8_lossy(v);
952
953 Some(Ok(Token { kind, value: self.arena.alloc_str(&string), span: Span::new(self.file_id(), from, to) }))
954 }
955
956 #[inline]
957 fn interpolation(
958 &mut self,
959 end_offset: u32,
960 post_interpolation_mode: LexerMode<'arena>,
961 ) -> Option<Result<Token<'arena>, SyntaxError>> {
962 self.mode = LexerMode::Script;
963
964 let was_interpolating = self.interpolating;
965 self.interpolating = true;
966
967 loop {
968 let subsequent_token = self.advance()?.ok()?;
969 let is_final_token = subsequent_token.span.has_offset(end_offset);
970
971 self.buffer.push_back(subsequent_token);
972
973 if is_final_token {
974 break;
975 }
976 }
977
978 self.mode = post_interpolation_mode;
979 self.interpolating = was_interpolating;
980
981 self.advance()
982 }
983}
984
/// Exposes the file identifier of the input being lexed.
impl HasFileId for Lexer<'_, '_> {
    /// Returns the [`FileId`] reported by the underlying input.
    #[inline]
    fn file_id(&self) -> FileId {
        self.input.file_id()
    }
}
991
992#[inline]
993fn matches_start_of_heredoc_document(input: &Input) -> bool {
994 let total = input.len();
995 let base = input.current_offset();
996
997 let mut length = 3;
999 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1001 length += 1;
1002 }
1003
1004 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1006 return false;
1007 }
1008 length += 1; loop {
1012 let pos = base + length;
1013 if pos >= total {
1014 return false; }
1016
1017 let byte = *input.read_at(pos);
1018 if byte == b'\n' {
1019 return true; } else if byte == b'\r' {
1021 return pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1023 } else if is_part_of_identifier(input.read_at(pos)) {
1024 length += 1;
1025 } else {
1026 return false; }
1028 }
1029}
1030
1031#[inline]
1032fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1033 let total = input.len();
1034 let base = input.current_offset();
1035
1036 let mut length = 3;
1038 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1039 length += 1;
1040 }
1041
1042 if base + length >= total || *input.read_at(base + length) != b'"' {
1044 return false;
1045 }
1046 length += 1;
1047
1048 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1050 return false;
1051 }
1052 length += 1;
1053
1054 let mut terminated = false;
1056 loop {
1057 let pos = base + length;
1058 if pos >= total {
1059 return false;
1060 }
1061 let byte = input.read_at(pos);
1062 if *byte == b'\n' {
1063 return terminated;
1065 } else if *byte == b'\r' {
1066 return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1068 } else if !terminated && is_part_of_identifier(byte) {
1069 length += 1;
1070 } else if !terminated && *byte == b'"' {
1071 terminated = true;
1072 length += 1;
1073 } else {
1074 return false;
1075 }
1076 }
1077}
1078
1079#[inline]
1080fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1081 let total = input.len();
1082 let base = input.current_offset();
1083
1084 let mut length = 3;
1086 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1087 length += 1;
1088 }
1089
1090 if base + length >= total || *input.read_at(base + length) != b'\'' {
1092 return false;
1093 }
1094 length += 1;
1095
1096 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1098 return false;
1099 }
1100 length += 1;
1101
1102 let mut terminated = false;
1104 loop {
1105 let pos = base + length;
1106 if pos >= total {
1107 return false;
1108 }
1109 let byte = *input.read_at(pos);
1110 if byte == b'\n' {
1111 return terminated;
1112 } else if byte == b'\r' {
1113 return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1114 } else if !terminated && is_part_of_identifier(&byte) {
1115 length += 1;
1116 } else if !terminated && byte == b'\'' {
1117 terminated = true;
1118 length += 1;
1119 } else {
1120 return false;
1121 }
1122 }
1123}
1124
1125#[inline]
1126fn matches_literal_double_quote_string(input: &Input) -> bool {
1127 let total = input.len();
1128 let base = input.current_offset();
1129
1130 let mut pos = base + 1;
1132 loop {
1133 if pos >= total {
1134 return true;
1136 }
1137 let byte = *input.read_at(pos);
1138 if byte == b'"' {
1139 return true;
1141 } else if byte == b'\\' {
1142 pos += 2;
1144 continue;
1145 } else {
1146 if pos + 1 < total {
1149 let next = *input.read_at(pos + 1);
1150 if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1151 return false;
1152 }
1153 }
1154 pos += 1;
1155 }
1156 }
1157}
1158
1159#[inline]
1160fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1161 let total = input.len();
1162 let base = input.current_offset();
1163
1164 let mut pos = base + 3;
1167 let mut whitespaces = 0;
1168 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1169 whitespaces += 1;
1170 pos += 1;
1171 }
1172
1173 let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1179
1180 let mut label_length = 1; let mut terminated = false; loop {
1184 let pos = base + length;
1185 if pos >= total {
1187 unreachable!("Unexpected end of input while reading heredoc label");
1188 }
1189
1190 let byte = *input.read_at(pos);
1191 if byte == b'\n' {
1192 length += 1;
1194 return (length, whitespaces, label_length);
1195 } else if byte == b'\r' {
1196 if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1198 length += 2;
1199 } else {
1200 length += 1;
1201 }
1202 return (length, whitespaces, label_length);
1203 } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1204 length += 1;
1207 label_length += 1;
1208 } else if double_quoted && !terminated && byte == b'"' {
1209 length += 1;
1211 terminated = true;
1212 } else {
1213 unreachable!("Unexpected character encountered in heredoc label");
1214 }
1215 }
1216}
1217
1218#[inline]
1219fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1220 let total = input.len();
1221 let base = input.current_offset();
1222
1223 let mut pos = base + 3;
1225 let mut whitespaces = 0;
1226 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1227 whitespaces += 1;
1228 pos += 1;
1229 }
1230
1231 let mut length = 3 + whitespaces + 2;
1234
1235 let mut label_length = 1;
1237 let mut terminated = false;
1238 loop {
1239 let pos = base + length;
1240 if pos >= total {
1241 unreachable!("Unexpected end of input while reading nowdoc label");
1242 }
1243 let byte = *input.read_at(pos);
1244
1245 if byte == b'\n' {
1246 length += 1;
1248 return (length, whitespaces, label_length);
1249 } else if byte == b'\r' {
1250 if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1252 length += 2;
1253 } else {
1254 length += 1;
1255 }
1256 return (length, whitespaces, label_length);
1257 } else if is_part_of_identifier(&byte) && !terminated {
1258 length += 1;
1260 label_length += 1;
1261 } else if !terminated && byte == b'\'' {
1262 length += 1;
1264 terminated = true;
1265 } else {
1266 unreachable!("Unexpected character encountered in nowdoc label");
1267 }
1268 }
1269}
1270
1271#[inline]
1272fn read_literal_string(input: &Input, quote: &u8) -> (TokenKind, usize) {
1273 let total = input.len();
1274 let start = input.current_offset();
1275 let mut length = 1; let mut last_was_backslash = false;
1277 let mut partial = false;
1278
1279 loop {
1280 let pos = start + length;
1281 if pos >= total {
1282 partial = true;
1284 break;
1285 }
1286
1287 let byte = input.read_at(pos);
1288 if *byte == b'\\' {
1289 last_was_backslash = !last_was_backslash;
1291 length += 1;
1292 } else {
1293 if *byte == *quote && !last_was_backslash {
1295 length += 1; break;
1297 }
1298 length += 1;
1299 last_was_backslash = false;
1300 }
1301 }
1302
1303 if partial { (TokenKind::PartialLiteralString, length) } else { (TokenKind::LiteralString, length) }
1304}
1305
1306#[inline]
1307fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> u32 {
1308 let total = input.len();
1309 let base = input.current_offset();
1310 let mut offset = from;
1312
1313 loop {
1314 let abs = base + offset;
1315 if abs >= total {
1316 break;
1318 }
1319
1320 if is_part_of_identifier(input.read_at(abs)) {
1322 offset += 1;
1323 continue;
1324 }
1325
1326 if *input.read_at(abs) == b'[' {
1328 offset += 1;
1329 let mut nesting = 0;
1330 loop {
1331 let abs_inner = base + offset;
1332 if abs_inner >= total {
1333 break;
1334 }
1335 let b = input.read_at(abs_inner);
1336 if *b == b']' {
1337 offset += 1;
1338 if nesting == 0 {
1339 break;
1340 } else {
1341 nesting -= 1;
1342 }
1343 } else if *b == b'[' {
1344 offset += 1;
1345 nesting += 1;
1346 } else if b.is_ascii_whitespace() {
1347 break;
1349 } else {
1350 offset += 1;
1351 }
1352 }
1353 break;
1355 }
1356
1357 if base + offset + 2 < total
1359 && *input.read_at(abs) == b'-'
1360 && *input.read_at(base + offset + 1) == b'>'
1361 && is_start_of_identifier(input.read_at(base + offset + 2))
1362 {
1363 offset += 3;
1364 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1366 offset += 1;
1367 }
1368 break;
1369 }
1370
1371 if base + offset + 3 < total
1373 && *input.read_at(abs) == b'?'
1374 && *input.read_at(base + offset + 1) == b'-'
1375 && *input.read_at(base + offset + 2) == b'>'
1376 && is_start_of_identifier(input.read_at(base + offset + 3))
1377 {
1378 offset += 4;
1379 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1380 offset += 1;
1381 }
1382 break;
1383 }
1384
1385 break;
1387 }
1388
1389 offset as u32
1390}
1391
1392#[inline]
1393fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> u32 {
1394 let total = input.len();
1395 let base = input.current_offset();
1396 let mut offset = from;
1397 let mut nesting = 0;
1398
1399 loop {
1400 let abs = base + offset;
1401 if abs >= total {
1402 break;
1403 }
1404 match input.read_at(abs) {
1405 b'}' => {
1406 offset += 1;
1407 if nesting == 0 {
1408 break;
1409 } else {
1410 nesting -= 1;
1411 }
1412 }
1413 b'{' => {
1414 offset += 1;
1415 nesting += 1;
1416 }
1417 _ => {
1418 offset += 1;
1419 }
1420 }
1421 }
1422
1423 offset as u32
1424}