1use std::fmt::Debug;
2use std::mem;
3
4use mago_interner::ThreadedInterner;
5use mago_span::Position;
6use mago_span::Span;
7use mago_token::DocumentKind;
8use mago_token::Token;
9use mago_token::TokenKind;
10
11use crate::error::SyntaxError;
12use crate::input::Input;
13use crate::internal::macros::float_exponent;
14use crate::internal::macros::float_separator;
15use crate::internal::macros::number_separator;
16use crate::internal::macros::number_sign;
17use crate::internal::macros::part_of_identifier;
18use crate::internal::macros::start_of_binary_number;
19use crate::internal::macros::start_of_float_number;
20use crate::internal::macros::start_of_hexadecimal_number;
21use crate::internal::macros::start_of_identifier;
22use crate::internal::macros::start_of_number;
23use crate::internal::macros::start_of_octal_number;
24use crate::internal::macros::start_of_octal_or_float_number;
25use crate::internal::mode::HaltStage;
26use crate::internal::mode::Interpolation;
27use crate::internal::mode::LexerMode;
28use crate::internal::utils::NumberKind;
29
30pub mod error;
31pub mod input;
32
33mod internal;
34
35#[derive(Debug)]
47pub struct Lexer<'a, 'i> {
48 interner: &'i ThreadedInterner,
49 input: Input<'a>,
50 mode: LexerMode<'a>,
51 interpolating: bool,
52}
53
54impl<'a, 'i> Lexer<'a, 'i> {
55 pub fn new(interner: &'i ThreadedInterner, input: Input<'a>) -> Lexer<'a, 'i> {
66 Lexer { interner, input, mode: LexerMode::Inline, interpolating: false }
67 }
68
69 pub fn scripting(interner: &'i ThreadedInterner, input: Input<'a>) -> Lexer<'a, 'i> {
80 Lexer { interner, input, mode: LexerMode::Script, interpolating: false }
81 }
82
83 pub fn has_reached_eof(&self) -> bool {
87 self.input.has_reached_eof()
88 }
89
90 pub fn get_position(&self) -> Position {
92 self.input.position()
93 }
94
95 #[inline]
151 pub fn advance(&mut self) -> Option<Result<Token, SyntaxError>> {
152 if self.input.has_reached_eof() {
153 return None;
154 }
155
156 match self.mode {
157 LexerMode::Inline => {
158 let start = self.input.position();
159 if self.input.is_at(b"<?", false) {
160 let (kind, buffer) = if self.input.is_at(b"<?php", true) {
161 (TokenKind::OpenTag, self.input.consume(5))
162 } else if self.input.is_at(b"<?=", false) {
163 (TokenKind::EchoTag, self.input.consume(3))
164 } else {
165 (TokenKind::ShortOpenTag, self.input.consume(2))
166 };
167
168 let end = self.input.position();
169 let tag = self.token(kind, buffer, start, end);
170
171 self.mode = LexerMode::Script;
172
173 return tag;
174 }
175
176 if self.input.is_at(b"#!", true) {
177 let buffer = self.input.consume_through(b'\n');
178 let end = self.input.position();
179
180 self.token(TokenKind::InlineShebang, buffer, start, end)
181 } else {
182 let buffer = self.input.consume_until(b"<?", false);
183 let end = self.input.position();
184
185 self.token(TokenKind::InlineText, buffer, start, end)
186 }
187 }
188 LexerMode::Script => {
189 let whitespaces = self.input.consume_whitespaces();
190 if !whitespaces.is_empty() {
191 let start = self.input.position();
192 let buffer = whitespaces;
193 let end = self.input.position();
194
195 return self.token(TokenKind::Whitespace, buffer, start, end);
196 }
197
198 let mut document_label: &[u8] = &[];
199
200 let (token_kind, len) = match self.input.read(3) {
201 [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
202 [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
203 [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
204 [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
205 [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
206 [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
207 [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
208 [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
209 [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
210 [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
211 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
212
213 document_label = self.input.peek(3 + whitespaces, label_length);
214
215 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
216 }
217 [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
218 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
219
220 document_label = self.input.peek(4 + whitespaces, label_length);
221
222 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
223 }
224 [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
225 let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
226
227 document_label = self.input.peek(4 + whitespaces, label_length);
228
229 (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
230 }
231 [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
232 [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
233 [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
234 [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
235 [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
236 [b'?', b':', ..] => (TokenKind::QuestionColon, 2),
237 [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
238 [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
239 [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
240 [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
241 [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
242 [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
243 [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
244 [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
245 [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
246 [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
247 [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
248 [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
249 [b'>', b'>', ..] => (TokenKind::RightShift, 2),
250 [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
251 [b':', b':', ..] => (TokenKind::ColonColon, 2),
252 [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
253 [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
254 [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
255 [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
256 [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
257 [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
258 [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
259 [b'/', b'/', ..] => {
260 let mut length = 2;
261 loop {
262 match self.input.peek(length, 3) {
263 [b'\n' | b'\r', ..] => {
264 break;
265 }
266 [w, b'?', b'>'] if w.is_ascii_whitespace() => {
267 break;
268 }
269 [b'?', b'>', ..] | [] => {
270 break;
271 }
272 [_, ..] => {
273 length += 1;
274 }
275 }
276 }
277
278 (TokenKind::SingleLineComment, length)
279 }
280 [b'/', b'*', asterisk] => {
281 let mut length = 2;
282 let mut is_multiline = false;
283 let mut terminated = false;
284 loop {
285 match self.input.peek(length, 2) {
286 [b'*', b'/'] => {
287 if length == 2 {
288 is_multiline = true;
289 }
290
291 length += 2;
292
293 terminated = true;
294 break;
295 }
296 [_, ..] => {
297 length += 1;
298 }
299 [] => {
300 break;
301 }
302 }
303 }
304
305 if !terminated {
306 self.input.consume(length);
307
308 return Some(Err(SyntaxError::UnexpectedEndOfFile(self.input.position())));
309 }
310
311 if !is_multiline && asterisk == &b'*' {
312 (TokenKind::DocBlockComment, length)
313 } else {
314 (TokenKind::MultiLineComment, length)
315 }
316 }
317 [b'\\', start_of_identifier!(), ..] => {
318 let mut length = 2;
319 let mut last_was_slash = false;
320 loop {
321 match self.input.peek(length, 1) {
322 [start_of_identifier!(), ..] if last_was_slash => {
323 length += 1;
324 last_was_slash = false;
325 }
326 [part_of_identifier!(), ..] if !last_was_slash => {
327 length += 1;
328 }
329 [b'\\', ..] => {
330 if last_was_slash {
331 length -= 1;
332
333 break;
334 }
335
336 length += 1;
337 last_was_slash = true;
338 }
339 _ => {
340 break;
341 }
342 }
343 }
344
345 (TokenKind::FullyQualifiedIdentifier, length)
346 }
347 [b'$', start_of_identifier!(), ..] => {
348 let mut length = 2;
349 while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
350 length += 1;
351 }
352
353 (TokenKind::Variable, length)
354 }
355 [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
356 [b'$', ..] => (TokenKind::Dollar, 1),
357 [b'@', ..] => (TokenKind::At, 1),
358 [b'!', ..] => (TokenKind::Bang, 1),
359 [b'&', ..] => (TokenKind::Ampersand, 1),
360 [b'?', ..] => (TokenKind::Question, 1),
361 [b'=', ..] => (TokenKind::Equal, 1),
362 [b'`', ..] => (TokenKind::Backtick, 1),
363 [b')', ..] => (TokenKind::RightParenthesis, 1),
364 [b';', ..] => (TokenKind::Semicolon, 1),
365 [b'+', ..] => (TokenKind::Plus, 1),
366 [b'%', ..] => (TokenKind::Percent, 1),
367 [b'-', ..] => (TokenKind::Minus, 1),
368 [b'<', ..] => (TokenKind::LessThan, 1),
369 [b'>', ..] => (TokenKind::GreaterThan, 1),
370 [b',', ..] => (TokenKind::Comma, 1),
371 [b'[', ..] => (TokenKind::LeftBracket, 1),
372 [b']', ..] => (TokenKind::RightBracket, 1),
373 [b'{', ..] => (TokenKind::LeftBrace, 1),
374 [b'}', ..] => (TokenKind::RightBrace, 1),
375 [b':', ..] => (TokenKind::Colon, 1),
376 [b'~', ..] => (TokenKind::Tilde, 1),
377 [b'|', ..] => (TokenKind::Pipe, 1),
378 [b'^', ..] => (TokenKind::Caret, 1),
379 [b'*', ..] => (TokenKind::Asterisk, 1),
380 [b'/', ..] => (TokenKind::Slash, 1),
381 [quote @ b'\'', ..] => read_literal_string(&self.input, quote),
382 [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
383 read_literal_string(&self.input, quote)
384 }
385 [b'"', ..] => (TokenKind::DoubleQuote, 1),
386 [b'(', ..] => 'parenthesis: {
387 for (value, kind) in internal::consts::CAST_TYPES {
388 if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
389 break 'parenthesis (kind, length);
390 }
391 }
392
393 (TokenKind::LeftParenthesis, 1)
394 }
395 [b'#', ..] => {
396 let mut length = 1;
397 loop {
398 match self.input.peek(length, 3) {
399 [b'\n' | b'\r', ..] => {
400 break;
401 }
402 [w, b'?', b'>'] if w.is_ascii_whitespace() => {
403 break;
404 }
405 [b'?', b'>', ..] | [] => {
406 break;
407 }
408 [_, ..] => {
409 length += 1;
410 }
411 }
412 }
413
414 (TokenKind::HashComment, length)
415 }
416 [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
417 [start_of_identifier!(), ..] => 'identifier: {
418 let mut length = 1;
419 let mut ended_with_slash = false;
420 loop {
421 match self.input.peek(length, 2) {
422 [part_of_identifier!(), ..] => {
423 length += 1;
424 }
425 [b'\\', start_of_identifier!(), ..] => {
426 ended_with_slash = true;
427 break;
428 }
429 [b'(', ..] if length == 7 => {
431 if self.input.is_at(b"private(set)", true) {
432 break 'identifier (TokenKind::PrivateSet, 7 + 5);
433 }
434
435 break;
436 }
437 [b'(', ..] if length == 6 => {
439 if self.input.is_at(b"public(set)", true) {
440 break 'identifier (TokenKind::PublicSet, 6 + 5);
441 }
442
443 break;
444 }
445 [b'(', ..] if length == 9 => {
447 if self.input.is_at(b"protected(set)", true) {
448 break 'identifier (TokenKind::ProtectedSet, 9 + 5);
449 }
450
451 break;
452 }
453 _ => {
454 break;
455 }
456 }
457 }
458
459 if !ended_with_slash {
460 for (value, kind) in internal::consts::KEYWORD_TYPES {
461 if value.len() != length {
462 continue;
463 }
464
465 if self.input.is_at(value, true) {
466 break 'identifier (kind, value.len());
467 }
468 }
469 }
470
471 let mut slashes = 0;
472 let mut last_was_slash = false;
473 loop {
474 match self.input.peek(length, 1) {
475 [start_of_identifier!(), ..] if last_was_slash => {
476 length += 1;
477 last_was_slash = false;
478 }
479 [part_of_identifier!(), ..] if !last_was_slash => {
480 length += 1;
481 }
482 [b'\\', ..] if !self.interpolating => {
483 if !last_was_slash {
484 length += 1;
485 slashes += 1;
486 last_was_slash = true;
487 } else {
488 length -= 1;
489 slashes -= 1;
490 last_was_slash = false;
491
492 break;
493 }
494 }
495 _ => {
496 break;
497 }
498 }
499 }
500
501 if last_was_slash {
502 length -= 1;
503 slashes -= 1;
504 }
505
506 if slashes > 0 {
507 (TokenKind::QualifiedIdentifier, length)
508 } else {
509 (TokenKind::Identifier, length)
510 }
511 }
512 [b'.', start_of_number!(), ..] => {
513 let mut length = read_digits_of_base(&self.input, 2, 10);
514 if let float_exponent!() = self.input.peek(length, 1) {
515 length += 1;
516 if let number_sign!() = self.input.peek(length, 1) {
517 length += 1;
518 }
519
520 length = read_digits_of_base(&self.input, length, 10);
521 }
522
523 (TokenKind::LiteralFloat, length)
524 }
525 [start_of_number!(), ..] => 'number: {
526 let mut length = 1;
527
528 let (base, kind): (u8, NumberKind) = match self.input.read(3) {
529 start_of_binary_number!() => {
530 length += 1;
531
532 (2, NumberKind::Integer)
533 }
534 start_of_octal_number!() => {
535 length += 1;
536
537 (8, NumberKind::Integer)
538 }
539 start_of_hexadecimal_number!() => {
540 length += 1;
541
542 (16, NumberKind::Integer)
543 }
544 start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
545 start_of_float_number!() => (10, NumberKind::Float),
546 _ => (10, NumberKind::IntegerOrFloat),
547 };
548
549 if kind != NumberKind::Float {
550 length = read_digits_of_base(&self.input, length, base);
551
552 if kind == NumberKind::Integer {
553 break 'number (TokenKind::LiteralInteger, length);
554 }
555 }
556
557 let is_float = matches!(self.input.peek(length, 3), float_separator!());
558
559 if !is_float {
560 break 'number (TokenKind::LiteralInteger, length);
561 }
562
563 if let [b'.'] = self.input.peek(length, 1) {
564 length += 1;
565 length = read_digits_of_base(&self.input, length, 10);
566 }
567
568 if let float_exponent!() = self.input.peek(length, 1) {
569 length += 1;
570 if let number_sign!() = self.input.peek(length, 1) {
571 length += 1;
572 }
573
574 length = read_digits_of_base(&self.input, length, 10);
575 }
576
577 (TokenKind::LiteralFloat, length)
578 }
579 [b'.', ..] => (TokenKind::Dot, 1),
580 [unknown_byte, ..] => {
581 return Some(Err(SyntaxError::UnrecognizedToken(*unknown_byte, self.input.position())));
582 }
583 [] => {
584 unreachable!()
587 }
588 };
589
590 self.mode = match token_kind {
591 TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
592 TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
593 TokenKind::CloseTag => LexerMode::Inline,
594 TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
595 TokenKind::DocumentStart(document_kind) => {
596 LexerMode::DocumentString(document_kind, document_label, Interpolation::None)
597 }
598 _ => LexerMode::Script,
599 };
600
601 let start = self.input.position();
602 let buffer = self.input.consume(len);
603 let end = self.input.position();
604
605 self.token(token_kind, buffer, start, end)
606 }
607 LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
608 Interpolation::None => {
609 let start = self.input.position();
610
611 let mut length = 0;
612 let mut last_was_slash = false;
613 let mut token_kind = TokenKind::StringPart;
614 loop {
615 match self.input.peek(length, 2) {
616 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
617 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
618
619 self.mode =
620 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
621
622 break;
623 }
624 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
625 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
626
627 self.mode =
628 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
629
630 break;
631 }
632 [b'\\', ..] => {
633 length += 1;
634
635 last_was_slash = !last_was_slash;
636 }
637 [b'"', ..] if !last_was_slash => {
638 if length == 0 {
639 length += 1;
640 token_kind = TokenKind::DoubleQuote;
641
642 break;
643 }
644
645 break;
646 }
647 [_, ..] => {
648 length += 1;
649 last_was_slash = false;
650 }
651 [] => {
652 break;
653 }
654 }
655 }
656
657 let buffer = self.input.consume(length);
658 let end = self.input.position();
659
660 if TokenKind::DoubleQuote == token_kind {
661 self.mode = LexerMode::Script;
662 }
663
664 self.token(token_kind, buffer, start, end)
665 }
666 Interpolation::Until(offset) => {
667 self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None))
668 }
669 },
670 LexerMode::ShellExecuteString(interpolation) => match &interpolation {
671 Interpolation::None => {
672 let start = self.input.position();
673
674 let mut length = 0;
675 let mut last_was_slash = false;
676 let mut token_kind = TokenKind::StringPart;
677 loop {
678 match self.input.peek(length, 2) {
679 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
680 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
681
682 self.mode =
683 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
684
685 break;
686 }
687 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
688 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
689
690 self.mode =
691 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
692
693 break;
694 }
695 [b'\\', ..] => {
696 length += 1;
697 last_was_slash = true;
698 }
699 [b'`', ..] if !last_was_slash => {
700 if length == 0 {
701 length += 1;
702 token_kind = TokenKind::Backtick;
703
704 break;
705 }
706
707 break;
708 }
709 [_, ..] => {
710 length += 1;
711 last_was_slash = false;
712 }
713 [] => {
714 break;
715 }
716 }
717 }
718
719 let buffer = self.input.consume(length);
720 let end = self.input.position();
721
722 if TokenKind::Backtick == token_kind {
723 self.mode = LexerMode::Script;
724 }
725
726 self.token(token_kind, buffer, start, end)
727 }
728 Interpolation::Until(offset) => {
729 self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None))
730 }
731 },
732 LexerMode::DocumentString(kind, label, interpolation) => match &kind {
733 DocumentKind::Heredoc => match &interpolation {
734 Interpolation::None => {
735 let start = self.input.position();
736
737 let mut length = 0;
738 let mut last_was_slash = false;
739 let mut only_whitespaces = true;
740 let mut token_kind = TokenKind::StringPart;
741 loop {
742 match self.input.peek(length, 2) {
743 [b'\n', ..] => {
744 length += 1;
745
746 break;
747 }
748 [byte, ..] if byte.is_ascii_whitespace() => {
749 length += 1;
750 }
751 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
752 let until_offset =
753 read_until_end_of_variable_interpolation(&self.input, length + 2);
754
755 self.mode = LexerMode::DocumentString(
756 kind,
757 label,
758 Interpolation::Until(start.offset + until_offset),
759 );
760
761 break;
762 }
763 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
764 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
765
766 self.mode = LexerMode::DocumentString(
767 kind,
768 label,
769 Interpolation::Until(start.offset + until_offset),
770 );
771
772 break;
773 }
774 [b'\\', ..] => {
775 length += 1;
776 last_was_slash = true;
777 only_whitespaces = false;
778 }
779 [_, ..] => {
780 if only_whitespaces
781 && self.input.peek(length, label.len()) == label
782 && self
783 .input
784 .peek(length + label.len(), 1)
785 .first()
786 .is_none_or(|c| !c.is_ascii_alphanumeric())
787 {
788 length += label.len();
789 token_kind = TokenKind::DocumentEnd;
790
791 break;
792 }
793
794 length += 1;
795 last_was_slash = false;
796 only_whitespaces = false;
797 }
798 [] => {
799 break;
800 }
801 }
802 }
803
804 let buffer = self.input.consume(length);
805 let end = self.input.position();
806
807 if TokenKind::DocumentEnd == token_kind {
808 self.mode = LexerMode::Script;
809 }
810
811 self.token(token_kind, buffer, start, end)
812 }
813 Interpolation::Until(offset) => {
814 self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None))
815 }
816 },
817 DocumentKind::Nowdoc => {
818 let start = self.input.position();
819
820 let mut length = 0;
821 let mut terminated = false;
822 let mut only_whitespaces = true;
823
824 loop {
825 match self.input.peek(length, 1) {
826 [b'\n', ..] => {
827 length += 1;
828
829 break;
830 }
831 [byte, ..] if byte.is_ascii_whitespace() => {
832 length += 1;
833 }
834 [_, ..] => {
835 if only_whitespaces
836 && self.input.peek(length, label.len()) == label
837 && self
838 .input
839 .peek(length + label.len(), 1)
840 .first()
841 .is_none_or(|c| !c.is_ascii_alphanumeric())
842 {
843 length += label.len();
844 terminated = true;
845
846 break;
847 }
848
849 only_whitespaces = false;
850 length += 1;
851 }
852 [] => {
853 break;
854 }
855 }
856 }
857
858 let buffer = self.input.consume(length);
859 let end = self.input.position();
860
861 if terminated {
862 self.mode = LexerMode::Script;
863
864 return self.token(TokenKind::DocumentEnd, buffer, start, end);
865 }
866
867 self.token(TokenKind::StringPart, buffer, start, end)
868 }
869 },
870 LexerMode::Halt(stage) => 'halt: {
871 let start = self.input.position();
872 if let HaltStage::End = stage {
873 let buffer = self.input.consume_remaining();
874 let end = self.input.position();
875
876 break 'halt self.token(TokenKind::InlineText, buffer, start, end);
877 }
878
879 let whitespaces = self.input.consume_whitespaces();
880 if !whitespaces.is_empty() {
881 let end = self.input.position();
882
883 break 'halt self.token(TokenKind::Whitespace, whitespaces, start, end);
884 }
885
886 match &stage {
887 HaltStage::LookingForLeftParenthesis => {
888 if self.input.is_at(b"(", false) {
889 let buffer = self.input.consume(1);
890 let end = self.input.position();
891
892 self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
893
894 self.token(TokenKind::LeftParenthesis, buffer, start, end)
895 } else {
896 Some(Err(SyntaxError::UnexpectedToken(self.input.read(1)[0], self.input.position())))
897 }
898 }
899 HaltStage::LookingForRightParenthesis => {
900 if self.input.is_at(b")", false) {
901 let buffer = self.input.consume(1);
902 let end = self.input.position();
903
904 self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
905
906 self.token(TokenKind::RightParenthesis, buffer, start, end)
907 } else {
908 Some(Err(SyntaxError::UnexpectedToken(self.input.read(1)[0], self.input.position())))
909 }
910 }
911 HaltStage::LookingForTerminator => {
912 if self.input.is_at(b";", false) {
913 let buffer = self.input.consume(1);
914 let end = self.input.position();
915
916 self.mode = LexerMode::Halt(HaltStage::End);
917
918 self.token(TokenKind::Semicolon, buffer, start, end)
919 } else if self.input.is_at(b"?>", false) {
920 let buffer = self.input.consume(2);
921 let end = self.input.position();
922
923 self.mode = LexerMode::Halt(HaltStage::End);
924
925 self.token(TokenKind::CloseTag, buffer, start, end)
926 } else {
927 return Some(Err(SyntaxError::UnexpectedToken(
928 self.input.read(1)[0],
929 self.input.position(),
930 )));
931 }
932 }
933 _ => unreachable!(),
934 }
935 }
936 }
937 }
938
939 #[inline]
940 fn token(
941 &mut self,
942 kind: TokenKind,
943 value: &[u8],
944 from: Position,
945 to: Position,
946 ) -> Option<Result<Token, SyntaxError>> {
947 Some(Ok(Token { kind, value: self.interner.intern(String::from_utf8_lossy(value)), span: Span::new(from, to) }))
948 }
949
950 #[inline]
951 fn interpolation(&mut self, until: usize, next_mode: LexerMode<'a>) -> Option<Result<Token, SyntaxError>> {
952 let mut mode = LexerMode::Script;
953
954 mem::swap(&mut self.mode, &mut mode);
955 self.interpolating = true;
956
957 let result = self.advance();
958
959 mem::swap(&mut self.mode, &mut mode);
960 self.interpolating = false;
961
962 match result {
963 Some(Ok(token)) if token.span.has_offset(until) => {
964 self.mode = next_mode;
965 }
966 _ => {}
967 }
968
969 result
970 }
971}
972
/// Returns `true` when `byte` may begin a PHP label (identifier, variable name,
/// heredoc/nowdoc label).
///
/// PHP labels match `[a-zA-Z_\x80-\xff][a-zA-Z0-9_\x80-\xff]*`, so besides ASCII
/// letters and `_` every byte in `0x80..=0xFF` is accepted as a first byte, the same
/// high-byte range that `is_part_of_identifier` accepts for the following bytes.
#[inline]
fn is_start_of_identifier(byte: u8) -> bool {
    matches!(byte, b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF)
}
977
/// Returns `true` when `byte` may appear after the first byte of a label:
/// an ASCII letter or digit, `_`, or any byte in `0x80..=0xFF`.
#[inline]
fn is_part_of_identifier(byte: u8) -> bool {
    matches!(byte, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF)
}
982
983#[inline]
984fn matches_start_of_heredoc_document(input: &Input) -> bool {
985 let bytes = input.bytes;
987 let total = input.length;
988 let base = input.position.offset;
989
990 let mut length = 3;
992 while base + length < total && bytes[base + length].is_ascii_whitespace() {
994 length += 1;
995 }
996
997 if base + length >= total || !is_start_of_identifier(bytes[base + length]) {
999 return false;
1000 }
1001 length += 1; loop {
1005 let pos = base + length;
1006 if pos >= total {
1007 return false; }
1009 if bytes[pos] == b'\n' {
1010 return true; } else if is_part_of_identifier(bytes[pos]) {
1012 length += 1;
1013 } else {
1014 return false; }
1016 }
1017}
1018
1019#[inline]
1020fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1021 let bytes = input.bytes;
1022 let total = input.length;
1023 let base = input.position.offset;
1024
1025 let mut length = 3;
1027 while base + length < total && bytes[base + length].is_ascii_whitespace() {
1028 length += 1;
1029 }
1030
1031 if base + length >= total || bytes[base + length] != b'"' {
1033 return false;
1034 }
1035 length += 1;
1036
1037 if base + length >= total || !is_start_of_identifier(bytes[base + length]) {
1039 return false;
1040 }
1041 length += 1;
1042
1043 let mut terminated = false;
1045 loop {
1046 let pos = base + length;
1047 if pos >= total {
1048 return false;
1049 }
1050 let byte = bytes[pos];
1051 if byte == b'\n' {
1052 return terminated;
1054 } else if !terminated && is_part_of_identifier(byte) {
1055 length += 1;
1056 } else if !terminated && byte == b'"' {
1057 terminated = true;
1058 length += 1;
1059 } else {
1060 return false;
1061 }
1062 }
1063}
1064
1065#[inline]
1066fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1067 let bytes = input.bytes;
1068 let total = input.length;
1069 let base = input.position.offset;
1070
1071 let mut length = 3;
1073 while base + length < total && bytes[base + length].is_ascii_whitespace() {
1074 length += 1;
1075 }
1076
1077 if base + length >= total || bytes[base + length] != b'\'' {
1079 return false;
1080 }
1081 length += 1;
1082
1083 if base + length >= total || !is_start_of_identifier(bytes[base + length]) {
1085 return false;
1086 }
1087 length += 1;
1088
1089 let mut terminated = false;
1091 loop {
1092 let pos = base + length;
1093 if pos >= total {
1094 return false;
1095 }
1096 let byte = bytes[pos];
1097 if byte == b'\n' {
1098 return terminated;
1099 } else if !terminated && is_part_of_identifier(byte) {
1100 length += 1;
1101 } else if !terminated && byte == b'\'' {
1102 terminated = true;
1103 length += 1;
1104 } else {
1105 return false;
1106 }
1107 }
1108}
1109
1110#[inline]
1111fn matches_literal_double_quote_string(input: &Input) -> bool {
1112 let bytes = input.bytes;
1113 let total = input.length;
1114 let base = input.position.offset;
1115
1116 let mut pos = base + 1;
1118 loop {
1119 if pos >= total {
1120 return true;
1122 }
1123 let byte = bytes[pos];
1124 if byte == b'"' {
1125 return true;
1127 } else if byte == b'\\' {
1128 pos += 2;
1130 continue;
1131 } else {
1132 if pos + 1 < total {
1135 let next = bytes[pos + 1];
1136 if (byte == b'$' && (is_start_of_identifier(next) || next == b'{')) || (byte == b'{' && next == b'$') {
1137 return false;
1138 }
1139 }
1140 pos += 1;
1141 }
1142 }
1143}
1144
1145#[inline]
1146fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1147 let bytes = input.bytes;
1149 let total = input.length;
1150 let base = input.position.offset;
1151
1152 let mut pos = base + 3;
1155 let mut whitespaces = 0;
1156 while pos < total && bytes[pos].is_ascii_whitespace() {
1157 whitespaces += 1;
1158 pos += 1;
1159 }
1160
1161 let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1167
1168 let mut label_length = 1; let mut terminated = false; loop {
1172 let pos = base + length;
1173 if pos >= total {
1175 unreachable!("Unexpected end of input while reading heredoc label");
1176 }
1177
1178 let byte = bytes[pos];
1179
1180 if byte == b'\n' {
1181 length += 1;
1183 return (length, whitespaces, label_length);
1184 } else if is_part_of_identifier(byte) && (!double_quoted || !terminated) {
1185 length += 1;
1188 label_length += 1;
1189 } else if double_quoted && !terminated && byte == b'"' {
1190 length += 1;
1192 terminated = true;
1193 } else {
1194 unreachable!("Unexpected character encountered in heredoc label");
1195 }
1196 }
1197}
1198
1199#[inline]
1200fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1201 let bytes = input.bytes;
1203 let total = input.length;
1204 let base = input.position.offset;
1205
1206 let mut pos = base + 3;
1208 let mut whitespaces = 0;
1209 while pos < total && bytes[pos].is_ascii_whitespace() {
1210 whitespaces += 1;
1211 pos += 1;
1212 }
1213
1214 let mut length = 3 + whitespaces + 2;
1217
1218 let mut label_length = 1;
1220 let mut terminated = false;
1221 loop {
1222 let pos = base + length;
1223 if pos >= total {
1224 unreachable!("Unexpected end of input while reading nowdoc label");
1225 }
1226 let byte = bytes[pos];
1227
1228 if byte == b'\n' {
1229 length += 1;
1231 return (length, whitespaces, label_length);
1232 } else if is_part_of_identifier(byte) && !terminated {
1233 length += 1;
1235 label_length += 1;
1236 } else if !terminated && byte == b'\'' {
1237 length += 1;
1239 terminated = true;
1240 } else {
1241 unreachable!("Unexpected character encountered in nowdoc label");
1242 }
1243 }
1244}
1245
1246#[inline]
1247fn read_literal_string(input: &Input, quote: &u8) -> (TokenKind, usize) {
1248 let bytes = input.bytes;
1249 let total = input.length;
1250 let start = input.position.offset; let mut length = 1; let mut last_was_backslash = false;
1253 let mut partial = false;
1254
1255 loop {
1256 let pos = start + length;
1257 if pos >= total {
1258 partial = true;
1260 break;
1261 }
1262
1263 let byte = bytes[pos];
1264 if byte == b'\\' {
1265 last_was_backslash = !last_was_backslash;
1267 length += 1;
1268 } else {
1269 if byte == *quote && !last_was_backslash {
1271 length += 1; break;
1273 }
1274 length += 1;
1275 last_was_backslash = false;
1276 }
1277 }
1278
1279 if partial { (TokenKind::PartialLiteralString, length) } else { (TokenKind::LiteralString, length) }
1280}
1281
1282#[inline]
1283fn read_digits_of_base(input: &Input, offset: usize, base: u8) -> usize {
1284 if base == 16 {
1285 read_digits_with(input, offset, u8::is_ascii_hexdigit)
1286 } else {
1287 let max = b'0' + base;
1288
1289 read_digits_with(input, offset, |b| b >= &b'0' && b < &max)
1290 }
1291}
1292
1293#[inline]
1294fn read_digits_with<F: Fn(&u8) -> bool>(input: &Input, offset: usize, is_digit: F) -> usize {
1295 let bytes = input.bytes;
1296 let total = input.length;
1297 let start = input.position.offset;
1298 let mut pos = start + offset; while pos < total {
1301 let current = bytes[pos];
1302 if is_digit(¤t) {
1303 pos += 1;
1304 } else if pos + 1 < total && bytes[pos] == number_separator!() && is_digit(&bytes[pos + 1]) {
1305 pos += 2; } else {
1307 break;
1308 }
1309 }
1310
1311 pos - start
1313}
1314#[inline]
1315fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> usize {
1316 let bytes = input.bytes;
1318 let total = input.length;
1319 let base = input.position.offset;
1320 let mut offset = from;
1322
1323 loop {
1324 let abs = base + offset;
1325 if abs >= total {
1326 break;
1328 }
1329
1330 if is_part_of_identifier(bytes[abs]) {
1332 offset += 1;
1333 continue;
1334 }
1335
1336 if bytes[abs] == b'[' {
1338 offset += 1;
1339 let mut nesting = 0;
1340 loop {
1341 let abs_inner = base + offset;
1342 if abs_inner >= total {
1343 break;
1344 }
1345 let b = bytes[abs_inner];
1346 if b == b']' {
1347 offset += 1;
1348 if nesting == 0 {
1349 break;
1350 } else {
1351 nesting -= 1;
1352 }
1353 } else if b == b'[' {
1354 offset += 1;
1355 nesting += 1;
1356 } else if b.is_ascii_whitespace() {
1357 break;
1359 } else {
1360 offset += 1;
1361 }
1362 }
1363 break;
1365 }
1366
1367 if base + offset + 2 < total
1369 && bytes[abs] == b'-'
1370 && bytes[base + offset + 1] == b'>'
1371 && is_start_of_identifier(bytes[base + offset + 2])
1372 {
1373 offset += 3;
1374 while base + offset < total && is_part_of_identifier(bytes[base + offset]) {
1376 offset += 1;
1377 }
1378 break;
1379 }
1380
1381 if base + offset + 3 < total
1383 && bytes[abs] == b'?'
1384 && bytes[base + offset + 1] == b'-'
1385 && bytes[base + offset + 2] == b'>'
1386 && is_start_of_identifier(bytes[base + offset + 3])
1387 {
1388 offset += 4;
1389 while base + offset < total && is_part_of_identifier(bytes[base + offset]) {
1390 offset += 1;
1391 }
1392 break;
1393 }
1394
1395 break;
1397 }
1398
1399 offset
1400}
1401
1402#[inline]
1403fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> usize {
1404 let bytes = input.bytes;
1405 let total = input.length;
1406 let base = input.position.offset;
1407 let mut offset = from;
1408 let mut nesting = 0;
1409
1410 loop {
1411 let abs = base + offset;
1412 if abs >= total {
1413 break;
1414 }
1415 match bytes[abs] {
1416 b'}' => {
1417 offset += 1;
1418 if nesting == 0 {
1419 break;
1420 } else {
1421 nesting -= 1;
1422 }
1423 }
1424 b'{' => {
1425 offset += 1;
1426 nesting += 1;
1427 }
1428 _ => {
1429 offset += 1;
1430 }
1431 }
1432 }
1433
1434 offset
1435}