1use std::fmt::Debug;
2use std::mem;
3
4use mago_interner::ThreadedInterner;
5use mago_span::Position;
6use mago_span::Span;
7
8use mago_syntax_core::input::Input;
9use mago_syntax_core::utils::is_part_of_identifier;
10use mago_syntax_core::utils::is_start_of_identifier;
11use mago_syntax_core::utils::read_digits_of_base;
12use mago_syntax_core::*;
13
14use crate::error::SyntaxError;
15use crate::lexer::internal::mode::HaltStage;
16use crate::lexer::internal::mode::Interpolation;
17use crate::lexer::internal::mode::LexerMode;
18use crate::lexer::internal::utils::NumberKind;
19use crate::token::DocumentKind;
20use crate::token::Token;
21use crate::token::TokenKind;
22
23mod internal;
24
/// A mode-driven lexer that turns an [`Input`] byte stream into [`Token`]s.
///
/// Tokens are produced one at a time by [`Lexer::advance`]. How the next
/// token is read depends on the current [`LexerMode`] (inline text, script,
/// one of the string modes, or the halt-compiler mode).
#[derive(Debug)]
pub struct Lexer<'a, 'i> {
    /// Interner used to intern the text of every token that is produced.
    interner: &'i ThreadedInterner,
    /// The byte input being tokenized.
    input: Input<'a>,
    /// The current lexing mode; selects how `advance` reads the next token.
    mode: LexerMode<'a>,
    /// Set to `true` only while a token inside a string interpolation is
    /// being lexed; in that state a `\` does not extend an identifier into a
    /// qualified identifier.
    interpolating: bool,
}
43
44impl<'a, 'i> Lexer<'a, 'i> {
45 pub fn new(interner: &'i ThreadedInterner, input: Input<'a>) -> Lexer<'a, 'i> {
56 Lexer { interner, input, mode: LexerMode::Inline, interpolating: false }
57 }
58
59 pub fn scripting(interner: &'i ThreadedInterner, input: Input<'a>) -> Lexer<'a, 'i> {
70 Lexer { interner, input, mode: LexerMode::Script, interpolating: false }
71 }
72
    /// Returns `true` once the underlying input reports that it has reached
    /// its end. When this is `true`, [`Lexer::advance`] returns `None`.
    pub fn has_reached_eof(&self) -> bool {
        self.input.has_reached_eof()
    }
79
    /// Returns the current position of the underlying input, i.e. the
    /// position at which the next token will start.
    pub fn get_position(&self) -> Position {
        self.input.current_position()
    }
84
85 #[inline]
141 pub fn advance(&mut self) -> Option<Result<Token, SyntaxError>> {
142 if self.input.has_reached_eof() {
143 return None;
144 }
145
146 match self.mode {
147 LexerMode::Inline => {
148 let start = self.input.current_position();
149 if self.input.is_at(b"<?", false) {
150 let (kind, buffer) = if self.input.is_at(b"<?php", true) {
151 (TokenKind::OpenTag, self.input.consume(5))
152 } else if self.input.is_at(b"<?=", false) {
153 (TokenKind::EchoTag, self.input.consume(3))
154 } else {
155 (TokenKind::ShortOpenTag, self.input.consume(2))
156 };
157
158 let end = self.input.current_position();
159 let tag = self.token(kind, buffer, start, end);
160
161 self.mode = LexerMode::Script;
162
163 return tag;
164 }
165
166 if self.input.is_at(b"#!", true) {
167 let buffer = self.input.consume_through(b'\n');
168 let end = self.input.current_position();
169
170 self.token(TokenKind::InlineShebang, buffer, start, end)
171 } else {
172 let buffer = self.input.consume_until(b"<?", false);
173 let end = self.input.current_position();
174
175 self.token(TokenKind::InlineText, buffer, start, end)
176 }
177 }
178 LexerMode::Script => {
179 let whitespaces = self.input.consume_whitespaces();
180 if !whitespaces.is_empty() {
181 let start = self.input.current_position();
182 let buffer = whitespaces;
183 let end = self.input.current_position();
184
185 return self.token(TokenKind::Whitespace, buffer, start, end);
186 }
187
188 let mut document_label: &[u8] = &[];
189
190 let (token_kind, len) = match self.input.read(3) {
191 [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
192 [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
193 [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
194 [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
195 [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
196 [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
197 [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
198 [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
199 [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
200 [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
201 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
202
203 document_label = self.input.peek(3 + whitespaces, label_length);
204
205 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
206 }
207 [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
208 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
209
210 document_label = self.input.peek(4 + whitespaces, label_length);
211
212 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
213 }
214 [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
215 let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
216
217 document_label = self.input.peek(4 + whitespaces, label_length);
218
219 (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
220 }
221 [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
222 [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
223 [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
224 [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
225 [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
226 [b'?', b':', ..] => (TokenKind::QuestionColon, 2),
227 [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
228 [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
229 [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
230 [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
231 [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
232 [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
233 [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
234 [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
235 [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
236 [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
237 [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
238 [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
239 [b'>', b'>', ..] => (TokenKind::RightShift, 2),
240 [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
241 [b':', b':', ..] => (TokenKind::ColonColon, 2),
242 [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
243 [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
244 [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
245 [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
246 [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
247 [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
248 [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
249 [b'/', b'/', ..] => {
250 let mut length = 2;
251 loop {
252 match self.input.peek(length, 3) {
253 [b'\n' | b'\r', ..] => {
254 break;
255 }
256 [w, b'?', b'>'] if w.is_ascii_whitespace() => {
257 break;
258 }
259 [b'?', b'>', ..] | [] => {
260 break;
261 }
262 [_, ..] => {
263 length += 1;
264 }
265 }
266 }
267
268 (TokenKind::SingleLineComment, length)
269 }
270 [b'/', b'*', asterisk] => {
271 let mut length = 2;
272 let mut is_multiline = false;
273 let mut terminated = false;
274 loop {
275 match self.input.peek(length, 2) {
276 [b'*', b'/'] => {
277 if length == 2 {
278 is_multiline = true;
279 }
280
281 length += 2;
282
283 terminated = true;
284 break;
285 }
286 [_, ..] => {
287 length += 1;
288 }
289 [] => {
290 break;
291 }
292 }
293 }
294
295 if !terminated {
296 self.input.consume(length);
297
298 return Some(Err(SyntaxError::UnexpectedEndOfFile(self.input.current_position())));
299 }
300
301 if !is_multiline && asterisk == &b'*' {
302 (TokenKind::DocBlockComment, length)
303 } else {
304 (TokenKind::MultiLineComment, length)
305 }
306 }
307 [b'\\', start_of_identifier!(), ..] => {
308 let mut length = 2;
309 let mut last_was_slash = false;
310 loop {
311 match self.input.peek(length, 1) {
312 [start_of_identifier!(), ..] if last_was_slash => {
313 length += 1;
314 last_was_slash = false;
315 }
316 [part_of_identifier!(), ..] if !last_was_slash => {
317 length += 1;
318 }
319 [b'\\', ..] => {
320 if last_was_slash {
321 length -= 1;
322
323 break;
324 }
325
326 length += 1;
327 last_was_slash = true;
328 }
329 _ => {
330 break;
331 }
332 }
333 }
334
335 (TokenKind::FullyQualifiedIdentifier, length)
336 }
337 [b'$', start_of_identifier!(), ..] => {
338 let mut length = 2;
339 while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
340 length += 1;
341 }
342
343 (TokenKind::Variable, length)
344 }
345 [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
346 [b'$', ..] => (TokenKind::Dollar, 1),
347 [b'@', ..] => (TokenKind::At, 1),
348 [b'!', ..] => (TokenKind::Bang, 1),
349 [b'&', ..] => (TokenKind::Ampersand, 1),
350 [b'?', ..] => (TokenKind::Question, 1),
351 [b'=', ..] => (TokenKind::Equal, 1),
352 [b'`', ..] => (TokenKind::Backtick, 1),
353 [b')', ..] => (TokenKind::RightParenthesis, 1),
354 [b';', ..] => (TokenKind::Semicolon, 1),
355 [b'+', ..] => (TokenKind::Plus, 1),
356 [b'%', ..] => (TokenKind::Percent, 1),
357 [b'-', ..] => (TokenKind::Minus, 1),
358 [b'<', ..] => (TokenKind::LessThan, 1),
359 [b'>', ..] => (TokenKind::GreaterThan, 1),
360 [b',', ..] => (TokenKind::Comma, 1),
361 [b'[', ..] => (TokenKind::LeftBracket, 1),
362 [b']', ..] => (TokenKind::RightBracket, 1),
363 [b'{', ..] => (TokenKind::LeftBrace, 1),
364 [b'}', ..] => (TokenKind::RightBrace, 1),
365 [b':', ..] => (TokenKind::Colon, 1),
366 [b'~', ..] => (TokenKind::Tilde, 1),
367 [b'|', ..] => (TokenKind::Pipe, 1),
368 [b'^', ..] => (TokenKind::Caret, 1),
369 [b'*', ..] => (TokenKind::Asterisk, 1),
370 [b'/', ..] => (TokenKind::Slash, 1),
371 [quote @ b'\'', ..] => read_literal_string(&self.input, quote),
372 [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
373 read_literal_string(&self.input, quote)
374 }
375 [b'"', ..] => (TokenKind::DoubleQuote, 1),
376 [b'(', ..] => 'parenthesis: {
377 for (value, kind) in internal::consts::CAST_TYPES {
378 if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
379 break 'parenthesis (kind, length);
380 }
381 }
382
383 (TokenKind::LeftParenthesis, 1)
384 }
385 [b'#', ..] => {
386 let mut length = 1;
387 loop {
388 match self.input.peek(length, 3) {
389 [b'\n' | b'\r', ..] => {
390 break;
391 }
392 [w, b'?', b'>'] if w.is_ascii_whitespace() => {
393 break;
394 }
395 [b'?', b'>', ..] | [] => {
396 break;
397 }
398 [_, ..] => {
399 length += 1;
400 }
401 }
402 }
403
404 (TokenKind::HashComment, length)
405 }
406 [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
407 [start_of_identifier!(), ..] => 'identifier: {
408 let mut length = 1;
409 let mut ended_with_slash = false;
410 loop {
411 match self.input.peek(length, 2) {
412 [part_of_identifier!(), ..] => {
413 length += 1;
414 }
415 [b'\\', start_of_identifier!(), ..] => {
416 ended_with_slash = true;
417 break;
418 }
419 [b'(', ..] if length == 7 => {
421 if self.input.is_at(b"private(set)", true) {
422 break 'identifier (TokenKind::PrivateSet, 7 + 5);
423 }
424
425 break;
426 }
427 [b'(', ..] if length == 6 => {
429 if self.input.is_at(b"public(set)", true) {
430 break 'identifier (TokenKind::PublicSet, 6 + 5);
431 }
432
433 break;
434 }
435 [b'(', ..] if length == 9 => {
437 if self.input.is_at(b"protected(set)", true) {
438 break 'identifier (TokenKind::ProtectedSet, 9 + 5);
439 }
440
441 break;
442 }
443 _ => {
444 break;
445 }
446 }
447 }
448
449 if !ended_with_slash {
450 for (value, kind) in internal::consts::KEYWORD_TYPES {
451 if value.len() != length {
452 continue;
453 }
454
455 if self.input.is_at(value, true) {
456 break 'identifier (kind, value.len());
457 }
458 }
459 }
460
461 let mut slashes = 0;
462 let mut last_was_slash = false;
463 loop {
464 match self.input.peek(length, 1) {
465 [start_of_identifier!(), ..] if last_was_slash => {
466 length += 1;
467 last_was_slash = false;
468 }
469 [part_of_identifier!(), ..] if !last_was_slash => {
470 length += 1;
471 }
472 [b'\\', ..] if !self.interpolating => {
473 if !last_was_slash {
474 length += 1;
475 slashes += 1;
476 last_was_slash = true;
477 } else {
478 length -= 1;
479 slashes -= 1;
480 last_was_slash = false;
481
482 break;
483 }
484 }
485 _ => {
486 break;
487 }
488 }
489 }
490
491 if last_was_slash {
492 length -= 1;
493 slashes -= 1;
494 }
495
496 if slashes > 0 {
497 (TokenKind::QualifiedIdentifier, length)
498 } else {
499 (TokenKind::Identifier, length)
500 }
501 }
502 [b'.', start_of_number!(), ..] => {
503 let mut length = read_digits_of_base(&self.input, 2, 10);
504 if let float_exponent!() = self.input.peek(length, 1) {
505 length += 1;
506 if let number_sign!() = self.input.peek(length, 1) {
507 length += 1;
508 }
509
510 length = read_digits_of_base(&self.input, length, 10);
511 }
512
513 (TokenKind::LiteralFloat, length)
514 }
515 [start_of_number!(), ..] => 'number: {
516 let mut length = 1;
517
518 let (base, kind): (u8, NumberKind) = match self.input.read(3) {
519 start_of_binary_number!() => {
520 length += 1;
521
522 (2, NumberKind::Integer)
523 }
524 start_of_octal_number!() => {
525 length += 1;
526
527 (8, NumberKind::Integer)
528 }
529 start_of_hexadecimal_number!() => {
530 length += 1;
531
532 (16, NumberKind::Integer)
533 }
534 start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
535 start_of_float_number!() => (10, NumberKind::Float),
536 _ => (10, NumberKind::IntegerOrFloat),
537 };
538
539 if kind != NumberKind::Float {
540 length = read_digits_of_base(&self.input, length, base);
541
542 if kind == NumberKind::Integer {
543 break 'number (TokenKind::LiteralInteger, length);
544 }
545 }
546
547 let is_float = matches!(self.input.peek(length, 3), float_separator!());
548
549 if !is_float {
550 break 'number (TokenKind::LiteralInteger, length);
551 }
552
553 if let [b'.'] = self.input.peek(length, 1) {
554 length += 1;
555 length = read_digits_of_base(&self.input, length, 10);
556 }
557
558 if let float_exponent!() = self.input.peek(length, 1) {
559 length += 1;
560 if let number_sign!() = self.input.peek(length, 1) {
561 length += 1;
562 }
563
564 length = read_digits_of_base(&self.input, length, 10);
565 }
566
567 (TokenKind::LiteralFloat, length)
568 }
569 [b'.', ..] => (TokenKind::Dot, 1),
570 [unknown_byte, ..] => {
571 return Some(Err(SyntaxError::UnrecognizedToken(*unknown_byte, self.input.current_position())));
572 }
573 [] => {
574 unreachable!()
577 }
578 };
579
580 self.mode = match token_kind {
581 TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
582 TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
583 TokenKind::CloseTag => LexerMode::Inline,
584 TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
585 TokenKind::DocumentStart(document_kind) => {
586 LexerMode::DocumentString(document_kind, document_label, Interpolation::None)
587 }
588 _ => LexerMode::Script,
589 };
590
591 let start = self.input.current_position();
592 let buffer = self.input.consume(len);
593 let end = self.input.current_position();
594
595 self.token(token_kind, buffer, start, end)
596 }
597 LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
598 Interpolation::None => {
599 let start = self.input.current_position();
600
601 let mut length = 0;
602 let mut last_was_slash = false;
603 let mut token_kind = TokenKind::StringPart;
604 loop {
605 match self.input.peek(length, 2) {
606 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
607 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
608
609 self.mode =
610 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
611
612 break;
613 }
614 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
615 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
616
617 self.mode =
618 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
619
620 break;
621 }
622 [b'\\', ..] => {
623 length += 1;
624
625 last_was_slash = !last_was_slash;
626 }
627 [b'"', ..] if !last_was_slash => {
628 if length == 0 {
629 length += 1;
630 token_kind = TokenKind::DoubleQuote;
631
632 break;
633 }
634
635 break;
636 }
637 [_, ..] => {
638 length += 1;
639 last_was_slash = false;
640 }
641 [] => {
642 break;
643 }
644 }
645 }
646
647 let buffer = self.input.consume(length);
648 let end = self.input.current_position();
649
650 if TokenKind::DoubleQuote == token_kind {
651 self.mode = LexerMode::Script;
652 }
653
654 self.token(token_kind, buffer, start, end)
655 }
656 Interpolation::Until(offset) => {
657 self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None))
658 }
659 },
660 LexerMode::ShellExecuteString(interpolation) => match &interpolation {
661 Interpolation::None => {
662 let start = self.input.current_position();
663
664 let mut length = 0;
665 let mut last_was_slash = false;
666 let mut token_kind = TokenKind::StringPart;
667 loop {
668 match self.input.peek(length, 2) {
669 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
670 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
671
672 self.mode =
673 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
674
675 break;
676 }
677 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
678 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
679
680 self.mode =
681 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
682
683 break;
684 }
685 [b'\\', ..] => {
686 length += 1;
687 last_was_slash = true;
688 }
689 [b'`', ..] if !last_was_slash => {
690 if length == 0 {
691 length += 1;
692 token_kind = TokenKind::Backtick;
693
694 break;
695 }
696
697 break;
698 }
699 [_, ..] => {
700 length += 1;
701 last_was_slash = false;
702 }
703 [] => {
704 break;
705 }
706 }
707 }
708
709 let buffer = self.input.consume(length);
710 let end = self.input.current_position();
711
712 if TokenKind::Backtick == token_kind {
713 self.mode = LexerMode::Script;
714 }
715
716 self.token(token_kind, buffer, start, end)
717 }
718 Interpolation::Until(offset) => {
719 self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None))
720 }
721 },
722 LexerMode::DocumentString(kind, label, interpolation) => match &kind {
723 DocumentKind::Heredoc => match &interpolation {
724 Interpolation::None => {
725 let start = self.input.current_position();
726
727 let mut length = 0;
728 let mut last_was_slash = false;
729 let mut only_whitespaces = true;
730 let mut token_kind = TokenKind::StringPart;
731 loop {
732 match self.input.peek(length, 2) {
733 [b'\n', ..] => {
734 length += 1;
735
736 break;
737 }
738 [byte, ..] if byte.is_ascii_whitespace() => {
739 length += 1;
740 }
741 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
742 let until_offset =
743 read_until_end_of_variable_interpolation(&self.input, length + 2);
744
745 self.mode = LexerMode::DocumentString(
746 kind,
747 label,
748 Interpolation::Until(start.offset + until_offset),
749 );
750
751 break;
752 }
753 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
754 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
755
756 self.mode = LexerMode::DocumentString(
757 kind,
758 label,
759 Interpolation::Until(start.offset + until_offset),
760 );
761
762 break;
763 }
764 [b'\\', ..] => {
765 length += 1;
766 last_was_slash = true;
767 only_whitespaces = false;
768 }
769 [_, ..] => {
770 if only_whitespaces
771 && self.input.peek(length, label.len()) == label
772 && self
773 .input
774 .peek(length + label.len(), 1)
775 .first()
776 .is_none_or(|c| !c.is_ascii_alphanumeric())
777 {
778 length += label.len();
779 token_kind = TokenKind::DocumentEnd;
780
781 break;
782 }
783
784 length += 1;
785 last_was_slash = false;
786 only_whitespaces = false;
787 }
788 [] => {
789 break;
790 }
791 }
792 }
793
794 let buffer = self.input.consume(length);
795 let end = self.input.current_position();
796
797 if TokenKind::DocumentEnd == token_kind {
798 self.mode = LexerMode::Script;
799 }
800
801 self.token(token_kind, buffer, start, end)
802 }
803 Interpolation::Until(offset) => {
804 self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None))
805 }
806 },
807 DocumentKind::Nowdoc => {
808 let start = self.input.current_position();
809
810 let mut length = 0;
811 let mut terminated = false;
812 let mut only_whitespaces = true;
813
814 loop {
815 match self.input.peek(length, 1) {
816 [b'\n', ..] => {
817 length += 1;
818
819 break;
820 }
821 [byte, ..] if byte.is_ascii_whitespace() => {
822 length += 1;
823 }
824 [_, ..] => {
825 if only_whitespaces
826 && self.input.peek(length, label.len()) == label
827 && self
828 .input
829 .peek(length + label.len(), 1)
830 .first()
831 .is_none_or(|c| !c.is_ascii_alphanumeric())
832 {
833 length += label.len();
834 terminated = true;
835
836 break;
837 }
838
839 only_whitespaces = false;
840 length += 1;
841 }
842 [] => {
843 break;
844 }
845 }
846 }
847
848 let buffer = self.input.consume(length);
849 let end = self.input.current_position();
850
851 if terminated {
852 self.mode = LexerMode::Script;
853
854 return self.token(TokenKind::DocumentEnd, buffer, start, end);
855 }
856
857 self.token(TokenKind::StringPart, buffer, start, end)
858 }
859 },
860 LexerMode::Halt(stage) => 'halt: {
861 let start = self.input.current_position();
862 if let HaltStage::End = stage {
863 let buffer = self.input.consume_remaining();
864 let end = self.input.current_position();
865
866 break 'halt self.token(TokenKind::InlineText, buffer, start, end);
867 }
868
869 let whitespaces = self.input.consume_whitespaces();
870 if !whitespaces.is_empty() {
871 let end = self.input.current_position();
872
873 break 'halt self.token(TokenKind::Whitespace, whitespaces, start, end);
874 }
875
876 match &stage {
877 HaltStage::LookingForLeftParenthesis => {
878 if self.input.is_at(b"(", false) {
879 let buffer = self.input.consume(1);
880 let end = self.input.current_position();
881
882 self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
883
884 self.token(TokenKind::LeftParenthesis, buffer, start, end)
885 } else {
886 Some(Err(SyntaxError::UnexpectedToken(
887 self.input.read(1)[0],
888 self.input.current_position(),
889 )))
890 }
891 }
892 HaltStage::LookingForRightParenthesis => {
893 if self.input.is_at(b")", false) {
894 let buffer = self.input.consume(1);
895 let end = self.input.current_position();
896
897 self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
898
899 self.token(TokenKind::RightParenthesis, buffer, start, end)
900 } else {
901 Some(Err(SyntaxError::UnexpectedToken(
902 self.input.read(1)[0],
903 self.input.current_position(),
904 )))
905 }
906 }
907 HaltStage::LookingForTerminator => {
908 if self.input.is_at(b";", false) {
909 let buffer = self.input.consume(1);
910 let end = self.input.current_position();
911
912 self.mode = LexerMode::Halt(HaltStage::End);
913
914 self.token(TokenKind::Semicolon, buffer, start, end)
915 } else if self.input.is_at(b"?>", false) {
916 let buffer = self.input.consume(2);
917 let end = self.input.current_position();
918
919 self.mode = LexerMode::Halt(HaltStage::End);
920
921 self.token(TokenKind::CloseTag, buffer, start, end)
922 } else {
923 return Some(Err(SyntaxError::UnexpectedToken(
924 self.input.read(1)[0],
925 self.input.current_position(),
926 )));
927 }
928 }
929 _ => unreachable!(),
930 }
931 }
932 }
933 }
934
935 #[inline]
936 fn token(
937 &mut self,
938 kind: TokenKind,
939 value: &[u8],
940 from: Position,
941 to: Position,
942 ) -> Option<Result<Token, SyntaxError>> {
943 Some(Ok(Token { kind, value: self.interner.intern(String::from_utf8_lossy(value)), span: Span::new(from, to) }))
944 }
945
946 #[inline]
947 fn interpolation(&mut self, until: usize, next_mode: LexerMode<'a>) -> Option<Result<Token, SyntaxError>> {
948 let mut mode = LexerMode::Script;
949
950 mem::swap(&mut self.mode, &mut mode);
951 self.interpolating = true;
952
953 let result = self.advance();
954
955 mem::swap(&mut self.mode, &mut mode);
956 self.interpolating = false;
957
958 match result {
959 Some(Ok(token)) if token.span.has_offset(until) => {
960 self.mode = next_mode;
961 }
962 _ => {}
963 }
964
965 result
966 }
967}
968
969#[inline]
970fn matches_start_of_heredoc_document(input: &Input) -> bool {
971 let total = input.len();
972 let base = input.current_offset();
973
974 let mut length = 3;
976 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
978 length += 1;
979 }
980
981 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
983 return false;
984 }
985 length += 1; loop {
989 let pos = base + length;
990 if pos >= total {
991 return false; }
993
994 if *input.read_at(pos) == b'\n' {
995 return true; } else if is_part_of_identifier(input.read_at(pos)) {
997 length += 1;
998 } else {
999 return false; }
1001 }
1002}
1003
1004#[inline]
1005fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1006 let total = input.len();
1007 let base = input.current_offset();
1008
1009 let mut length = 3;
1011 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1012 length += 1;
1013 }
1014
1015 if base + length >= total || *input.read_at(base + length) != b'"' {
1017 return false;
1018 }
1019 length += 1;
1020
1021 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1023 return false;
1024 }
1025 length += 1;
1026
1027 let mut terminated = false;
1029 loop {
1030 let pos = base + length;
1031 if pos >= total {
1032 return false;
1033 }
1034 let byte = input.read_at(pos);
1035 if *byte == b'\n' {
1036 return terminated;
1038 } else if !terminated && is_part_of_identifier(byte) {
1039 length += 1;
1040 } else if !terminated && *byte == b'"' {
1041 terminated = true;
1042 length += 1;
1043 } else {
1044 return false;
1045 }
1046 }
1047}
1048
1049#[inline]
1050fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1051 let total = input.len();
1052 let base = input.current_offset();
1053
1054 let mut length = 3;
1056 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1057 length += 1;
1058 }
1059
1060 if base + length >= total || *input.read_at(base + length) != b'\'' {
1062 return false;
1063 }
1064 length += 1;
1065
1066 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1068 return false;
1069 }
1070 length += 1;
1071
1072 let mut terminated = false;
1074 loop {
1075 let pos = base + length;
1076 if pos >= total {
1077 return false;
1078 }
1079 let byte = *input.read_at(pos);
1080 if byte == b'\n' {
1081 return terminated;
1082 } else if !terminated && is_part_of_identifier(&byte) {
1083 length += 1;
1084 } else if !terminated && byte == b'\'' {
1085 terminated = true;
1086 length += 1;
1087 } else {
1088 return false;
1089 }
1090 }
1091}
1092
1093#[inline]
1094fn matches_literal_double_quote_string(input: &Input) -> bool {
1095 let total = input.len();
1096 let base = input.current_offset();
1097
1098 let mut pos = base + 1;
1100 loop {
1101 if pos >= total {
1102 return true;
1104 }
1105 let byte = *input.read_at(pos);
1106 if byte == b'"' {
1107 return true;
1109 } else if byte == b'\\' {
1110 pos += 2;
1112 continue;
1113 } else {
1114 if pos + 1 < total {
1117 let next = *input.read_at(pos + 1);
1118 if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1119 return false;
1120 }
1121 }
1122 pos += 1;
1123 }
1124 }
1125}
1126
1127#[inline]
1128fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1129 let total = input.len();
1130 let base = input.current_offset();
1131
1132 let mut pos = base + 3;
1135 let mut whitespaces = 0;
1136 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1137 whitespaces += 1;
1138 pos += 1;
1139 }
1140
1141 let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1147
1148 let mut label_length = 1; let mut terminated = false; loop {
1152 let pos = base + length;
1153 if pos >= total {
1155 unreachable!("Unexpected end of input while reading heredoc label");
1156 }
1157
1158 let byte = *input.read_at(pos);
1159 if byte == b'\n' {
1160 length += 1;
1162 return (length, whitespaces, label_length);
1163 } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1164 length += 1;
1167 label_length += 1;
1168 } else if double_quoted && !terminated && byte == b'"' {
1169 length += 1;
1171 terminated = true;
1172 } else {
1173 unreachable!("Unexpected character encountered in heredoc label");
1174 }
1175 }
1176}
1177
1178#[inline]
1179fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1180 let total = input.len();
1181 let base = input.current_offset();
1182
1183 let mut pos = base + 3;
1185 let mut whitespaces = 0;
1186 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1187 whitespaces += 1;
1188 pos += 1;
1189 }
1190
1191 let mut length = 3 + whitespaces + 2;
1194
1195 let mut label_length = 1;
1197 let mut terminated = false;
1198 loop {
1199 let pos = base + length;
1200 if pos >= total {
1201 unreachable!("Unexpected end of input while reading nowdoc label");
1202 }
1203 let byte = *input.read_at(pos);
1204
1205 if byte == b'\n' {
1206 length += 1;
1208 return (length, whitespaces, label_length);
1209 } else if is_part_of_identifier(&byte) && !terminated {
1210 length += 1;
1212 label_length += 1;
1213 } else if !terminated && byte == b'\'' {
1214 length += 1;
1216 terminated = true;
1217 } else {
1218 unreachable!("Unexpected character encountered in nowdoc label");
1219 }
1220 }
1221}
1222
1223#[inline]
1224fn read_literal_string(input: &Input, quote: &u8) -> (TokenKind, usize) {
1225 let total = input.len();
1226 let start = input.current_offset();
1227 let mut length = 1; let mut last_was_backslash = false;
1229 let mut partial = false;
1230
1231 loop {
1232 let pos = start + length;
1233 if pos >= total {
1234 partial = true;
1236 break;
1237 }
1238
1239 let byte = input.read_at(pos);
1240 if *byte == b'\\' {
1241 last_was_backslash = !last_was_backslash;
1243 length += 1;
1244 } else {
1245 if *byte == *quote && !last_was_backslash {
1247 length += 1; break;
1249 }
1250 length += 1;
1251 last_was_backslash = false;
1252 }
1253 }
1254
1255 if partial { (TokenKind::PartialLiteralString, length) } else { (TokenKind::LiteralString, length) }
1256}
1257
1258#[inline]
1259fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> usize {
1260 let total = input.len();
1261 let base = input.current_offset();
1262 let mut offset = from;
1264
1265 loop {
1266 let abs = base + offset;
1267 if abs >= total {
1268 break;
1270 }
1271
1272 if is_part_of_identifier(input.read_at(abs)) {
1274 offset += 1;
1275 continue;
1276 }
1277
1278 if *input.read_at(abs) == b'[' {
1280 offset += 1;
1281 let mut nesting = 0;
1282 loop {
1283 let abs_inner = base + offset;
1284 if abs_inner >= total {
1285 break;
1286 }
1287 let b = input.read_at(abs_inner);
1288 if *b == b']' {
1289 offset += 1;
1290 if nesting == 0 {
1291 break;
1292 } else {
1293 nesting -= 1;
1294 }
1295 } else if *b == b'[' {
1296 offset += 1;
1297 nesting += 1;
1298 } else if b.is_ascii_whitespace() {
1299 break;
1301 } else {
1302 offset += 1;
1303 }
1304 }
1305 break;
1307 }
1308
1309 if base + offset + 2 < total
1311 && *input.read_at(abs) == b'-'
1312 && *input.read_at(base + offset + 1) == b'>'
1313 && is_start_of_identifier(input.read_at(base + offset + 2))
1314 {
1315 offset += 3;
1316 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1318 offset += 1;
1319 }
1320 break;
1321 }
1322
1323 if base + offset + 3 < total
1325 && *input.read_at(abs) == b'?'
1326 && *input.read_at(base + offset + 1) == b'-'
1327 && *input.read_at(base + offset + 2) == b'>'
1328 && is_start_of_identifier(input.read_at(base + offset + 3))
1329 {
1330 offset += 4;
1331 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1332 offset += 1;
1333 }
1334 break;
1335 }
1336
1337 break;
1339 }
1340
1341 offset
1342}
1343
1344#[inline]
1345fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> usize {
1346 let total = input.len();
1347 let base = input.current_offset();
1348 let mut offset = from;
1349 let mut nesting = 0;
1350
1351 loop {
1352 let abs = base + offset;
1353 if abs >= total {
1354 break;
1355 }
1356 match input.read_at(abs) {
1357 b'}' => {
1358 offset += 1;
1359 if nesting == 0 {
1360 break;
1361 } else {
1362 nesting -= 1;
1363 }
1364 }
1365 b'{' => {
1366 offset += 1;
1367 nesting += 1;
1368 }
1369 _ => {
1370 offset += 1;
1371 }
1372 }
1373 }
1374
1375 offset
1376}