1use std::fmt::Debug;
2use std::mem;
3
4use mago_interner::ThreadedInterner;
5use mago_span::Position;
6use mago_span::Span;
7
8use mago_syntax_core::input::Input;
9use mago_syntax_core::utils::is_part_of_identifier;
10use mago_syntax_core::utils::is_start_of_identifier;
11use mago_syntax_core::utils::read_digits_of_base;
12use mago_syntax_core::*;
13
14use crate::error::SyntaxError;
15use crate::lexer::internal::mode::HaltStage;
16use crate::lexer::internal::mode::Interpolation;
17use crate::lexer::internal::mode::LexerMode;
18use crate::lexer::internal::utils::NumberKind;
19use crate::token::DocumentKind;
20use crate::token::Token;
21use crate::token::TokenKind;
22
23mod internal;
24
/// A byte-oriented lexer that turns an [`Input`] into a stream of [`Token`]s.
///
/// The lexer is a small state machine: `mode` selects which set of rules
/// `advance` applies to the bytes at the current position.
#[derive(Debug)]
pub struct Lexer<'a, 'i> {
    /// Interner used to store the textual value of every produced token.
    interner: &'i ThreadedInterner,
    /// The byte source being tokenized, including the current read position.
    input: Input<'a>,
    /// The rule set currently applied by `advance`.
    mode: LexerMode<'a>,
    /// `true` only while a token is being lexed on behalf of a string
    /// interpolation; in that state a `\` does not extend an identifier.
    interpolating: bool,
}
43
44impl<'a, 'i> Lexer<'a, 'i> {
45 pub fn new(interner: &'i ThreadedInterner, input: Input<'a>) -> Lexer<'a, 'i> {
56 Lexer { interner, input, mode: LexerMode::Inline, interpolating: false }
57 }
58
59 pub fn scripting(interner: &'i ThreadedInterner, input: Input<'a>) -> Lexer<'a, 'i> {
70 Lexer { interner, input, mode: LexerMode::Script, interpolating: false }
71 }
72
73 pub fn has_reached_eof(&self) -> bool {
77 self.input.has_reached_eof()
78 }
79
80 pub fn get_position(&self) -> Position {
82 self.input.current_position()
83 }
84
85 #[inline]
141 pub fn advance(&mut self) -> Option<Result<Token, SyntaxError>> {
142 if self.input.has_reached_eof() {
143 return None;
144 }
145
146 match self.mode {
147 LexerMode::Inline => {
148 let start = self.input.current_position();
149 if self.input.is_at(b"<?", false) {
150 let (kind, buffer) = if self.input.is_at(b"<?php", true) {
151 (TokenKind::OpenTag, self.input.consume(5))
152 } else if self.input.is_at(b"<?=", false) {
153 (TokenKind::EchoTag, self.input.consume(3))
154 } else {
155 (TokenKind::ShortOpenTag, self.input.consume(2))
156 };
157
158 let end = self.input.current_position();
159 let tag = self.token(kind, buffer, start, end);
160
161 self.mode = LexerMode::Script;
162
163 return tag;
164 }
165
166 if self.input.is_at(b"#!", true) {
167 let buffer = self.input.consume_through(b'\n');
168 let end = self.input.current_position();
169
170 self.token(TokenKind::InlineShebang, buffer, start, end)
171 } else {
172 let buffer = self.input.consume_until(b"<?", false);
173 let end = self.input.current_position();
174
175 self.token(TokenKind::InlineText, buffer, start, end)
176 }
177 }
178 LexerMode::Script => {
179 let whitespaces = self.input.consume_whitespaces();
180 if !whitespaces.is_empty() {
181 let start = self.input.current_position();
182 let buffer = whitespaces;
183 let end = self.input.current_position();
184
185 return self.token(TokenKind::Whitespace, buffer, start, end);
186 }
187
188 let mut document_label: &[u8] = &[];
189
190 let (token_kind, len) = match self.input.read(3) {
191 [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
192 [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
193 [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
194 [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
195 [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
196 [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
197 [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
198 [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
199 [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
200 [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
201 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
202
203 document_label = self.input.peek(3 + whitespaces, label_length);
204
205 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
206 }
207 [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
208 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
209
210 document_label = self.input.peek(4 + whitespaces, label_length);
211
212 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
213 }
214 [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
215 let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
216
217 document_label = self.input.peek(4 + whitespaces, label_length);
218
219 (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
220 }
221 [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
222 [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
223 [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
224 [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
225 [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
226 [b'?', b':', ..] => (TokenKind::QuestionColon, 2),
227 [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
228 [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
229 [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
230 [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
231 [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
232 [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
233 [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
234 [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
235 [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
236 [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
237 [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
238 [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
239 [b'>', b'>', ..] => (TokenKind::RightShift, 2),
240 [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
241 [b':', b':', ..] => (TokenKind::ColonColon, 2),
242 [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
243 [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
244 [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
245 [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
246 [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
247 [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
248 [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
249 [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
250 [b'/', b'/', ..] => {
251 let mut length = 2;
252 loop {
253 match self.input.peek(length, 3) {
254 [b'\n' | b'\r', ..] => {
255 break;
256 }
257 [w, b'?', b'>'] if w.is_ascii_whitespace() => {
258 break;
259 }
260 [b'?', b'>', ..] | [] => {
261 break;
262 }
263 [_, ..] => {
264 length += 1;
265 }
266 }
267 }
268
269 (TokenKind::SingleLineComment, length)
270 }
271 [b'/', b'*', asterisk] => {
272 let mut length = 2;
273 let mut is_multiline = false;
274 let mut terminated = false;
275 loop {
276 match self.input.peek(length, 2) {
277 [b'*', b'/'] => {
278 if length == 2 {
279 is_multiline = true;
280 }
281
282 length += 2;
283
284 terminated = true;
285 break;
286 }
287 [_, ..] => {
288 length += 1;
289 }
290 [] => {
291 break;
292 }
293 }
294 }
295
296 if !terminated {
297 self.input.consume(length);
298
299 return Some(Err(SyntaxError::UnexpectedEndOfFile(self.input.current_position())));
300 }
301
302 if !is_multiline && asterisk == &b'*' {
303 (TokenKind::DocBlockComment, length)
304 } else {
305 (TokenKind::MultiLineComment, length)
306 }
307 }
308 [b'\\', start_of_identifier!(), ..] => {
309 let mut length = 2;
310 let mut last_was_slash = false;
311 loop {
312 match self.input.peek(length, 1) {
313 [start_of_identifier!(), ..] if last_was_slash => {
314 length += 1;
315 last_was_slash = false;
316 }
317 [part_of_identifier!(), ..] if !last_was_slash => {
318 length += 1;
319 }
320 [b'\\', ..] => {
321 if last_was_slash {
322 length -= 1;
323
324 break;
325 }
326
327 length += 1;
328 last_was_slash = true;
329 }
330 _ => {
331 break;
332 }
333 }
334 }
335
336 (TokenKind::FullyQualifiedIdentifier, length)
337 }
338 [b'$', start_of_identifier!(), ..] => {
339 let mut length = 2;
340 while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
341 length += 1;
342 }
343
344 (TokenKind::Variable, length)
345 }
346 [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
347 [b'$', ..] => (TokenKind::Dollar, 1),
348 [b'@', ..] => (TokenKind::At, 1),
349 [b'!', ..] => (TokenKind::Bang, 1),
350 [b'&', ..] => (TokenKind::Ampersand, 1),
351 [b'?', ..] => (TokenKind::Question, 1),
352 [b'=', ..] => (TokenKind::Equal, 1),
353 [b'`', ..] => (TokenKind::Backtick, 1),
354 [b')', ..] => (TokenKind::RightParenthesis, 1),
355 [b';', ..] => (TokenKind::Semicolon, 1),
356 [b'+', ..] => (TokenKind::Plus, 1),
357 [b'%', ..] => (TokenKind::Percent, 1),
358 [b'-', ..] => (TokenKind::Minus, 1),
359 [b'<', ..] => (TokenKind::LessThan, 1),
360 [b'>', ..] => (TokenKind::GreaterThan, 1),
361 [b',', ..] => (TokenKind::Comma, 1),
362 [b'[', ..] => (TokenKind::LeftBracket, 1),
363 [b']', ..] => (TokenKind::RightBracket, 1),
364 [b'{', ..] => (TokenKind::LeftBrace, 1),
365 [b'}', ..] => (TokenKind::RightBrace, 1),
366 [b':', ..] => (TokenKind::Colon, 1),
367 [b'~', ..] => (TokenKind::Tilde, 1),
368 [b'|', ..] => (TokenKind::Pipe, 1),
369 [b'^', ..] => (TokenKind::Caret, 1),
370 [b'*', ..] => (TokenKind::Asterisk, 1),
371 [b'/', ..] => (TokenKind::Slash, 1),
372 [quote @ b'\'', ..] => read_literal_string(&self.input, quote),
373 [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
374 read_literal_string(&self.input, quote)
375 }
376 [b'"', ..] => (TokenKind::DoubleQuote, 1),
377 [b'(', ..] => 'parenthesis: {
378 for (value, kind) in internal::consts::CAST_TYPES {
379 if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
380 break 'parenthesis (kind, length);
381 }
382 }
383
384 (TokenKind::LeftParenthesis, 1)
385 }
386 [b'#', ..] => {
387 let mut length = 1;
388 loop {
389 match self.input.peek(length, 3) {
390 [b'\n' | b'\r', ..] => {
391 break;
392 }
393 [w, b'?', b'>'] if w.is_ascii_whitespace() => {
394 break;
395 }
396 [b'?', b'>', ..] | [] => {
397 break;
398 }
399 [_, ..] => {
400 length += 1;
401 }
402 }
403 }
404
405 (TokenKind::HashComment, length)
406 }
407 [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
408 [start_of_identifier!(), ..] => 'identifier: {
409 let mut length = 1;
410 let mut ended_with_slash = false;
411 loop {
412 match self.input.peek(length, 2) {
413 [part_of_identifier!(), ..] => {
414 length += 1;
415 }
416 [b'\\', start_of_identifier!(), ..] => {
417 ended_with_slash = true;
418 break;
419 }
420 [b'(', ..] if length == 7 => {
422 if self.input.is_at(b"private(set)", true) {
423 break 'identifier (TokenKind::PrivateSet, 7 + 5);
424 }
425
426 break;
427 }
428 [b'(', ..] if length == 6 => {
430 if self.input.is_at(b"public(set)", true) {
431 break 'identifier (TokenKind::PublicSet, 6 + 5);
432 }
433
434 break;
435 }
436 [b'(', ..] if length == 9 => {
438 if self.input.is_at(b"protected(set)", true) {
439 break 'identifier (TokenKind::ProtectedSet, 9 + 5);
440 }
441
442 break;
443 }
444 _ => {
445 break;
446 }
447 }
448 }
449
450 if !ended_with_slash {
451 for (value, kind) in internal::consts::KEYWORD_TYPES {
452 if value.len() != length {
453 continue;
454 }
455
456 if self.input.is_at(value, true) {
457 break 'identifier (kind, value.len());
458 }
459 }
460 }
461
462 let mut slashes = 0;
463 let mut last_was_slash = false;
464 loop {
465 match self.input.peek(length, 1) {
466 [start_of_identifier!(), ..] if last_was_slash => {
467 length += 1;
468 last_was_slash = false;
469 }
470 [part_of_identifier!(), ..] if !last_was_slash => {
471 length += 1;
472 }
473 [b'\\', ..] if !self.interpolating => {
474 if !last_was_slash {
475 length += 1;
476 slashes += 1;
477 last_was_slash = true;
478 } else {
479 length -= 1;
480 slashes -= 1;
481 last_was_slash = false;
482
483 break;
484 }
485 }
486 _ => {
487 break;
488 }
489 }
490 }
491
492 if last_was_slash {
493 length -= 1;
494 slashes -= 1;
495 }
496
497 if slashes > 0 {
498 (TokenKind::QualifiedIdentifier, length)
499 } else {
500 (TokenKind::Identifier, length)
501 }
502 }
503 [b'.', start_of_number!(), ..] => {
504 let mut length = read_digits_of_base(&self.input, 2, 10);
505 if let float_exponent!() = self.input.peek(length, 1) {
506 length += 1;
507 if let number_sign!() = self.input.peek(length, 1) {
508 length += 1;
509 }
510
511 length = read_digits_of_base(&self.input, length, 10);
512 }
513
514 (TokenKind::LiteralFloat, length)
515 }
516 [start_of_number!(), ..] => 'number: {
517 let mut length = 1;
518
519 let (base, kind): (u8, NumberKind) = match self.input.read(3) {
520 start_of_binary_number!() => {
521 length += 1;
522
523 (2, NumberKind::Integer)
524 }
525 start_of_octal_number!() => {
526 length += 1;
527
528 (8, NumberKind::Integer)
529 }
530 start_of_hexadecimal_number!() => {
531 length += 1;
532
533 (16, NumberKind::Integer)
534 }
535 start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
536 start_of_float_number!() => (10, NumberKind::Float),
537 _ => (10, NumberKind::IntegerOrFloat),
538 };
539
540 if kind != NumberKind::Float {
541 length = read_digits_of_base(&self.input, length, base);
542
543 if kind == NumberKind::Integer {
544 break 'number (TokenKind::LiteralInteger, length);
545 }
546 }
547
548 let is_float = matches!(self.input.peek(length, 3), float_separator!());
549
550 if !is_float {
551 break 'number (TokenKind::LiteralInteger, length);
552 }
553
554 if let [b'.'] = self.input.peek(length, 1) {
555 length += 1;
556 length = read_digits_of_base(&self.input, length, 10);
557 }
558
559 if let float_exponent!() = self.input.peek(length, 1) {
560 length += 1;
561 if let number_sign!() = self.input.peek(length, 1) {
562 length += 1;
563 }
564
565 length = read_digits_of_base(&self.input, length, 10);
566 }
567
568 (TokenKind::LiteralFloat, length)
569 }
570 [b'.', ..] => (TokenKind::Dot, 1),
571 [unknown_byte, ..] => {
572 return Some(Err(SyntaxError::UnrecognizedToken(*unknown_byte, self.input.current_position())));
573 }
574 [] => {
575 unreachable!()
578 }
579 };
580
581 self.mode = match token_kind {
582 TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
583 TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
584 TokenKind::CloseTag => LexerMode::Inline,
585 TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
586 TokenKind::DocumentStart(document_kind) => {
587 LexerMode::DocumentString(document_kind, document_label, Interpolation::None)
588 }
589 _ => LexerMode::Script,
590 };
591
592 let start = self.input.current_position();
593 let buffer = self.input.consume(len);
594 let end = self.input.current_position();
595
596 self.token(token_kind, buffer, start, end)
597 }
598 LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
599 Interpolation::None => {
600 let start = self.input.current_position();
601
602 let mut length = 0;
603 let mut last_was_slash = false;
604 let mut token_kind = TokenKind::StringPart;
605 loop {
606 match self.input.peek(length, 2) {
607 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
608 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
609
610 self.mode =
611 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
612
613 break;
614 }
615 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
616 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
617
618 self.mode =
619 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
620
621 break;
622 }
623 [b'\\', ..] => {
624 length += 1;
625
626 last_was_slash = !last_was_slash;
627 }
628 [b'"', ..] if !last_was_slash => {
629 if length == 0 {
630 length += 1;
631 token_kind = TokenKind::DoubleQuote;
632
633 break;
634 }
635
636 break;
637 }
638 [_, ..] => {
639 length += 1;
640 last_was_slash = false;
641 }
642 [] => {
643 break;
644 }
645 }
646 }
647
648 let buffer = self.input.consume(length);
649 let end = self.input.current_position();
650
651 if TokenKind::DoubleQuote == token_kind {
652 self.mode = LexerMode::Script;
653 }
654
655 self.token(token_kind, buffer, start, end)
656 }
657 Interpolation::Until(offset) => {
658 self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None))
659 }
660 },
661 LexerMode::ShellExecuteString(interpolation) => match &interpolation {
662 Interpolation::None => {
663 let start = self.input.current_position();
664
665 let mut length = 0;
666 let mut last_was_slash = false;
667 let mut token_kind = TokenKind::StringPart;
668 loop {
669 match self.input.peek(length, 2) {
670 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
671 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
672
673 self.mode =
674 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
675
676 break;
677 }
678 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
679 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
680
681 self.mode =
682 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
683
684 break;
685 }
686 [b'\\', ..] => {
687 length += 1;
688 last_was_slash = true;
689 }
690 [b'`', ..] if !last_was_slash => {
691 if length == 0 {
692 length += 1;
693 token_kind = TokenKind::Backtick;
694
695 break;
696 }
697
698 break;
699 }
700 [_, ..] => {
701 length += 1;
702 last_was_slash = false;
703 }
704 [] => {
705 break;
706 }
707 }
708 }
709
710 let buffer = self.input.consume(length);
711 let end = self.input.current_position();
712
713 if TokenKind::Backtick == token_kind {
714 self.mode = LexerMode::Script;
715 }
716
717 self.token(token_kind, buffer, start, end)
718 }
719 Interpolation::Until(offset) => {
720 self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None))
721 }
722 },
723 LexerMode::DocumentString(kind, label, interpolation) => match &kind {
724 DocumentKind::Heredoc => match &interpolation {
725 Interpolation::None => {
726 let start = self.input.current_position();
727
728 let mut length = 0;
729 let mut last_was_slash = false;
730 let mut only_whitespaces = true;
731 let mut token_kind = TokenKind::StringPart;
732 loop {
733 match self.input.peek(length, 2) {
734 [b'\n', ..] => {
735 length += 1;
736
737 break;
738 }
739 [byte, ..] if byte.is_ascii_whitespace() => {
740 length += 1;
741 }
742 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
743 let until_offset =
744 read_until_end_of_variable_interpolation(&self.input, length + 2);
745
746 self.mode = LexerMode::DocumentString(
747 kind,
748 label,
749 Interpolation::Until(start.offset + until_offset),
750 );
751
752 break;
753 }
754 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
755 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
756
757 self.mode = LexerMode::DocumentString(
758 kind,
759 label,
760 Interpolation::Until(start.offset + until_offset),
761 );
762
763 break;
764 }
765 [b'\\', ..] => {
766 length += 1;
767 last_was_slash = true;
768 only_whitespaces = false;
769 }
770 [_, ..] => {
771 if only_whitespaces
772 && self.input.peek(length, label.len()) == label
773 && self
774 .input
775 .peek(length + label.len(), 1)
776 .first()
777 .is_none_or(|c| !c.is_ascii_alphanumeric())
778 {
779 length += label.len();
780 token_kind = TokenKind::DocumentEnd;
781
782 break;
783 }
784
785 length += 1;
786 last_was_slash = false;
787 only_whitespaces = false;
788 }
789 [] => {
790 break;
791 }
792 }
793 }
794
795 let buffer = self.input.consume(length);
796 let end = self.input.current_position();
797
798 if TokenKind::DocumentEnd == token_kind {
799 self.mode = LexerMode::Script;
800 }
801
802 self.token(token_kind, buffer, start, end)
803 }
804 Interpolation::Until(offset) => {
805 self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None))
806 }
807 },
808 DocumentKind::Nowdoc => {
809 let start = self.input.current_position();
810
811 let mut length = 0;
812 let mut terminated = false;
813 let mut only_whitespaces = true;
814
815 loop {
816 match self.input.peek(length, 1) {
817 [b'\n', ..] => {
818 length += 1;
819
820 break;
821 }
822 [byte, ..] if byte.is_ascii_whitespace() => {
823 length += 1;
824 }
825 [_, ..] => {
826 if only_whitespaces
827 && self.input.peek(length, label.len()) == label
828 && self
829 .input
830 .peek(length + label.len(), 1)
831 .first()
832 .is_none_or(|c| !c.is_ascii_alphanumeric())
833 {
834 length += label.len();
835 terminated = true;
836
837 break;
838 }
839
840 only_whitespaces = false;
841 length += 1;
842 }
843 [] => {
844 break;
845 }
846 }
847 }
848
849 let buffer = self.input.consume(length);
850 let end = self.input.current_position();
851
852 if terminated {
853 self.mode = LexerMode::Script;
854
855 return self.token(TokenKind::DocumentEnd, buffer, start, end);
856 }
857
858 self.token(TokenKind::StringPart, buffer, start, end)
859 }
860 },
861 LexerMode::Halt(stage) => 'halt: {
862 let start = self.input.current_position();
863 if let HaltStage::End = stage {
864 let buffer = self.input.consume_remaining();
865 let end = self.input.current_position();
866
867 break 'halt self.token(TokenKind::InlineText, buffer, start, end);
868 }
869
870 let whitespaces = self.input.consume_whitespaces();
871 if !whitespaces.is_empty() {
872 let end = self.input.current_position();
873
874 break 'halt self.token(TokenKind::Whitespace, whitespaces, start, end);
875 }
876
877 match &stage {
878 HaltStage::LookingForLeftParenthesis => {
879 if self.input.is_at(b"(", false) {
880 let buffer = self.input.consume(1);
881 let end = self.input.current_position();
882
883 self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
884
885 self.token(TokenKind::LeftParenthesis, buffer, start, end)
886 } else {
887 Some(Err(SyntaxError::UnexpectedToken(
888 self.input.read(1)[0],
889 self.input.current_position(),
890 )))
891 }
892 }
893 HaltStage::LookingForRightParenthesis => {
894 if self.input.is_at(b")", false) {
895 let buffer = self.input.consume(1);
896 let end = self.input.current_position();
897
898 self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
899
900 self.token(TokenKind::RightParenthesis, buffer, start, end)
901 } else {
902 Some(Err(SyntaxError::UnexpectedToken(
903 self.input.read(1)[0],
904 self.input.current_position(),
905 )))
906 }
907 }
908 HaltStage::LookingForTerminator => {
909 if self.input.is_at(b";", false) {
910 let buffer = self.input.consume(1);
911 let end = self.input.current_position();
912
913 self.mode = LexerMode::Halt(HaltStage::End);
914
915 self.token(TokenKind::Semicolon, buffer, start, end)
916 } else if self.input.is_at(b"?>", false) {
917 let buffer = self.input.consume(2);
918 let end = self.input.current_position();
919
920 self.mode = LexerMode::Halt(HaltStage::End);
921
922 self.token(TokenKind::CloseTag, buffer, start, end)
923 } else {
924 return Some(Err(SyntaxError::UnexpectedToken(
925 self.input.read(1)[0],
926 self.input.current_position(),
927 )));
928 }
929 }
930 _ => unreachable!(),
931 }
932 }
933 }
934 }
935
936 #[inline]
937 fn token(
938 &mut self,
939 kind: TokenKind,
940 value: &[u8],
941 from: Position,
942 to: Position,
943 ) -> Option<Result<Token, SyntaxError>> {
944 Some(Ok(Token { kind, value: self.interner.intern(String::from_utf8_lossy(value)), span: Span::new(from, to) }))
945 }
946
947 #[inline]
948 fn interpolation(&mut self, until: usize, next_mode: LexerMode<'a>) -> Option<Result<Token, SyntaxError>> {
949 let mut mode = LexerMode::Script;
950
951 mem::swap(&mut self.mode, &mut mode);
952 self.interpolating = true;
953
954 let result = self.advance();
955
956 mem::swap(&mut self.mode, &mut mode);
957 self.interpolating = false;
958
959 match result {
960 Some(Ok(token)) if token.span.has_offset(until) => {
961 self.mode = next_mode;
962 }
963 _ => {}
964 }
965
966 result
967 }
968}
969
970#[inline]
971fn matches_start_of_heredoc_document(input: &Input) -> bool {
972 let total = input.len();
973 let base = input.current_offset();
974
975 let mut length = 3;
977 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
979 length += 1;
980 }
981
982 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
984 return false;
985 }
986 length += 1; loop {
990 let pos = base + length;
991 if pos >= total {
992 return false; }
994
995 if *input.read_at(pos) == b'\n' {
996 return true; } else if is_part_of_identifier(input.read_at(pos)) {
998 length += 1;
999 } else {
1000 return false; }
1002 }
1003}
1004
1005#[inline]
1006fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1007 let total = input.len();
1008 let base = input.current_offset();
1009
1010 let mut length = 3;
1012 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1013 length += 1;
1014 }
1015
1016 if base + length >= total || *input.read_at(base + length) != b'"' {
1018 return false;
1019 }
1020 length += 1;
1021
1022 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1024 return false;
1025 }
1026 length += 1;
1027
1028 let mut terminated = false;
1030 loop {
1031 let pos = base + length;
1032 if pos >= total {
1033 return false;
1034 }
1035 let byte = input.read_at(pos);
1036 if *byte == b'\n' {
1037 return terminated;
1039 } else if !terminated && is_part_of_identifier(byte) {
1040 length += 1;
1041 } else if !terminated && *byte == b'"' {
1042 terminated = true;
1043 length += 1;
1044 } else {
1045 return false;
1046 }
1047 }
1048}
1049
1050#[inline]
1051fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1052 let total = input.len();
1053 let base = input.current_offset();
1054
1055 let mut length = 3;
1057 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1058 length += 1;
1059 }
1060
1061 if base + length >= total || *input.read_at(base + length) != b'\'' {
1063 return false;
1064 }
1065 length += 1;
1066
1067 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1069 return false;
1070 }
1071 length += 1;
1072
1073 let mut terminated = false;
1075 loop {
1076 let pos = base + length;
1077 if pos >= total {
1078 return false;
1079 }
1080 let byte = *input.read_at(pos);
1081 if byte == b'\n' {
1082 return terminated;
1083 } else if !terminated && is_part_of_identifier(&byte) {
1084 length += 1;
1085 } else if !terminated && byte == b'\'' {
1086 terminated = true;
1087 length += 1;
1088 } else {
1089 return false;
1090 }
1091 }
1092}
1093
1094#[inline]
1095fn matches_literal_double_quote_string(input: &Input) -> bool {
1096 let total = input.len();
1097 let base = input.current_offset();
1098
1099 let mut pos = base + 1;
1101 loop {
1102 if pos >= total {
1103 return true;
1105 }
1106 let byte = *input.read_at(pos);
1107 if byte == b'"' {
1108 return true;
1110 } else if byte == b'\\' {
1111 pos += 2;
1113 continue;
1114 } else {
1115 if pos + 1 < total {
1118 let next = *input.read_at(pos + 1);
1119 if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1120 return false;
1121 }
1122 }
1123 pos += 1;
1124 }
1125 }
1126}
1127
1128#[inline]
1129fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1130 let total = input.len();
1131 let base = input.current_offset();
1132
1133 let mut pos = base + 3;
1136 let mut whitespaces = 0;
1137 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1138 whitespaces += 1;
1139 pos += 1;
1140 }
1141
1142 let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1148
1149 let mut label_length = 1; let mut terminated = false; loop {
1153 let pos = base + length;
1154 if pos >= total {
1156 unreachable!("Unexpected end of input while reading heredoc label");
1157 }
1158
1159 let byte = *input.read_at(pos);
1160 if byte == b'\n' {
1161 length += 1;
1163 return (length, whitespaces, label_length);
1164 } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1165 length += 1;
1168 label_length += 1;
1169 } else if double_quoted && !terminated && byte == b'"' {
1170 length += 1;
1172 terminated = true;
1173 } else {
1174 unreachable!("Unexpected character encountered in heredoc label");
1175 }
1176 }
1177}
1178
1179#[inline]
1180fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1181 let total = input.len();
1182 let base = input.current_offset();
1183
1184 let mut pos = base + 3;
1186 let mut whitespaces = 0;
1187 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1188 whitespaces += 1;
1189 pos += 1;
1190 }
1191
1192 let mut length = 3 + whitespaces + 2;
1195
1196 let mut label_length = 1;
1198 let mut terminated = false;
1199 loop {
1200 let pos = base + length;
1201 if pos >= total {
1202 unreachable!("Unexpected end of input while reading nowdoc label");
1203 }
1204 let byte = *input.read_at(pos);
1205
1206 if byte == b'\n' {
1207 length += 1;
1209 return (length, whitespaces, label_length);
1210 } else if is_part_of_identifier(&byte) && !terminated {
1211 length += 1;
1213 label_length += 1;
1214 } else if !terminated && byte == b'\'' {
1215 length += 1;
1217 terminated = true;
1218 } else {
1219 unreachable!("Unexpected character encountered in nowdoc label");
1220 }
1221 }
1222}
1223
1224#[inline]
1225fn read_literal_string(input: &Input, quote: &u8) -> (TokenKind, usize) {
1226 let total = input.len();
1227 let start = input.current_offset();
1228 let mut length = 1; let mut last_was_backslash = false;
1230 let mut partial = false;
1231
1232 loop {
1233 let pos = start + length;
1234 if pos >= total {
1235 partial = true;
1237 break;
1238 }
1239
1240 let byte = input.read_at(pos);
1241 if *byte == b'\\' {
1242 last_was_backslash = !last_was_backslash;
1244 length += 1;
1245 } else {
1246 if *byte == *quote && !last_was_backslash {
1248 length += 1; break;
1250 }
1251 length += 1;
1252 last_was_backslash = false;
1253 }
1254 }
1255
1256 if partial { (TokenKind::PartialLiteralString, length) } else { (TokenKind::LiteralString, length) }
1257}
1258
1259#[inline]
1260fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> usize {
1261 let total = input.len();
1262 let base = input.current_offset();
1263 let mut offset = from;
1265
1266 loop {
1267 let abs = base + offset;
1268 if abs >= total {
1269 break;
1271 }
1272
1273 if is_part_of_identifier(input.read_at(abs)) {
1275 offset += 1;
1276 continue;
1277 }
1278
1279 if *input.read_at(abs) == b'[' {
1281 offset += 1;
1282 let mut nesting = 0;
1283 loop {
1284 let abs_inner = base + offset;
1285 if abs_inner >= total {
1286 break;
1287 }
1288 let b = input.read_at(abs_inner);
1289 if *b == b']' {
1290 offset += 1;
1291 if nesting == 0 {
1292 break;
1293 } else {
1294 nesting -= 1;
1295 }
1296 } else if *b == b'[' {
1297 offset += 1;
1298 nesting += 1;
1299 } else if b.is_ascii_whitespace() {
1300 break;
1302 } else {
1303 offset += 1;
1304 }
1305 }
1306 break;
1308 }
1309
1310 if base + offset + 2 < total
1312 && *input.read_at(abs) == b'-'
1313 && *input.read_at(base + offset + 1) == b'>'
1314 && is_start_of_identifier(input.read_at(base + offset + 2))
1315 {
1316 offset += 3;
1317 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1319 offset += 1;
1320 }
1321 break;
1322 }
1323
1324 if base + offset + 3 < total
1326 && *input.read_at(abs) == b'?'
1327 && *input.read_at(base + offset + 1) == b'-'
1328 && *input.read_at(base + offset + 2) == b'>'
1329 && is_start_of_identifier(input.read_at(base + offset + 3))
1330 {
1331 offset += 4;
1332 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1333 offset += 1;
1334 }
1335 break;
1336 }
1337
1338 break;
1340 }
1341
1342 offset
1343}
1344
1345#[inline]
1346fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> usize {
1347 let total = input.len();
1348 let base = input.current_offset();
1349 let mut offset = from;
1350 let mut nesting = 0;
1351
1352 loop {
1353 let abs = base + offset;
1354 if abs >= total {
1355 break;
1356 }
1357 match input.read_at(abs) {
1358 b'}' => {
1359 offset += 1;
1360 if nesting == 0 {
1361 break;
1362 } else {
1363 nesting -= 1;
1364 }
1365 }
1366 b'{' => {
1367 offset += 1;
1368 nesting += 1;
1369 }
1370 _ => {
1371 offset += 1;
1372 }
1373 }
1374 }
1375
1376 offset
1377}