1use std::collections::VecDeque;
2use std::fmt::Debug;
3
4use bumpalo::Bump;
5use mago_database::file::FileId;
6use mago_database::file::HasFileId;
7use mago_span::Position;
8use mago_span::Span;
9
10use mago_syntax_core::input::Input;
11use mago_syntax_core::utils::is_part_of_identifier;
12use mago_syntax_core::utils::is_start_of_identifier;
13use mago_syntax_core::utils::read_digits_of_base;
14use mago_syntax_core::*;
15
16use crate::error::SyntaxError;
17use crate::lexer::internal::mode::HaltStage;
18use crate::lexer::internal::mode::Interpolation;
19use crate::lexer::internal::mode::LexerMode;
20use crate::lexer::internal::utils::NumberKind;
21use crate::token::DocumentKind;
22use crate::token::Token;
23use crate::token::TokenKind;
24
25mod internal;
26
/// A lexer that turns an [`Input`] into a stream of [`Token`]s.
///
/// Tokens are pulled one at a time through `advance`; token values (and
/// heredoc/nowdoc labels) are allocated in the arena handed to the constructor.
#[derive(Debug)]
pub struct Lexer<'input, 'arena> {
    /// Arena used to allocate token values and document labels.
    arena: &'arena Bump,
    /// The source being lexed, together with the current read position.
    input: Input<'input>,
    /// Current state of the lexer; decides how the upcoming bytes are interpreted.
    mode: LexerMode<'arena>,
    /// `true` while `interpolation` is lexing ahead; during that time `advance`
    /// does not hand out tokens from `buffer`.
    interpolating: bool,
    /// Tokens lexed ahead by `interpolation`, handed out by `advance` in FIFO order.
    buffer: VecDeque<Token<'arena>>,
}
46
47impl<'input, 'arena> Lexer<'input, 'arena> {
48 pub fn new(arena: &'arena Bump, input: Input<'input>) -> Lexer<'input, 'arena> {
59 Lexer { arena, input, mode: LexerMode::Inline, interpolating: false, buffer: VecDeque::new() }
60 }
61
62 pub fn scripting(arena: &'arena Bump, input: Input<'input>) -> Lexer<'input, 'arena> {
73 Lexer { arena, input, mode: LexerMode::Script, interpolating: false, buffer: VecDeque::new() }
74 }
75
    /// Returns `true` when the underlying input has been fully consumed.
    ///
    /// This only reflects the state of the input; it does not look at tokens that
    /// may still be waiting in the internal buffer.
    pub fn has_reached_eof(&self) -> bool {
        self.input.has_reached_eof()
    }
82
    /// Returns the current position of the lexer within the input.
    pub fn get_position(&self) -> Position {
        self.input.current_position()
    }
87
88 #[inline]
121 pub fn advance(&mut self) -> Option<Result<Token<'arena>, SyntaxError>> {
122 if !self.interpolating
123 && let Some(token) = self.buffer.pop_front()
124 {
125 return Some(Ok(token));
126 }
127
128 if self.input.has_reached_eof() {
129 return None;
130 }
131
132 match self.mode {
133 LexerMode::Inline => {
134 let start = self.input.current_position();
135 if self.input.is_at(b"<?", false) {
136 let (kind, buffer) = if self.input.is_at(b"<?php", true) {
137 (TokenKind::OpenTag, self.input.consume(5))
138 } else if self.input.is_at(b"<?=", false) {
139 (TokenKind::EchoTag, self.input.consume(3))
140 } else {
141 (TokenKind::ShortOpenTag, self.input.consume(2))
142 };
143
144 let end = self.input.current_position();
145 let tag = self.token(kind, buffer, start, end);
146
147 self.mode = LexerMode::Script;
148
149 return tag;
150 }
151
152 if self.input.is_at(b"#!", true) {
153 let buffer = self.input.consume_through(b'\n');
154 let end = self.input.current_position();
155
156 self.token(TokenKind::InlineShebang, buffer, start, end)
157 } else {
158 let buffer = self.input.consume_until(b"<?", false);
159 let end = self.input.current_position();
160
161 self.token(TokenKind::InlineText, buffer, start, end)
162 }
163 }
164 LexerMode::Script => {
165 let whitespaces = self.input.consume_whitespaces();
166 if !whitespaces.is_empty() {
167 let start = self.input.current_position();
168 let buffer = whitespaces;
169 let end = self.input.current_position();
170
171 return self.token(TokenKind::Whitespace, buffer, start, end);
172 }
173
174 let mut document_label: &[u8] = &[];
175
176 let (token_kind, len) = match self.input.read(3) {
177 [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
178 [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
179 [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
180 [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
181 [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
182 [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
183 [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
184 [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
185 [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
186 [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
187 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
188
189 document_label = self.input.peek(3 + whitespaces, label_length);
190
191 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
192 }
193 [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
194 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
195
196 document_label = self.input.peek(4 + whitespaces, label_length);
197
198 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
199 }
200 [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
201 let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
202
203 document_label = self.input.peek(4 + whitespaces, label_length);
204
205 (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
206 }
207 [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
208 [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
209 [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
210 [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
211 [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
212 [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
213 [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
214 [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
215 [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
216 [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
217 [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
218 [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
219 [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
220 [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
221 [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
222 [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
223 [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
224 [b'>', b'>', ..] => (TokenKind::RightShift, 2),
225 [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
226 [b':', b':', ..] => (TokenKind::ColonColon, 2),
227 [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
228 [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
229 [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
230 [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
231 [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
232 [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
233 [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
234 [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
235 [b'/', b'/', ..] => {
236 let mut length = 2;
237 loop {
238 match self.input.peek(length, 3) {
239 [b'\n' | b'\r', ..] => {
240 break;
241 }
242 [w, b'?', b'>'] if w.is_ascii_whitespace() => {
243 break;
244 }
245 [b'?', b'>', ..] | [] => {
246 break;
247 }
248 [_, ..] => {
249 length += 1;
250 }
251 }
252 }
253
254 (TokenKind::SingleLineComment, length)
255 }
256 [b'/', b'*', asterisk] => {
257 let mut length = 2;
258 let mut is_multiline = false;
259 let mut terminated = false;
260 loop {
261 match self.input.peek(length, 2) {
262 [b'*', b'/'] => {
263 if length == 2 {
264 is_multiline = true;
265 }
266
267 length += 2;
268
269 terminated = true;
270 break;
271 }
272 [_, ..] => {
273 length += 1;
274 }
275 [] => {
276 break;
277 }
278 }
279 }
280
281 if !terminated {
282 self.input.consume(length);
283
284 return Some(Err(SyntaxError::UnexpectedEndOfFile(
285 self.file_id(),
286 self.input.current_position(),
287 )));
288 }
289
290 if !is_multiline && asterisk == &b'*' {
291 (TokenKind::DocBlockComment, length)
292 } else {
293 (TokenKind::MultiLineComment, length)
294 }
295 }
296 [b'\\', start_of_identifier!(), ..] => {
297 let mut length = 2;
298 let mut last_was_slash = false;
299 loop {
300 match self.input.peek(length, 1) {
301 [start_of_identifier!(), ..] if last_was_slash => {
302 length += 1;
303 last_was_slash = false;
304 }
305 [part_of_identifier!(), ..] if !last_was_slash => {
306 length += 1;
307 }
308 [b'\\', ..] => {
309 if last_was_slash {
310 length -= 1;
311
312 break;
313 }
314
315 length += 1;
316 last_was_slash = true;
317 }
318 _ => {
319 break;
320 }
321 }
322 }
323
324 if last_was_slash {
325 length -= 1;
326 }
327
328 (TokenKind::FullyQualifiedIdentifier, length)
329 }
330 [b'$', start_of_identifier!(), ..] => {
331 let mut length = 2;
332 while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
333 length += 1;
334 }
335
336 (TokenKind::Variable, length)
337 }
338 [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
339 [b'$', ..] => (TokenKind::Dollar, 1),
340 [b'@', ..] => (TokenKind::At, 1),
341 [b'!', ..] => (TokenKind::Bang, 1),
342 [b'&', ..] => (TokenKind::Ampersand, 1),
343 [b'?', ..] => (TokenKind::Question, 1),
344 [b'=', ..] => (TokenKind::Equal, 1),
345 [b'`', ..] => (TokenKind::Backtick, 1),
346 [b')', ..] => (TokenKind::RightParenthesis, 1),
347 [b';', ..] => (TokenKind::Semicolon, 1),
348 [b'+', ..] => (TokenKind::Plus, 1),
349 [b'%', ..] => (TokenKind::Percent, 1),
350 [b'-', ..] => (TokenKind::Minus, 1),
351 [b'<', ..] => (TokenKind::LessThan, 1),
352 [b'>', ..] => (TokenKind::GreaterThan, 1),
353 [b',', ..] => (TokenKind::Comma, 1),
354 [b'[', ..] => (TokenKind::LeftBracket, 1),
355 [b']', ..] => (TokenKind::RightBracket, 1),
356 [b'{', ..] => (TokenKind::LeftBrace, 1),
357 [b'}', ..] => (TokenKind::RightBrace, 1),
358 [b':', ..] => (TokenKind::Colon, 1),
359 [b'~', ..] => (TokenKind::Tilde, 1),
360 [b'|', ..] => (TokenKind::Pipe, 1),
361 [b'^', ..] => (TokenKind::Caret, 1),
362 [b'*', ..] => (TokenKind::Asterisk, 1),
363 [b'/', ..] => (TokenKind::Slash, 1),
364 [quote @ b'\'', ..] => read_literal_string(&self.input, quote),
365 [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
366 read_literal_string(&self.input, quote)
367 }
368 [b'"', ..] => (TokenKind::DoubleQuote, 1),
369 [b'(', ..] => 'parenthesis: {
370 for (value, kind) in internal::consts::CAST_TYPES {
371 if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
372 break 'parenthesis (kind, length);
373 }
374 }
375
376 (TokenKind::LeftParenthesis, 1)
377 }
378 [b'#', ..] => {
379 let mut length = 1;
380 loop {
381 match self.input.peek(length, 3) {
382 [b'\n' | b'\r', ..] => {
383 break;
384 }
385 [w, b'?', b'>'] if w.is_ascii_whitespace() => {
386 break;
387 }
388 [b'?', b'>', ..] | [] => {
389 break;
390 }
391 [_, ..] => {
392 length += 1;
393 }
394 }
395 }
396
397 (TokenKind::HashComment, length)
398 }
399 [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
400 [start_of_identifier!(), ..] => 'identifier: {
401 let mut length = 1;
402 let mut ended_with_slash = false;
403 loop {
404 match self.input.peek(length, 2) {
405 [part_of_identifier!(), ..] => {
406 length += 1;
407 }
408 [b'\\', start_of_identifier!(), ..] => {
409 ended_with_slash = true;
410 break;
411 }
412 [b'(', ..] if length == 7 => {
414 if self.input.is_at(b"private(set)", true) {
415 break 'identifier (TokenKind::PrivateSet, 7 + 5);
416 }
417
418 break;
419 }
420 [b'(', ..] if length == 6 => {
422 if self.input.is_at(b"public(set)", true) {
423 break 'identifier (TokenKind::PublicSet, 6 + 5);
424 }
425
426 break;
427 }
428 [b'(', ..] if length == 9 => {
430 if self.input.is_at(b"protected(set)", true) {
431 break 'identifier (TokenKind::ProtectedSet, 9 + 5);
432 }
433
434 break;
435 }
436 _ => {
437 break;
438 }
439 }
440 }
441
442 if !ended_with_slash {
443 for (value, kind) in internal::consts::KEYWORD_TYPES {
444 if value.len() != length {
445 continue;
446 }
447
448 if self.input.is_at(value, true) {
449 break 'identifier (kind, value.len());
450 }
451 }
452 }
453
454 let mut slashes = 0;
455 let mut last_was_slash = false;
456 loop {
457 match self.input.peek(length, 1) {
458 [start_of_identifier!(), ..] if last_was_slash => {
459 length += 1;
460 last_was_slash = false;
461 }
462 [part_of_identifier!(), ..] if !last_was_slash => {
463 length += 1;
464 }
465 [b'\\', ..] if !self.interpolating => {
466 if !last_was_slash {
467 length += 1;
468 slashes += 1;
469 last_was_slash = true;
470 } else {
471 length -= 1;
472 slashes -= 1;
473 last_was_slash = false;
474
475 break;
476 }
477 }
478 _ => {
479 break;
480 }
481 }
482 }
483
484 if last_was_slash {
485 length -= 1;
486 slashes -= 1;
487 }
488
489 if slashes > 0 {
490 (TokenKind::QualifiedIdentifier, length)
491 } else {
492 (TokenKind::Identifier, length)
493 }
494 }
495 [b'.', start_of_number!(), ..] => {
496 let mut length = read_digits_of_base(&self.input, 2, 10);
497 if let float_exponent!() = self.input.peek(length, 1) {
498 length += 1;
499 if let number_sign!() = self.input.peek(length, 1) {
500 length += 1;
501 }
502
503 length = read_digits_of_base(&self.input, length, 10);
504 }
505
506 (TokenKind::LiteralFloat, length)
507 }
508 [start_of_number!(), ..] => 'number: {
509 let mut length = 1;
510
511 let (base, kind): (u8, NumberKind) = match self.input.read(3) {
512 start_of_binary_number!() => {
513 length += 1;
514
515 (2, NumberKind::Integer)
516 }
517 start_of_octal_number!() => {
518 length += 1;
519
520 (8, NumberKind::Integer)
521 }
522 start_of_hexadecimal_number!() => {
523 length += 1;
524
525 (16, NumberKind::Integer)
526 }
527 start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
528 start_of_float_number!() => (10, NumberKind::Float),
529 _ => (10, NumberKind::IntegerOrFloat),
530 };
531
532 if kind != NumberKind::Float {
533 length = read_digits_of_base(&self.input, length, base);
534
535 if kind == NumberKind::Integer {
536 break 'number (TokenKind::LiteralInteger, length);
537 }
538 }
539
540 let is_float = matches!(self.input.peek(length, 3), float_separator!());
541
542 if !is_float {
543 break 'number (TokenKind::LiteralInteger, length);
544 }
545
546 if let [b'.'] = self.input.peek(length, 1) {
547 length += 1;
548 length = read_digits_of_base(&self.input, length, 10);
549 }
550
551 if let float_exponent!() = self.input.peek(length, 1) {
552 length += 1;
553 if let number_sign!() = self.input.peek(length, 1) {
554 length += 1;
555 }
556
557 length = read_digits_of_base(&self.input, length, 10);
558 }
559
560 (TokenKind::LiteralFloat, length)
561 }
562 [b'.', ..] => (TokenKind::Dot, 1),
563 [unknown_byte, ..] => {
564 return Some(Err(SyntaxError::UnrecognizedToken(
565 self.file_id(),
566 *unknown_byte,
567 self.input.current_position(),
568 )));
569 }
570 [] => {
571 unreachable!()
574 }
575 };
576
577 self.mode = match token_kind {
578 TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
579 TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
580 TokenKind::CloseTag => LexerMode::Inline,
581 TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
582 TokenKind::DocumentStart(document_kind) => LexerMode::DocumentString(
583 document_kind,
584 self.arena.alloc_slice_copy(document_label),
585 Interpolation::None,
586 ),
587 _ => LexerMode::Script,
588 };
589
590 let start = self.input.current_position();
591 let buffer = self.input.consume(len);
592 let end = self.input.current_position();
593
594 self.token(token_kind, buffer, start, end)
595 }
596 LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
597 Interpolation::None => {
598 let start = self.input.current_position();
599
600 let mut length = 0;
601 let mut last_was_slash = false;
602 let mut token_kind = TokenKind::StringPart;
603 loop {
604 match self.input.peek(length, 2) {
605 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
606 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
607
608 self.mode =
609 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
610
611 break;
612 }
613 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
614 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
615
616 self.mode =
617 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
618
619 break;
620 }
621 [b'\\', ..] => {
622 length += 1;
623
624 last_was_slash = !last_was_slash;
625 }
626 [b'"', ..] if !last_was_slash => {
627 if length == 0 {
628 length += 1;
629 token_kind = TokenKind::DoubleQuote;
630
631 break;
632 }
633
634 break;
635 }
636 [_, ..] => {
637 length += 1;
638 last_was_slash = false;
639 }
640 [] => {
641 break;
642 }
643 }
644 }
645
646 let buffer = self.input.consume(length);
647 let end = self.input.current_position();
648
649 if TokenKind::DoubleQuote == token_kind {
650 self.mode = LexerMode::Script;
651 }
652
653 self.token(token_kind, buffer, start, end)
654 }
655 Interpolation::Until(offset) => {
656 self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None))
657 }
658 },
659 LexerMode::ShellExecuteString(interpolation) => match &interpolation {
660 Interpolation::None => {
661 let start = self.input.current_position();
662
663 let mut length = 0;
664 let mut last_was_slash = false;
665 let mut token_kind = TokenKind::StringPart;
666 loop {
667 match self.input.peek(length, 2) {
668 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
669 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
670
671 self.mode =
672 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
673
674 break;
675 }
676 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
677 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
678
679 self.mode =
680 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
681
682 break;
683 }
684 [b'\\', ..] => {
685 length += 1;
686 last_was_slash = true;
687 }
688 [b'`', ..] if !last_was_slash => {
689 if length == 0 {
690 length += 1;
691 token_kind = TokenKind::Backtick;
692
693 break;
694 }
695
696 break;
697 }
698 [_, ..] => {
699 length += 1;
700 last_was_slash = false;
701 }
702 [] => {
703 break;
704 }
705 }
706 }
707
708 let buffer = self.input.consume(length);
709 let end = self.input.current_position();
710
711 if TokenKind::Backtick == token_kind {
712 self.mode = LexerMode::Script;
713 }
714
715 self.token(token_kind, buffer, start, end)
716 }
717 Interpolation::Until(offset) => {
718 self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None))
719 }
720 },
721 LexerMode::DocumentString(kind, label, interpolation) => match &kind {
722 DocumentKind::Heredoc => match &interpolation {
723 Interpolation::None => {
724 let start = self.input.current_position();
725
726 let mut length = 0;
727 let mut last_was_slash = false;
728 let mut only_whitespaces = true;
729 let mut token_kind = TokenKind::StringPart;
730 loop {
731 match self.input.peek(length, 2) {
732 [b'\r', b'\n'] => {
733 length += 2;
734
735 break;
736 }
737 [b'\n', ..] | [b'\r', ..] => {
738 length += 1;
739
740 break;
741 }
742 [byte, ..] if byte.is_ascii_whitespace() => {
743 length += 1;
744 }
745 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
746 let until_offset =
747 read_until_end_of_variable_interpolation(&self.input, length + 2);
748
749 self.mode = LexerMode::DocumentString(
750 kind,
751 label,
752 Interpolation::Until(start.offset + until_offset),
753 );
754
755 break;
756 }
757 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
758 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
759
760 self.mode = LexerMode::DocumentString(
761 kind,
762 label,
763 Interpolation::Until(start.offset + until_offset),
764 );
765
766 break;
767 }
768 [b'\\', ..] => {
769 length += 1;
770 last_was_slash = true;
771 only_whitespaces = false;
772 }
773 [_, ..] => {
774 if only_whitespaces
775 && self.input.peek(length, label.len()) == label
776 && self
777 .input
778 .peek(length + label.len(), 1)
779 .first()
780 .is_none_or(|c| !c.is_ascii_alphanumeric())
781 {
782 length += label.len();
783 token_kind = TokenKind::DocumentEnd;
784
785 break;
786 }
787
788 length += 1;
789 last_was_slash = false;
790 only_whitespaces = false;
791 }
792 [] => {
793 break;
794 }
795 }
796 }
797
798 let buffer = self.input.consume(length);
799 let end = self.input.current_position();
800
801 if TokenKind::DocumentEnd == token_kind {
802 self.mode = LexerMode::Script;
803 }
804
805 self.token(token_kind, buffer, start, end)
806 }
807 Interpolation::Until(offset) => {
808 self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None))
809 }
810 },
811 DocumentKind::Nowdoc => {
812 let start = self.input.current_position();
813
814 let mut length = 0;
815 let mut terminated = false;
816 let mut only_whitespaces = true;
817
818 loop {
819 match self.input.peek(length, 2) {
820 [b'\r', b'\n'] => {
821 length += 2;
822
823 break;
824 }
825 [b'\n', ..] | [b'\r', ..] => {
826 length += 1;
827
828 break;
829 }
830 [byte, ..] if byte.is_ascii_whitespace() => {
831 length += 1;
832 }
833 [_, ..] => {
834 if only_whitespaces
835 && self.input.peek(length, label.len()) == label
836 && self
837 .input
838 .peek(length + label.len(), 1)
839 .first()
840 .is_none_or(|c| !c.is_ascii_alphanumeric())
841 {
842 length += label.len();
843 terminated = true;
844
845 break;
846 }
847
848 only_whitespaces = false;
849 length += 1;
850 }
851 [] => {
852 break;
853 }
854 }
855 }
856
857 let buffer = self.input.consume(length);
858 let end = self.input.current_position();
859
860 if terminated {
861 self.mode = LexerMode::Script;
862
863 return self.token(TokenKind::DocumentEnd, buffer, start, end);
864 }
865
866 self.token(TokenKind::StringPart, buffer, start, end)
867 }
868 },
869 LexerMode::Halt(stage) => 'halt: {
870 let start = self.input.current_position();
871 if let HaltStage::End = stage {
872 let buffer = self.input.consume_remaining();
873 let end = self.input.current_position();
874
875 break 'halt self.token(TokenKind::InlineText, buffer, start, end);
876 }
877
878 let whitespaces = self.input.consume_whitespaces();
879 if !whitespaces.is_empty() {
880 let end = self.input.current_position();
881
882 break 'halt self.token(TokenKind::Whitespace, whitespaces, start, end);
883 }
884
885 match &stage {
886 HaltStage::LookingForLeftParenthesis => {
887 if self.input.is_at(b"(", false) {
888 let buffer = self.input.consume(1);
889 let end = self.input.current_position();
890
891 self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
892
893 self.token(TokenKind::LeftParenthesis, buffer, start, end)
894 } else {
895 Some(Err(SyntaxError::UnexpectedToken(
896 self.file_id(),
897 self.input.read(1)[0],
898 self.input.current_position(),
899 )))
900 }
901 }
902 HaltStage::LookingForRightParenthesis => {
903 if self.input.is_at(b")", false) {
904 let buffer = self.input.consume(1);
905 let end = self.input.current_position();
906
907 self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
908
909 self.token(TokenKind::RightParenthesis, buffer, start, end)
910 } else {
911 Some(Err(SyntaxError::UnexpectedToken(
912 self.file_id(),
913 self.input.read(1)[0],
914 self.input.current_position(),
915 )))
916 }
917 }
918 HaltStage::LookingForTerminator => {
919 if self.input.is_at(b";", false) {
920 let buffer = self.input.consume(1);
921 let end = self.input.current_position();
922
923 self.mode = LexerMode::Halt(HaltStage::End);
924
925 self.token(TokenKind::Semicolon, buffer, start, end)
926 } else if self.input.is_at(b"?>", false) {
927 let buffer = self.input.consume(2);
928 let end = self.input.current_position();
929
930 self.mode = LexerMode::Halt(HaltStage::End);
931
932 self.token(TokenKind::CloseTag, buffer, start, end)
933 } else {
934 Some(Err(SyntaxError::UnexpectedToken(
935 self.file_id(),
936 self.input.read(1)[0],
937 self.input.current_position(),
938 )))
939 }
940 }
941 _ => unreachable!(),
942 }
943 }
944 }
945 }
946
947 #[inline]
948 fn token(
949 &mut self,
950 kind: TokenKind,
951 v: &[u8],
952 from: Position,
953 to: Position,
954 ) -> Option<Result<Token<'arena>, SyntaxError>> {
955 let string = String::from_utf8_lossy(v);
956
957 Some(Ok(Token { kind, value: self.arena.alloc_str(&string), span: Span::new(self.file_id(), from, to) }))
958 }
959
960 #[inline]
961 fn interpolation(
962 &mut self,
963 end_offset: u32,
964 post_interpolation_mode: LexerMode<'arena>,
965 ) -> Option<Result<Token<'arena>, SyntaxError>> {
966 self.mode = LexerMode::Script;
967
968 let was_interpolating = self.interpolating;
969 self.interpolating = true;
970
971 loop {
972 let subsequent_token = self.advance()?.ok()?;
973 let is_final_token = subsequent_token.span.has_offset(end_offset);
974
975 self.buffer.push_back(subsequent_token);
976
977 if is_final_token {
978 break;
979 }
980 }
981
982 self.mode = post_interpolation_mode;
983 self.interpolating = was_interpolating;
984
985 self.advance()
986 }
987}
988
/// The lexer reports the file id of the input it is reading from.
impl HasFileId for Lexer<'_, '_> {
    /// Returns the file id of the underlying input.
    #[inline]
    fn file_id(&self) -> FileId {
        self.input.file_id()
    }
}
995
996#[inline]
997fn matches_start_of_heredoc_document(input: &Input) -> bool {
998 let total = input.len();
999 let base = input.current_offset();
1000
1001 let mut length = 3;
1003 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1005 length += 1;
1006 }
1007
1008 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1010 return false;
1011 }
1012 length += 1; loop {
1016 let pos = base + length;
1017 if pos >= total {
1018 return false; }
1020
1021 let byte = *input.read_at(pos);
1022 if byte == b'\n' {
1023 return true; } else if byte == b'\r' {
1025 return pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1027 } else if is_part_of_identifier(input.read_at(pos)) {
1028 length += 1;
1029 } else {
1030 return false; }
1032 }
1033}
1034
1035#[inline]
1036fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1037 let total = input.len();
1038 let base = input.current_offset();
1039
1040 let mut length = 3;
1042 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1043 length += 1;
1044 }
1045
1046 if base + length >= total || *input.read_at(base + length) != b'"' {
1048 return false;
1049 }
1050 length += 1;
1051
1052 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1054 return false;
1055 }
1056 length += 1;
1057
1058 let mut terminated = false;
1060 loop {
1061 let pos = base + length;
1062 if pos >= total {
1063 return false;
1064 }
1065 let byte = input.read_at(pos);
1066 if *byte == b'\n' {
1067 return terminated;
1069 } else if *byte == b'\r' {
1070 return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1072 } else if !terminated && is_part_of_identifier(byte) {
1073 length += 1;
1074 } else if !terminated && *byte == b'"' {
1075 terminated = true;
1076 length += 1;
1077 } else {
1078 return false;
1079 }
1080 }
1081}
1082
1083#[inline]
1084fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1085 let total = input.len();
1086 let base = input.current_offset();
1087
1088 let mut length = 3;
1090 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1091 length += 1;
1092 }
1093
1094 if base + length >= total || *input.read_at(base + length) != b'\'' {
1096 return false;
1097 }
1098 length += 1;
1099
1100 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1102 return false;
1103 }
1104 length += 1;
1105
1106 let mut terminated = false;
1108 loop {
1109 let pos = base + length;
1110 if pos >= total {
1111 return false;
1112 }
1113 let byte = *input.read_at(pos);
1114 if byte == b'\n' {
1115 return terminated;
1116 } else if byte == b'\r' {
1117 return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1118 } else if !terminated && is_part_of_identifier(&byte) {
1119 length += 1;
1120 } else if !terminated && byte == b'\'' {
1121 terminated = true;
1122 length += 1;
1123 } else {
1124 return false;
1125 }
1126 }
1127}
1128
1129#[inline]
1130fn matches_literal_double_quote_string(input: &Input) -> bool {
1131 let total = input.len();
1132 let base = input.current_offset();
1133
1134 let mut pos = base + 1;
1136 loop {
1137 if pos >= total {
1138 return true;
1140 }
1141 let byte = *input.read_at(pos);
1142 if byte == b'"' {
1143 return true;
1145 } else if byte == b'\\' {
1146 pos += 2;
1148 continue;
1149 } else {
1150 if pos + 1 < total {
1153 let next = *input.read_at(pos + 1);
1154 if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1155 return false;
1156 }
1157 }
1158 pos += 1;
1159 }
1160 }
1161}
1162
1163#[inline]
1164fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1165 let total = input.len();
1166 let base = input.current_offset();
1167
1168 let mut pos = base + 3;
1171 let mut whitespaces = 0;
1172 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1173 whitespaces += 1;
1174 pos += 1;
1175 }
1176
1177 let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1183
1184 let mut label_length = 1; let mut terminated = false; loop {
1188 let pos = base + length;
1189 if pos >= total {
1191 unreachable!("Unexpected end of input while reading heredoc label");
1192 }
1193
1194 let byte = *input.read_at(pos);
1195 if byte == b'\n' {
1196 length += 1;
1198 return (length, whitespaces, label_length);
1199 } else if byte == b'\r' {
1200 if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1202 length += 2;
1203 } else {
1204 length += 1;
1205 }
1206 return (length, whitespaces, label_length);
1207 } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1208 length += 1;
1211 label_length += 1;
1212 } else if double_quoted && !terminated && byte == b'"' {
1213 length += 1;
1215 terminated = true;
1216 } else {
1217 unreachable!("Unexpected character encountered in heredoc label");
1218 }
1219 }
1220}
1221
1222#[inline]
1223fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1224 let total = input.len();
1225 let base = input.current_offset();
1226
1227 let mut pos = base + 3;
1229 let mut whitespaces = 0;
1230 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1231 whitespaces += 1;
1232 pos += 1;
1233 }
1234
1235 let mut length = 3 + whitespaces + 2;
1238
1239 let mut label_length = 1;
1241 let mut terminated = false;
1242 loop {
1243 let pos = base + length;
1244 if pos >= total {
1245 unreachable!("Unexpected end of input while reading nowdoc label");
1246 }
1247 let byte = *input.read_at(pos);
1248
1249 if byte == b'\n' {
1250 length += 1;
1252 return (length, whitespaces, label_length);
1253 } else if byte == b'\r' {
1254 if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1256 length += 2;
1257 } else {
1258 length += 1;
1259 }
1260 return (length, whitespaces, label_length);
1261 } else if is_part_of_identifier(&byte) && !terminated {
1262 length += 1;
1264 label_length += 1;
1265 } else if !terminated && byte == b'\'' {
1266 length += 1;
1268 terminated = true;
1269 } else {
1270 unreachable!("Unexpected character encountered in nowdoc label");
1271 }
1272 }
1273}
1274
1275#[inline]
1276fn read_literal_string(input: &Input, quote: &u8) -> (TokenKind, usize) {
1277 let total = input.len();
1278 let start = input.current_offset();
1279 let mut length = 1; let mut last_was_backslash = false;
1281 let mut partial = false;
1282
1283 loop {
1284 let pos = start + length;
1285 if pos >= total {
1286 partial = true;
1288 break;
1289 }
1290
1291 let byte = input.read_at(pos);
1292 if *byte == b'\\' {
1293 last_was_backslash = !last_was_backslash;
1295 length += 1;
1296 } else {
1297 if *byte == *quote && !last_was_backslash {
1299 length += 1; break;
1301 }
1302 length += 1;
1303 last_was_backslash = false;
1304 }
1305 }
1306
1307 if partial { (TokenKind::PartialLiteralString, length) } else { (TokenKind::LiteralString, length) }
1308}
1309
1310#[inline]
1311fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> u32 {
1312 let total = input.len();
1313 let base = input.current_offset();
1314 let mut offset = from;
1316
1317 loop {
1318 let abs = base + offset;
1319 if abs >= total {
1320 break;
1322 }
1323
1324 if is_part_of_identifier(input.read_at(abs)) {
1326 offset += 1;
1327 continue;
1328 }
1329
1330 if *input.read_at(abs) == b'[' {
1332 offset += 1;
1333 let mut nesting = 0;
1334 loop {
1335 let abs_inner = base + offset;
1336 if abs_inner >= total {
1337 break;
1338 }
1339 let b = input.read_at(abs_inner);
1340 if *b == b']' {
1341 offset += 1;
1342 if nesting == 0 {
1343 break;
1344 } else {
1345 nesting -= 1;
1346 }
1347 } else if *b == b'[' {
1348 offset += 1;
1349 nesting += 1;
1350 } else if b.is_ascii_whitespace() {
1351 break;
1353 } else {
1354 offset += 1;
1355 }
1356 }
1357 break;
1359 }
1360
1361 if base + offset + 2 < total
1363 && *input.read_at(abs) == b'-'
1364 && *input.read_at(base + offset + 1) == b'>'
1365 && is_start_of_identifier(input.read_at(base + offset + 2))
1366 {
1367 offset += 3;
1368 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1370 offset += 1;
1371 }
1372 break;
1373 }
1374
1375 if base + offset + 3 < total
1377 && *input.read_at(abs) == b'?'
1378 && *input.read_at(base + offset + 1) == b'-'
1379 && *input.read_at(base + offset + 2) == b'>'
1380 && is_start_of_identifier(input.read_at(base + offset + 3))
1381 {
1382 offset += 4;
1383 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1384 offset += 1;
1385 }
1386 break;
1387 }
1388
1389 break;
1391 }
1392
1393 offset as u32
1394}
1395
1396#[inline]
1397fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> u32 {
1398 let total = input.len();
1399 let base = input.current_offset();
1400 let mut offset = from;
1401 let mut nesting = 0;
1402
1403 loop {
1404 let abs = base + offset;
1405 if abs >= total {
1406 break;
1407 }
1408 match input.read_at(abs) {
1409 b'}' => {
1410 offset += 1;
1411 if nesting == 0 {
1412 break;
1413 } else {
1414 nesting -= 1;
1415 }
1416 }
1417 b'{' => {
1418 offset += 1;
1419 nesting += 1;
1420 }
1421 _ => {
1422 offset += 1;
1423 }
1424 }
1425 }
1426
1427 offset as u32
1428}