1use std::collections::VecDeque;
2use std::fmt::Debug;
3use std::hint::unreachable_unchecked;
4
5use memchr::memchr2;
6use memchr::memmem;
7
8const SIMPLE_TOKEN_TABLE: [Option<TokenKind>; 256] = {
12 let mut table: [Option<TokenKind>; 256] = [None; 256];
13 table[b';' as usize] = Some(TokenKind::Semicolon);
14 table[b',' as usize] = Some(TokenKind::Comma);
15 table[b')' as usize] = Some(TokenKind::RightParenthesis);
16 table[b'[' as usize] = Some(TokenKind::LeftBracket);
17 table[b']' as usize] = Some(TokenKind::RightBracket);
18 table[b'{' as usize] = Some(TokenKind::LeftBrace);
19 table[b'}' as usize] = Some(TokenKind::RightBrace);
20 table[b'~' as usize] = Some(TokenKind::Tilde);
21 table[b'@' as usize] = Some(TokenKind::At);
22 table
23};
24
/// Byte-indexed lookup table answering "may this byte start an identifier?".
///
/// `true` for ASCII letters, `_`, and every byte in `0x80..=0xFF`.
const IDENT_START_TABLE: [bool; 256] = {
    let mut flags = [false; 256];

    // `for` loops are not available in const evaluation, hence the manual counter.
    let mut value = 0usize;
    loop {
        if value == flags.len() {
            break;
        }

        let byte = value as u8;
        flags[value] = byte.is_ascii_alphabetic() || byte == b'_' || byte >= 0x80;
        value += 1;
    }

    flags
};
36
37use mago_database::file::FileId;
38use mago_database::file::HasFileId;
39use mago_span::Position;
40use mago_syntax_core::float_exponent;
41use mago_syntax_core::float_separator;
42use mago_syntax_core::input::Input;
43use mago_syntax_core::number_sign;
44use mago_syntax_core::start_of_binary_number;
45use mago_syntax_core::start_of_float_number;
46use mago_syntax_core::start_of_hexadecimal_number;
47use mago_syntax_core::start_of_identifier;
48use mago_syntax_core::start_of_number;
49use mago_syntax_core::start_of_octal_number;
50use mago_syntax_core::start_of_octal_or_float_number;
51use mago_syntax_core::utils::is_part_of_identifier;
52use mago_syntax_core::utils::is_start_of_identifier;
53use mago_syntax_core::utils::read_digits_of_base;
54
55use crate::error::SyntaxError;
56use crate::lexer::internal::mode::HaltStage;
57use crate::lexer::internal::mode::Interpolation;
58use crate::lexer::internal::mode::LexerMode;
59use crate::lexer::internal::utils::NumberKind;
60use crate::settings::LexerSettings;
61use crate::token::DocumentKind;
62use crate::token::Token;
63use crate::token::TokenKind;
64
65mod internal;
66
67#[derive(Debug)]
79pub struct Lexer<'input> {
80 input: Input<'input>,
81 settings: LexerSettings,
82 mode: LexerMode<'input>,
83 interpolating: bool,
84 buffer: VecDeque<Token<'input>>,
86}
87
88impl<'input> Lexer<'input> {
89 const BUFFER_INITIAL_CAPACITY: usize = 8;
92
93 pub fn new(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
104 Lexer {
105 input,
106 settings,
107 mode: LexerMode::Inline,
108 interpolating: false,
109 buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
110 }
111 }
112
113 pub fn scripting(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
124 Lexer {
125 input,
126 settings,
127 mode: LexerMode::Script,
128 interpolating: false,
129 buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
130 }
131 }
132
133 #[must_use]
137 pub fn has_reached_eof(&self) -> bool {
138 self.input.has_reached_eof()
139 }
140
141 #[inline]
143 pub const fn current_position(&self) -> Position {
144 self.input.current_position()
145 }
146
147 #[inline]
180 pub fn advance(&mut self) -> Option<Result<Token<'input>, SyntaxError>> {
181 if !self.interpolating
183 && let Some(token) = self.buffer.pop_front()
184 {
185 return Some(Ok(token));
186 }
187
188 if self.input.has_reached_eof() {
189 return None;
190 }
191
192 match self.mode {
193 LexerMode::Inline => {
194 let start = self.input.current_position();
195 let offset = self.input.current_offset();
196
197 if offset == 0
199 && self.input.len() >= 2
200 && unsafe { *self.input.read_at_unchecked(0) } == b'#'
201 && unsafe { *self.input.read_at_unchecked(1) } == b'!'
202 {
203 let buffer = self.input.consume_through(b'\n');
204 let end = self.input.current_position();
205
206 return Some(Ok(self.token(TokenKind::InlineShebang, buffer, start, end)));
207 }
208
209 let bytes = self.input.read_remaining();
211
212 if self.settings.enable_short_tags {
213 if let Some(pos) = memchr::memmem::find(bytes, b"<?") {
214 if pos > 0 {
215 let buffer = self.input.consume(pos);
216 let end = self.input.current_position();
217
218 return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
219 }
220
221 if self.input.is_at(b"<?php", true) {
222 let buffer = self.input.consume(5);
223 self.mode = LexerMode::Script;
224 return Some(Ok(self.token(
225 TokenKind::OpenTag,
226 buffer,
227 start,
228 self.input.current_position(),
229 )));
230 }
231
232 if self.input.is_at(b"<?=", false) {
233 let buffer = self.input.consume(3);
234 self.mode = LexerMode::Script;
235 return Some(Ok(self.token(
236 TokenKind::EchoTag,
237 buffer,
238 start,
239 self.input.current_position(),
240 )));
241 }
242
243 let buffer = self.input.consume(2);
244 self.mode = LexerMode::Script;
245 return Some(Ok(self.token(
246 TokenKind::ShortOpenTag,
247 buffer,
248 start,
249 self.input.current_position(),
250 )));
251 }
252 } else {
253 let iter = memchr::memmem::find_iter(bytes, b"<?");
254
255 for pos in iter {
256 let candidate = unsafe { bytes.get_unchecked(pos..) };
258
259 if candidate.len() >= 5
260 && (unsafe { *candidate.get_unchecked(2) } | 0x20) == b'p'
261 && (unsafe { *candidate.get_unchecked(3) } | 0x20) == b'h'
262 && (unsafe { *candidate.get_unchecked(4) } | 0x20) == b'p'
263 {
264 if pos > 0 {
265 let buffer = self.input.consume(pos);
266 let end = self.input.current_position();
267 return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
268 }
269
270 let buffer = self.input.consume(5);
271 self.mode = LexerMode::Script;
272 return Some(Ok(self.token(
273 TokenKind::OpenTag,
274 buffer,
275 start,
276 self.input.current_position(),
277 )));
278 }
279
280 if candidate.len() >= 3 && unsafe { *candidate.get_unchecked(2) } == b'=' {
281 if pos > 0 {
282 let buffer = self.input.consume(pos);
283 let end = self.input.current_position();
284 return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
285 }
286
287 let buffer = self.input.consume(3);
288 self.mode = LexerMode::Script;
289 return Some(Ok(self.token(
290 TokenKind::EchoTag,
291 buffer,
292 start,
293 self.input.current_position(),
294 )));
295 }
296 }
297 }
298
299 if self.input.has_reached_eof() {
300 return None;
301 }
302
303 let buffer = self.input.consume_remaining();
304 let end = self.input.current_position();
305 Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)))
306 }
307 LexerMode::Script => {
308 let start = self.input.current_position();
309 let whitespaces = self.input.consume_whitespaces();
310 if !whitespaces.is_empty() {
311 return Some(Ok(self.token(
312 TokenKind::Whitespace,
313 whitespaces,
314 start,
315 self.input.current_position(),
316 )));
317 }
318
319 let first_byte = match self.input.read(1).first() {
320 Some(&b) => b,
321 None => {
322 unsafe { unreachable_unchecked() }
324 }
325 };
326
327 if let Some(kind) = SIMPLE_TOKEN_TABLE[first_byte as usize] {
328 let buffer = self.input.consume(1);
329 let end = self.input.current_position();
330 return Some(Ok(self.token(kind, buffer, start, end)));
331 }
332
333 if IDENT_START_TABLE[first_byte as usize] {
334 let (token_kind, len) = self.scan_identifier_or_keyword_info();
335
336 if token_kind == TokenKind::HaltCompiler {
337 self.mode = LexerMode::Halt(HaltStage::LookingForLeftParenthesis);
338 }
339
340 let buffer = self.input.consume(len);
341 let end = self.input.current_position();
342 return Some(Ok(self.token(token_kind, buffer, start, end)));
343 }
344
345 if first_byte == b'$'
346 && let Some(&next) = self.input.read(2).get(1)
347 && IDENT_START_TABLE[next as usize]
348 {
349 let (ident_len, _) = self.input.scan_identifier(1);
350 let buffer = self.input.consume(1 + ident_len);
351 let end = self.input.current_position();
352 return Some(Ok(self.token(TokenKind::Variable, buffer, start, end)));
353 }
354
355 let mut document_label: &[u8] = &[];
356
357 let (token_kind, len) = match self.input.read(3) {
358 [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
359 [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
360 [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
361 [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
362 [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
363 [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
364 [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
365 [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
366 [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
367 [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
368 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
369
370 document_label = self.input.peek(3 + whitespaces, label_length);
371
372 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
373 }
374 [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
375 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
376
377 document_label = self.input.peek(4 + whitespaces, label_length);
378
379 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
380 }
381 [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
382 let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
383
384 document_label = self.input.peek(4 + whitespaces, label_length);
385
386 (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
387 }
388 [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
389 [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
390 [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
391 [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
392 [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
393 [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
394 [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
395 [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
396 [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
397 [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
398 [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
399 [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
400 [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
401 [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
402 [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
403 [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
404 [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
405 [b'>', b'>', ..] => (TokenKind::RightShift, 2),
406 [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
407 [b':', b':', ..] => (TokenKind::ColonColon, 2),
408 [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
409 [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
410 [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
411 [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
412 [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
413 [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
414 [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
415 [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
416 [b'/', b'/', ..] => {
417 let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
418 let comment_len = scan_single_line_comment(remaining);
419 (TokenKind::SingleLineComment, 2 + comment_len)
420 }
421 [b'/', b'*', asterisk] => {
422 let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
423 match scan_multi_line_comment(remaining) {
424 Some(len) => {
425 let is_docblock = asterisk == &b'*' && len > 2;
426 if is_docblock {
427 (TokenKind::DocBlockComment, len + 2)
428 } else {
429 (TokenKind::MultiLineComment, len + 2)
430 }
431 }
432 None => {
433 self.input.consume(remaining.len() + 2);
434 return Some(Err(SyntaxError::UnexpectedEndOfFile(
435 self.file_id(),
436 self.input.current_position(),
437 )));
438 }
439 }
440 }
441 [b'\\', start_of_identifier!(), ..] => {
442 let mut length = 1;
443 loop {
444 let (ident_len, ends_with_ns) = self.input.scan_identifier(length);
445 length += ident_len;
446 if ends_with_ns {
447 length += 1; } else {
449 break;
450 }
451 }
452
453 (TokenKind::FullyQualifiedIdentifier, length)
454 }
455 [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
456 [b'$', ..] => (TokenKind::Dollar, 1),
457 [b'!', ..] => (TokenKind::Bang, 1),
458 [b'&', ..] => (TokenKind::Ampersand, 1),
459 [b'?', ..] => (TokenKind::Question, 1),
460 [b'=', ..] => (TokenKind::Equal, 1),
461 [b'`', ..] => (TokenKind::Backtick, 1),
462 [b'+', ..] => (TokenKind::Plus, 1),
463 [b'%', ..] => (TokenKind::Percent, 1),
464 [b'-', ..] => (TokenKind::Minus, 1),
465 [b'<', ..] => (TokenKind::LessThan, 1),
466 [b'>', ..] => (TokenKind::GreaterThan, 1),
467 [b':', ..] => (TokenKind::Colon, 1),
468 [b'|', ..] => (TokenKind::Pipe, 1),
469 [b'^', ..] => (TokenKind::Caret, 1),
470 [b'*', ..] => (TokenKind::Asterisk, 1),
471 [b'/', ..] => (TokenKind::Slash, 1),
472 [quote @ b'\'', ..] => read_literal_string(&self.input, *quote),
473 [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
474 read_literal_string(&self.input, *quote)
475 }
476 [b'"', ..] => (TokenKind::DoubleQuote, 1),
477 [b'(', ..] => 'parenthesis: {
478 let mut peek_offset = 1;
479 while let Some(&b) = self.input.read(peek_offset + 1).get(peek_offset) {
480 if b.is_ascii_whitespace() {
481 peek_offset += 1;
482 } else {
483 let lower = b | 0x20; if !matches!(lower, b'i' | b'b' | b'f' | b'd' | b'r' | b's' | b'a' | b'o' | b'u' | b'v')
486 {
487 break 'parenthesis (TokenKind::LeftParenthesis, 1);
488 }
489 break;
490 }
491 }
492
493 for (value, kind) in internal::consts::CAST_TYPES {
494 if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
495 break 'parenthesis (kind, length);
496 }
497 }
498
499 (TokenKind::LeftParenthesis, 1)
500 }
501 [b'#', ..] => {
502 let remaining = self.input.peek(1, self.input.len() - self.input.current_offset());
503 let comment_len = scan_single_line_comment(remaining);
504 (TokenKind::HashComment, 1 + comment_len)
505 }
506 [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
507 [b'.', start_of_number!(), ..] => {
508 let mut length = read_digits_of_base(&self.input, 2, 10);
509 if let float_exponent!() = self.input.peek(length, 1) {
510 let mut exp_length = length + 1;
511 if let number_sign!() = self.input.peek(exp_length, 1) {
512 exp_length += 1;
513 }
514
515 let after_exp = read_digits_of_base(&self.input, exp_length, 10);
516 if after_exp > exp_length {
517 length = after_exp;
518 }
519 }
520
521 (TokenKind::LiteralFloat, length)
522 }
523 [start_of_number!(), ..] => 'number: {
524 let mut length = 1;
525
526 let (base, kind): (u8, NumberKind) = match self.input.read(3) {
527 start_of_binary_number!() => {
528 length += 1;
529
530 (2, NumberKind::Integer)
531 }
532 start_of_octal_number!() => {
533 length += 1;
534
535 (8, NumberKind::Integer)
536 }
537 start_of_hexadecimal_number!() => {
538 length += 1;
539
540 (16, NumberKind::Integer)
541 }
542 start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
543 start_of_float_number!() => (10, NumberKind::Float),
544 _ => (10, NumberKind::IntegerOrFloat),
545 };
546
547 if kind != NumberKind::Float {
548 length = read_digits_of_base(&self.input, length, base);
549
550 if kind == NumberKind::Integer {
551 break 'number (TokenKind::LiteralInteger, length);
552 }
553 }
554
555 let is_float = matches!(self.input.peek(length, 3), float_separator!());
556
557 if !is_float {
558 break 'number (TokenKind::LiteralInteger, length);
559 }
560
561 if let [b'.'] = self.input.peek(length, 1) {
562 length += 1;
563 length = read_digits_of_base(&self.input, length, 10);
564 }
565
566 if let float_exponent!() = self.input.peek(length, 1) {
567 let mut exp_length = length + 1;
569 if let number_sign!() = self.input.peek(exp_length, 1) {
570 exp_length += 1;
571 }
572 let after_exp = read_digits_of_base(&self.input, exp_length, 10);
573 if after_exp > exp_length {
574 length = after_exp;
576 }
577 }
578
579 (TokenKind::LiteralFloat, length)
580 }
581 [b'.', ..] => (TokenKind::Dot, 1),
582 [unknown_byte, ..] => {
583 let position = self.input.current_position();
584 self.input.consume(1);
585
586 return Some(Err(SyntaxError::UnrecognizedToken(self.file_id(), *unknown_byte, position)));
587 }
588 [] => {
589 unreachable!()
592 }
593 };
594
595 self.mode = match token_kind {
596 TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
597 TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
598 TokenKind::CloseTag => LexerMode::Inline,
599 TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
600 TokenKind::DocumentStart(document_kind) => {
601 LexerMode::DocumentString(document_kind, document_label, Interpolation::None)
602 }
603 _ => LexerMode::Script,
604 };
605
606 let buffer = self.input.consume(len);
607 let end = self.input.current_position();
608
609 Some(Ok(self.token(token_kind, buffer, start, end)))
610 }
611 LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
612 Interpolation::None => {
613 let start = self.input.current_position();
614
615 let mut length = 0;
616 let mut last_was_slash = false;
617 let mut token_kind = TokenKind::StringPart;
618 loop {
619 match self.input.peek(length, 2) {
620 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
621 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
622
623 self.mode =
624 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
625
626 break;
627 }
628 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
629 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
630
631 self.mode =
632 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
633
634 break;
635 }
636 [b'\\', ..] => {
637 length += 1;
638
639 last_was_slash = !last_was_slash;
640 }
641 [b'"', ..] if !last_was_slash => {
642 if length == 0 {
643 length += 1;
644 token_kind = TokenKind::DoubleQuote;
645
646 break;
647 }
648
649 break;
650 }
651 [_, ..] => {
652 length += 1;
653 last_was_slash = false;
654 }
655 [] => {
656 break;
657 }
658 }
659 }
660
661 let buffer = self.input.consume(length);
662 let end = self.input.current_position();
663
664 if TokenKind::DoubleQuote == token_kind {
665 self.mode = LexerMode::Script;
666 }
667
668 Some(Ok(self.token(token_kind, buffer, start, end)))
669 }
670 Interpolation::Until(offset) => {
671 self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None))
672 }
673 },
674 LexerMode::ShellExecuteString(interpolation) => match &interpolation {
675 Interpolation::None => {
676 let start = self.input.current_position();
677
678 let mut length = 0;
679 let mut last_was_slash = false;
680 let mut token_kind = TokenKind::StringPart;
681 loop {
682 match self.input.peek(length, 2) {
683 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
684 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
685
686 self.mode =
687 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
688
689 break;
690 }
691 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
692 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
693
694 self.mode =
695 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
696
697 break;
698 }
699 [b'\\', ..] => {
700 length += 1;
701 last_was_slash = true;
702 }
703 [b'`', ..] if !last_was_slash => {
704 if length == 0 {
705 length += 1;
706 token_kind = TokenKind::Backtick;
707
708 break;
709 }
710
711 break;
712 }
713 [_, ..] => {
714 length += 1;
715 last_was_slash = false;
716 }
717 [] => {
718 break;
719 }
720 }
721 }
722
723 let buffer = self.input.consume(length);
724 let end = self.input.current_position();
725
726 if TokenKind::Backtick == token_kind {
727 self.mode = LexerMode::Script;
728 }
729
730 Some(Ok(self.token(token_kind, buffer, start, end)))
731 }
732 Interpolation::Until(offset) => {
733 self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None))
734 }
735 },
736 LexerMode::DocumentString(kind, label, interpolation) => match &kind {
737 DocumentKind::Heredoc => match &interpolation {
738 Interpolation::None => {
739 let start = self.input.current_position();
740
741 let mut length = 0;
742 let mut last_was_slash = false;
743 let mut only_whitespaces = true;
744 let mut token_kind = TokenKind::StringPart;
745 loop {
746 match self.input.peek(length, 2) {
747 [b'\r', b'\n'] => {
748 length += 2;
749
750 break;
751 }
752 [b'\n' | b'\r', ..] => {
753 length += 1;
754
755 break;
756 }
757 [byte, ..] if byte.is_ascii_whitespace() => {
758 length += 1;
759 }
760 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
761 let until_offset =
762 read_until_end_of_variable_interpolation(&self.input, length + 2);
763
764 self.mode = LexerMode::DocumentString(
765 kind,
766 label,
767 Interpolation::Until(start.offset + until_offset),
768 );
769
770 break;
771 }
772 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
773 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
774
775 self.mode = LexerMode::DocumentString(
776 kind,
777 label,
778 Interpolation::Until(start.offset + until_offset),
779 );
780
781 break;
782 }
783 [b'\\', ..] => {
784 length += 1;
785 last_was_slash = true;
786 only_whitespaces = false;
787 }
788 [_, ..] => {
789 if only_whitespaces
790 && self.input.peek(length, label.len()) == label
791 && self
792 .input
793 .peek(length + label.len(), 1)
794 .first()
795 .is_none_or(|c| !c.is_ascii_alphanumeric())
796 {
797 length += label.len();
798 token_kind = TokenKind::DocumentEnd;
799
800 break;
801 }
802
803 length += 1;
804 last_was_slash = false;
805 only_whitespaces = false;
806 }
807 [] => {
808 break;
809 }
810 }
811 }
812
813 let buffer = self.input.consume(length);
814 let end = self.input.current_position();
815
816 if TokenKind::DocumentEnd == token_kind {
817 self.mode = LexerMode::Script;
818 }
819
820 Some(Ok(self.token(token_kind, buffer, start, end)))
821 }
822 Interpolation::Until(offset) => {
823 self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None))
824 }
825 },
826 DocumentKind::Nowdoc => {
827 let start = self.input.current_position();
828
829 let mut length = 0;
830 let mut terminated = false;
831 let mut only_whitespaces = true;
832
833 loop {
834 match self.input.peek(length, 2) {
835 [b'\r', b'\n'] => {
836 length += 2;
837
838 break;
839 }
840 [b'\n' | b'\r', ..] => {
841 length += 1;
842
843 break;
844 }
845 [byte, ..] if byte.is_ascii_whitespace() => {
846 length += 1;
847 }
848 [_, ..] => {
849 if only_whitespaces
850 && self.input.peek(length, label.len()) == label
851 && self
852 .input
853 .peek(length + label.len(), 1)
854 .first()
855 .is_none_or(|c| !c.is_ascii_alphanumeric())
856 {
857 length += label.len();
858 terminated = true;
859
860 break;
861 }
862
863 only_whitespaces = false;
864 length += 1;
865 }
866 [] => {
867 break;
868 }
869 }
870 }
871
872 let buffer = self.input.consume(length);
873 let end = self.input.current_position();
874
875 if terminated {
876 self.mode = LexerMode::Script;
877
878 return Some(Ok(self.token(TokenKind::DocumentEnd, buffer, start, end)));
879 }
880
881 Some(Ok(self.token(TokenKind::StringPart, buffer, start, end)))
882 }
883 },
884 LexerMode::Halt(stage) => 'halt: {
885 let start = self.input.current_position();
886 if let HaltStage::End = stage {
887 let buffer = self.input.consume_remaining();
888 let end = self.input.current_position();
889
890 break 'halt Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
891 }
892
893 let whitespaces = self.input.consume_whitespaces();
894 if !whitespaces.is_empty() {
895 let end = self.input.current_position();
896
897 break 'halt Some(Ok(self.token(TokenKind::Whitespace, whitespaces, start, end)));
898 }
899
900 match &stage {
901 HaltStage::LookingForLeftParenthesis => {
902 if self.input.is_at(b"(", false) {
903 let buffer = self.input.consume(1);
904 let end = self.input.current_position();
905
906 self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
907
908 Some(Ok(self.token(TokenKind::LeftParenthesis, buffer, start, end)))
909 } else {
910 let byte = self.input.read(1)[0];
911 let position = self.input.current_position();
912 self.input.consume(1);
914 Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
915 }
916 }
917 HaltStage::LookingForRightParenthesis => {
918 if self.input.is_at(b")", false) {
919 let buffer = self.input.consume(1);
920 let end = self.input.current_position();
921
922 self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
923
924 Some(Ok(self.token(TokenKind::RightParenthesis, buffer, start, end)))
925 } else {
926 let byte = self.input.read(1)[0];
927 let position = self.input.current_position();
928 self.input.consume(1);
929 Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
930 }
931 }
932 HaltStage::LookingForTerminator => {
933 if self.input.is_at(b";", false) {
934 let buffer = self.input.consume(1);
935 let end = self.input.current_position();
936
937 self.mode = LexerMode::Halt(HaltStage::End);
938
939 Some(Ok(self.token(TokenKind::Semicolon, buffer, start, end)))
940 } else if self.input.is_at(b"?>", false) {
941 let buffer = self.input.consume(2);
942 let end = self.input.current_position();
943
944 self.mode = LexerMode::Halt(HaltStage::End);
945
946 Some(Ok(self.token(TokenKind::CloseTag, buffer, start, end)))
947 } else {
948 let byte = self.input.read(1)[0];
949 let position = self.input.current_position();
950 self.input.consume(1);
951 Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
952 }
953 }
954 _ => unreachable!(),
955 }
956 }
957 }
958 }
959
960 #[inline]
964 fn scan_identifier_or_keyword_info(&self) -> (TokenKind, usize) {
965 let (mut length, ended_with_slash) = self.input.scan_identifier(0);
966
967 if !ended_with_slash {
968 match length {
969 6 => {
970 if self.input.is_at(b"public(set)", true) {
971 return (TokenKind::PublicSet, 11);
972 }
973 }
974 7 => {
975 if self.input.is_at(b"private(set)", true) {
976 return (TokenKind::PrivateSet, 12);
977 }
978 }
979 9 => {
980 if self.input.is_at(b"protected(set)", true) {
981 return (TokenKind::ProtectedSet, 14);
982 }
983 }
984 _ => {}
985 }
986 }
987
988 if !ended_with_slash && let Some(kind) = internal::keyword::lookup_keyword(self.input.read(length)) {
989 return (kind, length);
990 }
991
992 let mut slashes = 0;
993 let mut last_was_slash = false;
994 loop {
995 match self.input.peek(length, 1) {
996 [b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF] if last_was_slash => {
997 length += 1;
998 last_was_slash = false;
999 }
1000 [b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | 0x80..=0xFF] if !last_was_slash => {
1001 length += 1;
1002 }
1003 [b'\\'] if !self.interpolating => {
1004 if last_was_slash {
1005 length -= 1;
1006 slashes -= 1;
1007 last_was_slash = false;
1008 break;
1009 }
1010
1011 length += 1;
1012 slashes += 1;
1013 last_was_slash = true;
1014 }
1015 _ => {
1016 break;
1017 }
1018 }
1019 }
1020
1021 if last_was_slash {
1022 length -= 1;
1023 slashes -= 1;
1024 }
1025
1026 let kind = if slashes > 0 { TokenKind::QualifiedIdentifier } else { TokenKind::Identifier };
1027
1028 (kind, length)
1029 }
1030
1031 #[inline]
1032 fn token(&self, kind: TokenKind, v: &'input [u8], start: Position, _end: Position) -> Token<'input> {
1033 let value = unsafe { std::str::from_utf8_unchecked(v) };
1038
1039 Token { kind, start, value }
1040 }
1041
1042 #[inline]
1043 fn interpolation(
1044 &mut self,
1045 end_offset: u32,
1046 post_interpolation_mode: LexerMode<'input>,
1047 ) -> Option<Result<Token<'input>, SyntaxError>> {
1048 self.mode = LexerMode::Script;
1049
1050 let was_interpolating = self.interpolating;
1051 self.interpolating = true;
1052
1053 loop {
1054 let subsequent_token = self.advance()?.ok()?;
1055 let token_start = subsequent_token.start.offset;
1057 let token_end = token_start + subsequent_token.value.len() as u32;
1058 let is_final_token = token_start <= end_offset && end_offset <= token_end;
1059
1060 self.buffer.push_back(subsequent_token);
1061
1062 if is_final_token {
1063 break;
1064 }
1065 }
1066
1067 self.mode = post_interpolation_mode;
1068 self.interpolating = was_interpolating;
1069
1070 self.advance()
1071 }
1072}
1073
1074impl HasFileId for Lexer<'_> {
1075 #[inline]
1076 fn file_id(&self) -> FileId {
1077 self.input.file_id()
1078 }
1079}
1080
1081#[inline]
1082fn matches_start_of_heredoc_document(input: &Input) -> bool {
1083 let total = input.len();
1084 let base = input.current_offset();
1085
1086 let mut length = 3;
1088 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1090 length += 1;
1091 }
1092
1093 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1095 return false;
1096 }
1097 length += 1; loop {
1101 let pos = base + length;
1102 if pos >= total {
1103 return false; }
1105
1106 let byte = *input.read_at(pos);
1107 if byte == b'\n' {
1108 return true; } else if byte == b'\r' {
1110 return pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1112 } else if is_part_of_identifier(input.read_at(pos)) {
1113 length += 1;
1114 } else {
1115 return false; }
1117 }
1118}
1119
1120#[inline]
1121fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1122 let total = input.len();
1123 let base = input.current_offset();
1124
1125 let mut length = 3;
1127 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1128 length += 1;
1129 }
1130
1131 if base + length >= total || *input.read_at(base + length) != b'"' {
1133 return false;
1134 }
1135 length += 1;
1136
1137 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1139 return false;
1140 }
1141 length += 1;
1142
1143 let mut terminated = false;
1145 loop {
1146 let pos = base + length;
1147 if pos >= total {
1148 return false;
1149 }
1150 let byte = input.read_at(pos);
1151 if *byte == b'\n' {
1152 return terminated;
1154 } else if *byte == b'\r' {
1155 return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1157 } else if !terminated && is_part_of_identifier(byte) {
1158 length += 1;
1159 } else if !terminated && *byte == b'"' {
1160 terminated = true;
1161 length += 1;
1162 } else {
1163 return false;
1164 }
1165 }
1166}
1167
1168#[inline]
1169fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1170 let total = input.len();
1171 let base = input.current_offset();
1172
1173 let mut length = 3;
1175 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1176 length += 1;
1177 }
1178
1179 if base + length >= total || *input.read_at(base + length) != b'\'' {
1181 return false;
1182 }
1183 length += 1;
1184
1185 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1187 return false;
1188 }
1189 length += 1;
1190
1191 let mut terminated = false;
1193 loop {
1194 let pos = base + length;
1195 if pos >= total {
1196 return false;
1197 }
1198 let byte = *input.read_at(pos);
1199 if byte == b'\n' {
1200 return terminated;
1201 } else if byte == b'\r' {
1202 return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1203 } else if !terminated && is_part_of_identifier(&byte) {
1204 length += 1;
1205 } else if !terminated && byte == b'\'' {
1206 terminated = true;
1207 length += 1;
1208 } else {
1209 return false;
1210 }
1211 }
1212}
1213
1214#[inline]
1215fn matches_literal_double_quote_string(input: &Input) -> bool {
1216 let total = input.len();
1217 let base = input.current_offset();
1218
1219 let mut pos = base + 1;
1221 loop {
1222 if pos >= total {
1223 return true;
1225 }
1226 let byte = *input.read_at(pos);
1227 if byte == b'"' {
1228 return true;
1230 } else if byte == b'\\' {
1231 pos += 2;
1233 continue;
1234 }
1235
1236 if pos + 1 < total {
1239 let next = *input.read_at(pos + 1);
1240 if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1241 return false;
1242 }
1243 }
1244 pos += 1;
1245 }
1246}
1247
1248#[inline]
1249fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1250 let total = input.len();
1251 let base = input.current_offset();
1252
1253 let mut pos = base + 3;
1255 let mut whitespaces = 0;
1256 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1257 whitespaces += 1;
1258 pos += 1;
1259 }
1260
1261 let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1266
1267 let mut label_length = 1; let mut terminated = false; loop {
1270 let pos = base + length;
1271 if pos >= total {
1273 unreachable!("Unexpected end of input while reading heredoc label");
1274 }
1275
1276 let byte = *input.read_at(pos);
1277 if byte == b'\n' {
1278 length += 1;
1280 return (length, whitespaces, label_length);
1281 } else if byte == b'\r' {
1282 if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1284 length += 2;
1285 } else {
1286 length += 1;
1287 }
1288 return (length, whitespaces, label_length);
1289 } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1290 length += 1;
1293 label_length += 1;
1294 } else if double_quoted && !terminated && byte == b'"' {
1295 length += 1;
1297 terminated = true;
1298 } else {
1299 unreachable!("Unexpected character encountered in heredoc label");
1300 }
1301 }
1302}
1303
1304#[inline]
1305fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1306 let total = input.len();
1307 let base = input.current_offset();
1308
1309 let mut pos = base + 3;
1310 let mut whitespaces = 0;
1311 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1312 whitespaces += 1;
1313 pos += 1;
1314 }
1315
1316 let mut length = 3 + whitespaces + 2;
1318
1319 let mut label_length = 1;
1320 let mut terminated = false;
1321 loop {
1322 let pos = base + length;
1323 if pos >= total {
1324 unreachable!("Unexpected end of input while reading nowdoc label");
1325 }
1326 let byte = *input.read_at(pos);
1327
1328 if byte == b'\n' {
1329 length += 1;
1331 return (length, whitespaces, label_length);
1332 } else if byte == b'\r' {
1333 if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1335 length += 2;
1336 } else {
1337 length += 1;
1338 }
1339 return (length, whitespaces, label_length);
1340 } else if is_part_of_identifier(&byte) && !terminated {
1341 length += 1;
1343 label_length += 1;
1344 } else if !terminated && byte == b'\'' {
1345 length += 1;
1347 terminated = true;
1348 } else {
1349 unreachable!("Unexpected character encountered in nowdoc label");
1350 }
1351 }
1352}
1353
1354#[inline]
1355fn read_literal_string(input: &Input, quote: u8) -> (TokenKind, usize) {
1356 let total = input.len();
1357 let start = input.current_offset();
1358 let mut length = 1; let bytes = input.peek(length, total - start - length);
1361 loop {
1362 match memchr2(quote, b'\\', &bytes[length - 1..]) {
1363 Some(pos) => {
1364 let abs_pos = length - 1 + pos;
1365 let byte = bytes[abs_pos];
1366
1367 if byte == b'\\' {
1368 length = abs_pos + 2 + 1; if length > total - start {
1370 return (TokenKind::PartialLiteralString, total - start);
1371 }
1372 } else {
1373 length = abs_pos + 2; return (TokenKind::LiteralString, length);
1375 }
1376 }
1377 None => {
1378 return (TokenKind::PartialLiteralString, total - start);
1380 }
1381 }
1382 }
1383}
1384
1385#[inline]
1386fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> u32 {
1387 let total = input.len();
1388 let base = input.current_offset();
1389 let mut offset = from;
1391
1392 loop {
1393 let abs = base + offset;
1394 if abs >= total {
1395 break;
1397 }
1398
1399 if is_part_of_identifier(input.read_at(abs)) {
1401 offset += 1;
1402 continue;
1403 }
1404
1405 if *input.read_at(abs) == b'[' {
1407 offset += 1;
1408 let mut nesting = 0;
1409 loop {
1410 let abs_inner = base + offset;
1411 if abs_inner >= total {
1412 break;
1413 }
1414 let b = input.read_at(abs_inner);
1415 if *b == b']' {
1416 offset += 1;
1417 if nesting == 0 {
1418 break;
1419 }
1420
1421 nesting -= 1;
1422 } else if *b == b'[' {
1423 offset += 1;
1424 nesting += 1;
1425 } else if b.is_ascii_whitespace() {
1426 break;
1428 } else {
1429 offset += 1;
1430 }
1431 }
1432 break;
1434 }
1435
1436 if base + offset + 2 < total
1438 && *input.read_at(abs) == b'-'
1439 && *input.read_at(base + offset + 1) == b'>'
1440 && is_start_of_identifier(input.read_at(base + offset + 2))
1441 {
1442 offset += 3;
1443 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1445 offset += 1;
1446 }
1447 break;
1448 }
1449
1450 if base + offset + 3 < total
1452 && *input.read_at(abs) == b'?'
1453 && *input.read_at(base + offset + 1) == b'-'
1454 && *input.read_at(base + offset + 2) == b'>'
1455 && is_start_of_identifier(input.read_at(base + offset + 3))
1456 {
1457 offset += 4;
1458 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1459 offset += 1;
1460 }
1461 break;
1462 }
1463
1464 break;
1466 }
1467
1468 offset as u32
1469}
1470
1471#[inline]
1472fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> u32 {
1473 let total = input.len();
1474 let base = input.current_offset();
1475 let mut offset = from;
1476 let mut nesting = 0;
1477
1478 loop {
1479 let abs = base + offset;
1480 if abs >= total {
1481 break;
1482 }
1483 match input.read_at(abs) {
1484 b'}' => {
1485 offset += 1;
1486 if nesting == 0 {
1487 break;
1488 }
1489
1490 nesting -= 1;
1491 }
1492 b'{' => {
1493 offset += 1;
1494 nesting += 1;
1495 }
1496 _ => {
1497 offset += 1;
1498 }
1499 }
1500 }
1501
1502 offset as u32
1503}
1504
1505#[inline]
1508fn scan_multi_line_comment(bytes: &[u8]) -> Option<usize> {
1509 memmem::find(bytes, b"*/").map(|pos| pos + 2)
1511}
1512
1513#[inline]
1517fn scan_single_line_comment(bytes: &[u8]) -> usize {
1518 let mut pos = 0;
1519 while pos < bytes.len() {
1520 match memchr::memchr3(b'\n', b'\r', b'?', &bytes[pos..]) {
1521 Some(offset) => {
1522 let found_pos = pos + offset;
1523 match bytes[found_pos] {
1524 b'\n' | b'\r' => return found_pos,
1525 b'?' => {
1526 if found_pos + 1 < bytes.len() && bytes[found_pos + 1] == b'>' {
1528 if found_pos > 0 && bytes[found_pos - 1].is_ascii_whitespace() {
1530 return found_pos - 1;
1531 }
1532 return found_pos;
1533 }
1534 pos = found_pos + 1;
1536 }
1537 _ => unreachable!(),
1538 }
1539 }
1540 None => return bytes.len(),
1541 }
1542 }
1543
1544 bytes.len()
1545}