1use std::collections::VecDeque;
2use std::fmt::Debug;
3use std::hint::unreachable_unchecked;
4
5use memchr::memchr2;
6use memchr::memmem;
7
8const SIMPLE_TOKEN_TABLE: [Option<TokenKind>; 256] = {
12 let mut table: [Option<TokenKind>; 256] = [None; 256];
13 table[b';' as usize] = Some(TokenKind::Semicolon);
14 table[b',' as usize] = Some(TokenKind::Comma);
15 table[b')' as usize] = Some(TokenKind::RightParenthesis);
16 table[b'[' as usize] = Some(TokenKind::LeftBracket);
17 table[b']' as usize] = Some(TokenKind::RightBracket);
18 table[b'{' as usize] = Some(TokenKind::LeftBrace);
19 table[b'}' as usize] = Some(TokenKind::RightBrace);
20 table[b'~' as usize] = Some(TokenKind::Tilde);
21 table[b'@' as usize] = Some(TokenKind::At);
22 table
23};
24
/// Byte-indexed lookup table telling whether a byte may start an identifier.
///
/// An entry is `true` for ASCII letters, the underscore, and every byte in
/// the range `0x80..=0xFF`; it is `false` for everything else.
const IDENT_START_TABLE: [bool; 256] = {
    let mut lookup = [false; 256];

    // `for` loops are not available in const evaluation, hence the manual index.
    let mut index = 0usize;
    while index < lookup.len() {
        let byte = index as u8;
        lookup[index] = byte.is_ascii_alphabetic() || byte == b'_' || byte >= 0x80;
        index += 1;
    }

    lookup
};
36
37use mago_database::file::FileId;
38use mago_database::file::HasFileId;
39use mago_span::Position;
40use mago_syntax_core::float_exponent;
41use mago_syntax_core::float_separator;
42use mago_syntax_core::input::Input;
43use mago_syntax_core::number_sign;
44use mago_syntax_core::start_of_binary_number;
45use mago_syntax_core::start_of_float_number;
46use mago_syntax_core::start_of_hexadecimal_number;
47use mago_syntax_core::start_of_identifier;
48use mago_syntax_core::start_of_number;
49use mago_syntax_core::start_of_octal_number;
50use mago_syntax_core::start_of_octal_or_float_number;
51use mago_syntax_core::utils::is_part_of_identifier;
52use mago_syntax_core::utils::is_start_of_identifier;
53use mago_syntax_core::utils::read_digits_of_base;
54
55use crate::error::SyntaxError;
56use crate::lexer::internal::mode::HaltStage;
57use crate::lexer::internal::mode::Interpolation;
58use crate::lexer::internal::mode::LexerMode;
59use crate::lexer::internal::utils::NumberKind;
60use crate::settings::LexerSettings;
61use crate::token::DocumentKind;
62use crate::token::Token;
63use crate::token::TokenKind;
64
65mod internal;
66
/// A lexer that turns an [`Input`] into a stream of [`Token`]s.
///
/// Tokens are produced one at a time by [`Lexer::advance`]. How the upcoming
/// bytes are interpreted depends on the current [`LexerMode`], which the lexer
/// updates itself as it encounters mode-switching tokens.
#[derive(Debug)]
pub struct Lexer<'input> {
    /// The input being scanned.
    input: Input<'input>,
    /// Lexer configuration (for example whether short open tags are enabled).
    settings: LexerSettings,
    /// The current scanning mode; selects which branch of `advance` runs.
    mode: LexerMode<'input>,
    /// `true` while `interpolation` is lexing an interpolated expression.
    /// While set, `advance` does not drain `buffer`.
    interpolating: bool,
    /// Tokens lexed ahead of time by `interpolation`; later `advance` calls
    /// hand them out front-to-back before scanning new input.
    buffer: VecDeque<Token<'input>>,
}
87
88impl<'input> Lexer<'input> {
    /// Initial capacity reserved for the look-ahead token buffer by the constructors.
    const BUFFER_INITIAL_CAPACITY: usize = 8;
92
93 pub fn new(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
104 Lexer {
105 input,
106 settings,
107 mode: LexerMode::Inline,
108 interpolating: false,
109 buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
110 }
111 }
112
113 pub fn scripting(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
124 Lexer {
125 input,
126 settings,
127 mode: LexerMode::Script,
128 interpolating: false,
129 buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
130 }
131 }
132
    /// Returns `true` when the underlying input reports that it has reached its end.
    ///
    /// Note that this only reflects the input; tokens may still be waiting in
    /// the lexer's internal look-ahead buffer.
    #[must_use]
    pub fn has_reached_eof(&self) -> bool {
        self.input.has_reached_eof()
    }
140
    /// Returns the current position of the underlying input.
    #[inline]
    pub const fn current_position(&self) -> Position {
        self.input.current_position()
    }
146
147 #[inline]
180 pub fn advance(&mut self) -> Option<Result<Token<'input>, SyntaxError>> {
181 if !self.interpolating
183 && let Some(token) = self.buffer.pop_front()
184 {
185 return Some(Ok(token));
186 }
187
188 if self.input.has_reached_eof() {
189 return None;
190 }
191
192 match self.mode {
193 LexerMode::Inline => {
194 let start = self.input.current_position();
195 let offset = self.input.current_offset();
196
197 if offset == 0
199 && self.input.len() >= 2
200 && unsafe { *self.input.read_at_unchecked(0) } == b'#'
201 && unsafe { *self.input.read_at_unchecked(1) } == b'!'
202 {
203 let buffer = self.input.consume_through(b'\n');
204 let end = self.input.current_position();
205
206 return Some(Ok(self.token(TokenKind::InlineShebang, buffer, start, end)));
207 }
208
209 let bytes = self.input.read_remaining();
211
212 if self.settings.enable_short_tags {
213 if let Some(pos) = memchr::memmem::find(bytes, b"<?") {
214 if pos > 0 {
215 let buffer = self.input.consume(pos);
216 let end = self.input.current_position();
217
218 return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
219 }
220
221 if self.input.is_at(b"<?php", true) {
222 let buffer = self.input.consume(5);
223 self.mode = LexerMode::Script;
224 return Some(Ok(self.token(
225 TokenKind::OpenTag,
226 buffer,
227 start,
228 self.input.current_position(),
229 )));
230 }
231
232 if self.input.is_at(b"<?=", false) {
233 let buffer = self.input.consume(3);
234 self.mode = LexerMode::Script;
235 return Some(Ok(self.token(
236 TokenKind::EchoTag,
237 buffer,
238 start,
239 self.input.current_position(),
240 )));
241 }
242
243 let buffer = self.input.consume(2);
244 self.mode = LexerMode::Script;
245 return Some(Ok(self.token(
246 TokenKind::ShortOpenTag,
247 buffer,
248 start,
249 self.input.current_position(),
250 )));
251 }
252 } else {
253 let iter = memchr::memmem::find_iter(bytes, b"<?");
254
255 for pos in iter {
256 let candidate = unsafe { bytes.get_unchecked(pos..) };
258
259 if candidate.len() >= 5
260 && (unsafe { *candidate.get_unchecked(2) } | 0x20) == b'p'
261 && (unsafe { *candidate.get_unchecked(3) } | 0x20) == b'h'
262 && (unsafe { *candidate.get_unchecked(4) } | 0x20) == b'p'
263 {
264 if pos > 0 {
265 let buffer = self.input.consume(pos);
266 let end = self.input.current_position();
267 return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
268 }
269
270 let buffer = self.input.consume(5);
271 self.mode = LexerMode::Script;
272 return Some(Ok(self.token(
273 TokenKind::OpenTag,
274 buffer,
275 start,
276 self.input.current_position(),
277 )));
278 }
279
280 if candidate.len() >= 3 && unsafe { *candidate.get_unchecked(2) } == b'=' {
281 if pos > 0 {
282 let buffer = self.input.consume(pos);
283 let end = self.input.current_position();
284 return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
285 }
286
287 let buffer = self.input.consume(3);
288 self.mode = LexerMode::Script;
289 return Some(Ok(self.token(
290 TokenKind::EchoTag,
291 buffer,
292 start,
293 self.input.current_position(),
294 )));
295 }
296 }
297 }
298
299 if self.input.has_reached_eof() {
300 return None;
301 }
302
303 let buffer = self.input.consume_remaining();
304 let end = self.input.current_position();
305 Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)))
306 }
307 LexerMode::Script => {
308 let start = self.input.current_position();
309 let whitespaces = self.input.consume_whitespaces();
310 if !whitespaces.is_empty() {
311 return Some(Ok(self.token(
312 TokenKind::Whitespace,
313 whitespaces,
314 start,
315 self.input.current_position(),
316 )));
317 }
318
319 let first_byte = match self.input.read(1).first() {
320 Some(&b) => b,
321 None => {
322 unsafe { unreachable_unchecked() }
324 }
325 };
326
327 if let Some(kind) = SIMPLE_TOKEN_TABLE[first_byte as usize] {
328 let buffer = self.input.consume(1);
329 let end = self.input.current_position();
330 return Some(Ok(self.token(kind, buffer, start, end)));
331 }
332
333 if IDENT_START_TABLE[first_byte as usize] {
334 let (token_kind, len) = self.scan_identifier_or_keyword_info();
335
336 if token_kind == TokenKind::HaltCompiler {
337 self.mode = LexerMode::Halt(HaltStage::LookingForLeftParenthesis);
338 }
339
340 let buffer = self.input.consume(len);
341 let end = self.input.current_position();
342 return Some(Ok(self.token(token_kind, buffer, start, end)));
343 }
344
345 if first_byte == b'$'
346 && let Some(&next) = self.input.read(2).get(1)
347 && IDENT_START_TABLE[next as usize]
348 {
349 let (ident_len, _) = self.input.scan_identifier(1);
350 let buffer = self.input.consume(1 + ident_len);
351 let end = self.input.current_position();
352 return Some(Ok(self.token(TokenKind::Variable, buffer, start, end)));
353 }
354
355 let mut document_label: &[u8] = &[];
356
357 let (token_kind, len) = match self.input.read(3) {
358 [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
359 [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
360 [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
361 [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
362 [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
363 [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
364 [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
365 [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
366 [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
367 [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
368 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
369
370 document_label = self.input.peek(3 + whitespaces, label_length);
371
372 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
373 }
374 [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
375 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
376
377 document_label = self.input.peek(4 + whitespaces, label_length);
378
379 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
380 }
381 [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
382 let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
383
384 document_label = self.input.peek(4 + whitespaces, label_length);
385
386 (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
387 }
388 [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
389 [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
390 [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
391 [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
392 [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
393 [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
394 [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
395 [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
396 [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
397 [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
398 [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
399 [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
400 [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
401 [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
402 [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
403 [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
404 [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
405 [b'>', b'>', ..] => (TokenKind::RightShift, 2),
406 [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
407 [b':', b':', ..] => (TokenKind::ColonColon, 2),
408 [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
409 [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
410 [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
411 [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
412 [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
413 [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
414 [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
415 [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
416 [b'/', b'/', ..] => {
417 let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
418 let comment_len = scan_single_line_comment(remaining);
419 (TokenKind::SingleLineComment, 2 + comment_len)
420 }
421 [b'/', b'*', asterisk] => {
422 let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
423 match scan_multi_line_comment(remaining) {
424 Some(len) => {
425 let is_docblock = asterisk == &b'*' && len > 2;
426 if is_docblock {
427 (TokenKind::DocBlockComment, len + 2)
428 } else {
429 (TokenKind::MultiLineComment, len + 2)
430 }
431 }
432 None => {
433 self.input.consume(remaining.len() + 2);
434 return Some(Err(SyntaxError::UnexpectedEndOfFile(
435 self.file_id(),
436 self.input.current_position(),
437 )));
438 }
439 }
440 }
441 [b'\\', start_of_identifier!(), ..] => {
442 let mut length = 1;
443 loop {
444 let (ident_len, ends_with_ns) = self.input.scan_identifier(length);
445 length += ident_len;
446 if ends_with_ns {
447 length += 1; } else {
449 break;
450 }
451 }
452
453 (TokenKind::FullyQualifiedIdentifier, length)
454 }
455 [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
456 [b'$', ..] => (TokenKind::Dollar, 1),
457 [b'!', ..] => (TokenKind::Bang, 1),
458 [b'&', ..] => (TokenKind::Ampersand, 1),
459 [b'?', ..] => (TokenKind::Question, 1),
460 [b'=', ..] => (TokenKind::Equal, 1),
461 [b'`', ..] => (TokenKind::Backtick, 1),
462 [b'+', ..] => (TokenKind::Plus, 1),
463 [b'%', ..] => (TokenKind::Percent, 1),
464 [b'-', ..] => (TokenKind::Minus, 1),
465 [b'<', ..] => (TokenKind::LessThan, 1),
466 [b'>', ..] => (TokenKind::GreaterThan, 1),
467 [b':', ..] => (TokenKind::Colon, 1),
468 [b'|', ..] => (TokenKind::Pipe, 1),
469 [b'^', ..] => (TokenKind::Caret, 1),
470 [b'*', ..] => (TokenKind::Asterisk, 1),
471 [b'/', ..] => (TokenKind::Slash, 1),
472 [quote @ b'\'', ..] => read_literal_string(&self.input, *quote),
473 [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
474 read_literal_string(&self.input, *quote)
475 }
476 [b'"', ..] => (TokenKind::DoubleQuote, 1),
477 [b'(', ..] => 'parenthesis: {
478 let mut peek_offset = 1;
479 while let Some(&b) = self.input.read(peek_offset + 1).get(peek_offset) {
480 if b.is_ascii_whitespace() {
481 peek_offset += 1;
482 } else {
483 let lower = b | 0x20; if !matches!(lower, b'i' | b'b' | b'f' | b'd' | b'r' | b's' | b'a' | b'o' | b'u') {
486 break 'parenthesis (TokenKind::LeftParenthesis, 1);
487 }
488 break;
489 }
490 }
491
492 for (value, kind) in internal::consts::CAST_TYPES {
493 if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
494 break 'parenthesis (kind, length);
495 }
496 }
497
498 (TokenKind::LeftParenthesis, 1)
499 }
500 [b'#', ..] => {
501 let remaining = self.input.peek(1, self.input.len() - self.input.current_offset());
502 let comment_len = scan_single_line_comment(remaining);
503 (TokenKind::HashComment, 1 + comment_len)
504 }
505 [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
506 [b'.', start_of_number!(), ..] => {
507 let mut length = read_digits_of_base(&self.input, 2, 10);
508 if let float_exponent!() = self.input.peek(length, 1) {
509 let mut exp_length = length + 1;
510 if let number_sign!() = self.input.peek(exp_length, 1) {
511 exp_length += 1;
512 }
513
514 let after_exp = read_digits_of_base(&self.input, exp_length, 10);
515 if after_exp > exp_length {
516 length = after_exp;
517 }
518 }
519
520 (TokenKind::LiteralFloat, length)
521 }
522 [start_of_number!(), ..] => 'number: {
523 let mut length = 1;
524
525 let (base, kind): (u8, NumberKind) = match self.input.read(3) {
526 start_of_binary_number!() => {
527 length += 1;
528
529 (2, NumberKind::Integer)
530 }
531 start_of_octal_number!() => {
532 length += 1;
533
534 (8, NumberKind::Integer)
535 }
536 start_of_hexadecimal_number!() => {
537 length += 1;
538
539 (16, NumberKind::Integer)
540 }
541 start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
542 start_of_float_number!() => (10, NumberKind::Float),
543 _ => (10, NumberKind::IntegerOrFloat),
544 };
545
546 if kind != NumberKind::Float {
547 length = read_digits_of_base(&self.input, length, base);
548
549 if kind == NumberKind::Integer {
550 break 'number (TokenKind::LiteralInteger, length);
551 }
552 }
553
554 let is_float = matches!(self.input.peek(length, 3), float_separator!());
555
556 if !is_float {
557 break 'number (TokenKind::LiteralInteger, length);
558 }
559
560 if let [b'.'] = self.input.peek(length, 1) {
561 length += 1;
562 length = read_digits_of_base(&self.input, length, 10);
563 }
564
565 if let float_exponent!() = self.input.peek(length, 1) {
566 let mut exp_length = length + 1;
568 if let number_sign!() = self.input.peek(exp_length, 1) {
569 exp_length += 1;
570 }
571 let after_exp = read_digits_of_base(&self.input, exp_length, 10);
572 if after_exp > exp_length {
573 length = after_exp;
575 }
576 }
577
578 (TokenKind::LiteralFloat, length)
579 }
580 [b'.', ..] => (TokenKind::Dot, 1),
581 [unknown_byte, ..] => {
582 let position = self.input.current_position();
583 self.input.consume(1);
584
585 return Some(Err(SyntaxError::UnrecognizedToken(self.file_id(), *unknown_byte, position)));
586 }
587 [] => {
588 unreachable!()
591 }
592 };
593
594 self.mode = match token_kind {
595 TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
596 TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
597 TokenKind::CloseTag => LexerMode::Inline,
598 TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
599 TokenKind::DocumentStart(document_kind) => {
600 LexerMode::DocumentString(document_kind, document_label, Interpolation::None)
601 }
602 _ => LexerMode::Script,
603 };
604
605 let buffer = self.input.consume(len);
606 let end = self.input.current_position();
607
608 Some(Ok(self.token(token_kind, buffer, start, end)))
609 }
610 LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
611 Interpolation::None => {
612 let start = self.input.current_position();
613
614 let mut length = 0;
615 let mut last_was_slash = false;
616 let mut token_kind = TokenKind::StringPart;
617 loop {
618 match self.input.peek(length, 2) {
619 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
620 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
621
622 self.mode =
623 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
624
625 break;
626 }
627 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
628 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
629
630 self.mode =
631 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
632
633 break;
634 }
635 [b'\\', ..] => {
636 length += 1;
637
638 last_was_slash = !last_was_slash;
639 }
640 [b'"', ..] if !last_was_slash => {
641 if length == 0 {
642 length += 1;
643 token_kind = TokenKind::DoubleQuote;
644
645 break;
646 }
647
648 break;
649 }
650 [_, ..] => {
651 length += 1;
652 last_was_slash = false;
653 }
654 [] => {
655 break;
656 }
657 }
658 }
659
660 let buffer = self.input.consume(length);
661 let end = self.input.current_position();
662
663 if TokenKind::DoubleQuote == token_kind {
664 self.mode = LexerMode::Script;
665 }
666
667 Some(Ok(self.token(token_kind, buffer, start, end)))
668 }
669 Interpolation::Until(offset) => {
670 self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None))
671 }
672 },
673 LexerMode::ShellExecuteString(interpolation) => match &interpolation {
674 Interpolation::None => {
675 let start = self.input.current_position();
676
677 let mut length = 0;
678 let mut last_was_slash = false;
679 let mut token_kind = TokenKind::StringPart;
680 loop {
681 match self.input.peek(length, 2) {
682 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
683 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
684
685 self.mode =
686 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
687
688 break;
689 }
690 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
691 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
692
693 self.mode =
694 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
695
696 break;
697 }
698 [b'\\', ..] => {
699 length += 1;
700 last_was_slash = true;
701 }
702 [b'`', ..] if !last_was_slash => {
703 if length == 0 {
704 length += 1;
705 token_kind = TokenKind::Backtick;
706
707 break;
708 }
709
710 break;
711 }
712 [_, ..] => {
713 length += 1;
714 last_was_slash = false;
715 }
716 [] => {
717 break;
718 }
719 }
720 }
721
722 let buffer = self.input.consume(length);
723 let end = self.input.current_position();
724
725 if TokenKind::Backtick == token_kind {
726 self.mode = LexerMode::Script;
727 }
728
729 Some(Ok(self.token(token_kind, buffer, start, end)))
730 }
731 Interpolation::Until(offset) => {
732 self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None))
733 }
734 },
735 LexerMode::DocumentString(kind, label, interpolation) => match &kind {
736 DocumentKind::Heredoc => match &interpolation {
737 Interpolation::None => {
738 let start = self.input.current_position();
739
740 let mut length = 0;
741 let mut last_was_slash = false;
742 let mut only_whitespaces = true;
743 let mut token_kind = TokenKind::StringPart;
744 loop {
745 match self.input.peek(length, 2) {
746 [b'\r', b'\n'] => {
747 length += 2;
748
749 break;
750 }
751 [b'\n' | b'\r', ..] => {
752 length += 1;
753
754 break;
755 }
756 [byte, ..] if byte.is_ascii_whitespace() => {
757 length += 1;
758 }
759 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
760 let until_offset =
761 read_until_end_of_variable_interpolation(&self.input, length + 2);
762
763 self.mode = LexerMode::DocumentString(
764 kind,
765 label,
766 Interpolation::Until(start.offset + until_offset),
767 );
768
769 break;
770 }
771 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
772 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
773
774 self.mode = LexerMode::DocumentString(
775 kind,
776 label,
777 Interpolation::Until(start.offset + until_offset),
778 );
779
780 break;
781 }
782 [b'\\', ..] => {
783 length += 1;
784 last_was_slash = true;
785 only_whitespaces = false;
786 }
787 [_, ..] => {
788 if only_whitespaces
789 && self.input.peek(length, label.len()) == label
790 && self
791 .input
792 .peek(length + label.len(), 1)
793 .first()
794 .is_none_or(|c| !c.is_ascii_alphanumeric())
795 {
796 length += label.len();
797 token_kind = TokenKind::DocumentEnd;
798
799 break;
800 }
801
802 length += 1;
803 last_was_slash = false;
804 only_whitespaces = false;
805 }
806 [] => {
807 break;
808 }
809 }
810 }
811
812 let buffer = self.input.consume(length);
813 let end = self.input.current_position();
814
815 if TokenKind::DocumentEnd == token_kind {
816 self.mode = LexerMode::Script;
817 }
818
819 Some(Ok(self.token(token_kind, buffer, start, end)))
820 }
821 Interpolation::Until(offset) => {
822 self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None))
823 }
824 },
825 DocumentKind::Nowdoc => {
826 let start = self.input.current_position();
827
828 let mut length = 0;
829 let mut terminated = false;
830 let mut only_whitespaces = true;
831
832 loop {
833 match self.input.peek(length, 2) {
834 [b'\r', b'\n'] => {
835 length += 2;
836
837 break;
838 }
839 [b'\n' | b'\r', ..] => {
840 length += 1;
841
842 break;
843 }
844 [byte, ..] if byte.is_ascii_whitespace() => {
845 length += 1;
846 }
847 [_, ..] => {
848 if only_whitespaces
849 && self.input.peek(length, label.len()) == label
850 && self
851 .input
852 .peek(length + label.len(), 1)
853 .first()
854 .is_none_or(|c| !c.is_ascii_alphanumeric())
855 {
856 length += label.len();
857 terminated = true;
858
859 break;
860 }
861
862 only_whitespaces = false;
863 length += 1;
864 }
865 [] => {
866 break;
867 }
868 }
869 }
870
871 let buffer = self.input.consume(length);
872 let end = self.input.current_position();
873
874 if terminated {
875 self.mode = LexerMode::Script;
876
877 return Some(Ok(self.token(TokenKind::DocumentEnd, buffer, start, end)));
878 }
879
880 Some(Ok(self.token(TokenKind::StringPart, buffer, start, end)))
881 }
882 },
883 LexerMode::Halt(stage) => 'halt: {
884 let start = self.input.current_position();
885 if let HaltStage::End = stage {
886 let buffer = self.input.consume_remaining();
887 let end = self.input.current_position();
888
889 break 'halt Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
890 }
891
892 let whitespaces = self.input.consume_whitespaces();
893 if !whitespaces.is_empty() {
894 let end = self.input.current_position();
895
896 break 'halt Some(Ok(self.token(TokenKind::Whitespace, whitespaces, start, end)));
897 }
898
899 match &stage {
900 HaltStage::LookingForLeftParenthesis => {
901 if self.input.is_at(b"(", false) {
902 let buffer = self.input.consume(1);
903 let end = self.input.current_position();
904
905 self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
906
907 Some(Ok(self.token(TokenKind::LeftParenthesis, buffer, start, end)))
908 } else {
909 let byte = self.input.read(1)[0];
910 let position = self.input.current_position();
911 self.input.consume(1);
913 Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
914 }
915 }
916 HaltStage::LookingForRightParenthesis => {
917 if self.input.is_at(b")", false) {
918 let buffer = self.input.consume(1);
919 let end = self.input.current_position();
920
921 self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
922
923 Some(Ok(self.token(TokenKind::RightParenthesis, buffer, start, end)))
924 } else {
925 let byte = self.input.read(1)[0];
926 let position = self.input.current_position();
927 self.input.consume(1);
928 Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
929 }
930 }
931 HaltStage::LookingForTerminator => {
932 if self.input.is_at(b";", false) {
933 let buffer = self.input.consume(1);
934 let end = self.input.current_position();
935
936 self.mode = LexerMode::Halt(HaltStage::End);
937
938 Some(Ok(self.token(TokenKind::Semicolon, buffer, start, end)))
939 } else if self.input.is_at(b"?>", false) {
940 let buffer = self.input.consume(2);
941 let end = self.input.current_position();
942
943 self.mode = LexerMode::Halt(HaltStage::End);
944
945 Some(Ok(self.token(TokenKind::CloseTag, buffer, start, end)))
946 } else {
947 let byte = self.input.read(1)[0];
948 let position = self.input.current_position();
949 self.input.consume(1);
950 Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
951 }
952 }
953 _ => unreachable!(),
954 }
955 }
956 }
957 }
958
959 #[inline]
963 fn scan_identifier_or_keyword_info(&self) -> (TokenKind, usize) {
964 let (mut length, ended_with_slash) = self.input.scan_identifier(0);
965
966 if !ended_with_slash {
967 match length {
968 6 => {
969 if self.input.is_at(b"public(set)", true) {
970 return (TokenKind::PublicSet, 11);
971 }
972 }
973 7 => {
974 if self.input.is_at(b"private(set)", true) {
975 return (TokenKind::PrivateSet, 12);
976 }
977 }
978 9 => {
979 if self.input.is_at(b"protected(set)", true) {
980 return (TokenKind::ProtectedSet, 14);
981 }
982 }
983 _ => {}
984 }
985 }
986
987 if !ended_with_slash && let Some(kind) = internal::keyword::lookup_keyword(self.input.read(length)) {
988 return (kind, length);
989 }
990
991 let mut slashes = 0;
992 let mut last_was_slash = false;
993 loop {
994 match self.input.peek(length, 1) {
995 [b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF] if last_was_slash => {
996 length += 1;
997 last_was_slash = false;
998 }
999 [b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | 0x80..=0xFF] if !last_was_slash => {
1000 length += 1;
1001 }
1002 [b'\\'] if !self.interpolating => {
1003 if last_was_slash {
1004 length -= 1;
1005 slashes -= 1;
1006 last_was_slash = false;
1007 break;
1008 }
1009
1010 length += 1;
1011 slashes += 1;
1012 last_was_slash = true;
1013 }
1014 _ => {
1015 break;
1016 }
1017 }
1018 }
1019
1020 if last_was_slash {
1021 length -= 1;
1022 slashes -= 1;
1023 }
1024
1025 let kind = if slashes > 0 { TokenKind::QualifiedIdentifier } else { TokenKind::Identifier };
1026
1027 (kind, length)
1028 }
1029
1030 #[inline]
1031 fn token(&self, kind: TokenKind, v: &'input [u8], start: Position, _end: Position) -> Token<'input> {
1032 let value = unsafe { std::str::from_utf8_unchecked(v) };
1037
1038 Token { kind, start, value }
1039 }
1040
1041 #[inline]
1042 fn interpolation(
1043 &mut self,
1044 end_offset: u32,
1045 post_interpolation_mode: LexerMode<'input>,
1046 ) -> Option<Result<Token<'input>, SyntaxError>> {
1047 self.mode = LexerMode::Script;
1048
1049 let was_interpolating = self.interpolating;
1050 self.interpolating = true;
1051
1052 loop {
1053 let subsequent_token = self.advance()?.ok()?;
1054 let token_start = subsequent_token.start.offset;
1056 let token_end = token_start + subsequent_token.value.len() as u32;
1057 let is_final_token = token_start <= end_offset && end_offset <= token_end;
1058
1059 self.buffer.push_back(subsequent_token);
1060
1061 if is_final_token {
1062 break;
1063 }
1064 }
1065
1066 self.mode = post_interpolation_mode;
1067 self.interpolating = was_interpolating;
1068
1069 self.advance()
1070 }
1071}
1072
impl HasFileId for Lexer<'_> {
    /// Returns the file id reported by the underlying input.
    #[inline]
    fn file_id(&self) -> FileId {
        self.input.file_id()
    }
}
1079
1080#[inline]
1081fn matches_start_of_heredoc_document(input: &Input) -> bool {
1082 let total = input.len();
1083 let base = input.current_offset();
1084
1085 let mut length = 3;
1087 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1089 length += 1;
1090 }
1091
1092 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1094 return false;
1095 }
1096 length += 1; loop {
1100 let pos = base + length;
1101 if pos >= total {
1102 return false; }
1104
1105 let byte = *input.read_at(pos);
1106 if byte == b'\n' {
1107 return true; } else if byte == b'\r' {
1109 return pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1111 } else if is_part_of_identifier(input.read_at(pos)) {
1112 length += 1;
1113 } else {
1114 return false; }
1116 }
1117}
1118
1119#[inline]
1120fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1121 let total = input.len();
1122 let base = input.current_offset();
1123
1124 let mut length = 3;
1126 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1127 length += 1;
1128 }
1129
1130 if base + length >= total || *input.read_at(base + length) != b'"' {
1132 return false;
1133 }
1134 length += 1;
1135
1136 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1138 return false;
1139 }
1140 length += 1;
1141
1142 let mut terminated = false;
1144 loop {
1145 let pos = base + length;
1146 if pos >= total {
1147 return false;
1148 }
1149 let byte = input.read_at(pos);
1150 if *byte == b'\n' {
1151 return terminated;
1153 } else if *byte == b'\r' {
1154 return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1156 } else if !terminated && is_part_of_identifier(byte) {
1157 length += 1;
1158 } else if !terminated && *byte == b'"' {
1159 terminated = true;
1160 length += 1;
1161 } else {
1162 return false;
1163 }
1164 }
1165}
1166
1167#[inline]
1168fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1169 let total = input.len();
1170 let base = input.current_offset();
1171
1172 let mut length = 3;
1174 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1175 length += 1;
1176 }
1177
1178 if base + length >= total || *input.read_at(base + length) != b'\'' {
1180 return false;
1181 }
1182 length += 1;
1183
1184 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1186 return false;
1187 }
1188 length += 1;
1189
1190 let mut terminated = false;
1192 loop {
1193 let pos = base + length;
1194 if pos >= total {
1195 return false;
1196 }
1197 let byte = *input.read_at(pos);
1198 if byte == b'\n' {
1199 return terminated;
1200 } else if byte == b'\r' {
1201 return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1202 } else if !terminated && is_part_of_identifier(&byte) {
1203 length += 1;
1204 } else if !terminated && byte == b'\'' {
1205 terminated = true;
1206 length += 1;
1207 } else {
1208 return false;
1209 }
1210 }
1211}
1212
1213#[inline]
1214fn matches_literal_double_quote_string(input: &Input) -> bool {
1215 let total = input.len();
1216 let base = input.current_offset();
1217
1218 let mut pos = base + 1;
1220 loop {
1221 if pos >= total {
1222 return true;
1224 }
1225 let byte = *input.read_at(pos);
1226 if byte == b'"' {
1227 return true;
1229 } else if byte == b'\\' {
1230 pos += 2;
1232 continue;
1233 }
1234
1235 if pos + 1 < total {
1238 let next = *input.read_at(pos + 1);
1239 if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1240 return false;
1241 }
1242 }
1243 pos += 1;
1244 }
1245}
1246
1247#[inline]
1248fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1249 let total = input.len();
1250 let base = input.current_offset();
1251
1252 let mut pos = base + 3;
1254 let mut whitespaces = 0;
1255 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1256 whitespaces += 1;
1257 pos += 1;
1258 }
1259
1260 let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1265
1266 let mut label_length = 1; let mut terminated = false; loop {
1269 let pos = base + length;
1270 if pos >= total {
1272 unreachable!("Unexpected end of input while reading heredoc label");
1273 }
1274
1275 let byte = *input.read_at(pos);
1276 if byte == b'\n' {
1277 length += 1;
1279 return (length, whitespaces, label_length);
1280 } else if byte == b'\r' {
1281 if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1283 length += 2;
1284 } else {
1285 length += 1;
1286 }
1287 return (length, whitespaces, label_length);
1288 } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1289 length += 1;
1292 label_length += 1;
1293 } else if double_quoted && !terminated && byte == b'"' {
1294 length += 1;
1296 terminated = true;
1297 } else {
1298 unreachable!("Unexpected character encountered in heredoc label");
1299 }
1300 }
1301}
1302
1303#[inline]
1304fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1305 let total = input.len();
1306 let base = input.current_offset();
1307
1308 let mut pos = base + 3;
1309 let mut whitespaces = 0;
1310 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1311 whitespaces += 1;
1312 pos += 1;
1313 }
1314
1315 let mut length = 3 + whitespaces + 2;
1317
1318 let mut label_length = 1;
1319 let mut terminated = false;
1320 loop {
1321 let pos = base + length;
1322 if pos >= total {
1323 unreachable!("Unexpected end of input while reading nowdoc label");
1324 }
1325 let byte = *input.read_at(pos);
1326
1327 if byte == b'\n' {
1328 length += 1;
1330 return (length, whitespaces, label_length);
1331 } else if byte == b'\r' {
1332 if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1334 length += 2;
1335 } else {
1336 length += 1;
1337 }
1338 return (length, whitespaces, label_length);
1339 } else if is_part_of_identifier(&byte) && !terminated {
1340 length += 1;
1342 label_length += 1;
1343 } else if !terminated && byte == b'\'' {
1344 length += 1;
1346 terminated = true;
1347 } else {
1348 unreachable!("Unexpected character encountered in nowdoc label");
1349 }
1350 }
1351}
1352
1353#[inline]
1354fn read_literal_string(input: &Input, quote: u8) -> (TokenKind, usize) {
1355 let total = input.len();
1356 let start = input.current_offset();
1357 let mut length = 1; let bytes = input.peek(length, total - start - length);
1360 loop {
1361 match memchr2(quote, b'\\', &bytes[length - 1..]) {
1362 Some(pos) => {
1363 let abs_pos = length - 1 + pos;
1364 let byte = bytes[abs_pos];
1365
1366 if byte == b'\\' {
1367 length = abs_pos + 2 + 1; if length > total - start {
1369 return (TokenKind::PartialLiteralString, total - start);
1370 }
1371 } else {
1372 length = abs_pos + 2; return (TokenKind::LiteralString, length);
1374 }
1375 }
1376 None => {
1377 return (TokenKind::PartialLiteralString, total - start);
1379 }
1380 }
1381 }
1382}
1383
1384#[inline]
1385fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> u32 {
1386 let total = input.len();
1387 let base = input.current_offset();
1388 let mut offset = from;
1390
1391 loop {
1392 let abs = base + offset;
1393 if abs >= total {
1394 break;
1396 }
1397
1398 if is_part_of_identifier(input.read_at(abs)) {
1400 offset += 1;
1401 continue;
1402 }
1403
1404 if *input.read_at(abs) == b'[' {
1406 offset += 1;
1407 let mut nesting = 0;
1408 loop {
1409 let abs_inner = base + offset;
1410 if abs_inner >= total {
1411 break;
1412 }
1413 let b = input.read_at(abs_inner);
1414 if *b == b']' {
1415 offset += 1;
1416 if nesting == 0 {
1417 break;
1418 }
1419
1420 nesting -= 1;
1421 } else if *b == b'[' {
1422 offset += 1;
1423 nesting += 1;
1424 } else if b.is_ascii_whitespace() {
1425 break;
1427 } else {
1428 offset += 1;
1429 }
1430 }
1431 break;
1433 }
1434
1435 if base + offset + 2 < total
1437 && *input.read_at(abs) == b'-'
1438 && *input.read_at(base + offset + 1) == b'>'
1439 && is_start_of_identifier(input.read_at(base + offset + 2))
1440 {
1441 offset += 3;
1442 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1444 offset += 1;
1445 }
1446 break;
1447 }
1448
1449 if base + offset + 3 < total
1451 && *input.read_at(abs) == b'?'
1452 && *input.read_at(base + offset + 1) == b'-'
1453 && *input.read_at(base + offset + 2) == b'>'
1454 && is_start_of_identifier(input.read_at(base + offset + 3))
1455 {
1456 offset += 4;
1457 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1458 offset += 1;
1459 }
1460 break;
1461 }
1462
1463 break;
1465 }
1466
1467 offset as u32
1468}
1469
1470#[inline]
1471fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> u32 {
1472 let total = input.len();
1473 let base = input.current_offset();
1474 let mut offset = from;
1475 let mut nesting = 0;
1476
1477 loop {
1478 let abs = base + offset;
1479 if abs >= total {
1480 break;
1481 }
1482 match input.read_at(abs) {
1483 b'}' => {
1484 offset += 1;
1485 if nesting == 0 {
1486 break;
1487 }
1488
1489 nesting -= 1;
1490 }
1491 b'{' => {
1492 offset += 1;
1493 nesting += 1;
1494 }
1495 _ => {
1496 offset += 1;
1497 }
1498 }
1499 }
1500
1501 offset as u32
1502}
1503
1504#[inline]
1507fn scan_multi_line_comment(bytes: &[u8]) -> Option<usize> {
1508 memmem::find(bytes, b"*/").map(|pos| pos + 2)
1510}
1511
1512#[inline]
1516fn scan_single_line_comment(bytes: &[u8]) -> usize {
1517 let mut pos = 0;
1518 while pos < bytes.len() {
1519 match memchr::memchr3(b'\n', b'\r', b'?', &bytes[pos..]) {
1520 Some(offset) => {
1521 let found_pos = pos + offset;
1522 match bytes[found_pos] {
1523 b'\n' | b'\r' => return found_pos,
1524 b'?' => {
1525 if found_pos + 1 < bytes.len() && bytes[found_pos + 1] == b'>' {
1527 if found_pos > 0 && bytes[found_pos - 1].is_ascii_whitespace() {
1529 return found_pos - 1;
1530 }
1531 return found_pos;
1532 }
1533 pos = found_pos + 1;
1535 }
1536 _ => unreachable!(),
1537 }
1538 }
1539 None => return bytes.len(),
1540 }
1541 }
1542
1543 bytes.len()
1544}