1use std::collections::VecDeque;
2use std::fmt::Debug;
3use std::hint::unreachable_unchecked;
4
5use memchr::memchr2;
6use memchr::memmem;
7
8const SIMPLE_TOKEN_TABLE: [Option<TokenKind>; 256] = {
12 let mut table: [Option<TokenKind>; 256] = [None; 256];
13 table[b';' as usize] = Some(TokenKind::Semicolon);
14 table[b',' as usize] = Some(TokenKind::Comma);
15 table[b')' as usize] = Some(TokenKind::RightParenthesis);
16 table[b'[' as usize] = Some(TokenKind::LeftBracket);
17 table[b']' as usize] = Some(TokenKind::RightBracket);
18 table[b'{' as usize] = Some(TokenKind::LeftBrace);
19 table[b'}' as usize] = Some(TokenKind::RightBrace);
20 table[b'~' as usize] = Some(TokenKind::Tilde);
21 table[b'@' as usize] = Some(TokenKind::At);
22 table
23};
24
/// Byte-indexed lookup table telling whether a byte may begin an identifier.
///
/// An entry is `true` for ASCII letters, the underscore, and every byte in
/// the range `0x80..=0xFF`; all other entries are `false`.
const IDENT_START_TABLE: [bool; 256] = {
    let mut flags = [false; 256];

    // Walk every possible byte value; the explicit `break` at `u8::MAX`
    // avoids overflowing the counter.
    let mut byte = u8::MIN;
    loop {
        flags[byte as usize] = byte.is_ascii_alphabetic() || byte == b'_' || byte >= 0x80;

        if byte == u8::MAX {
            break;
        }

        byte += 1;
    }

    flags
};
36
37use mago_database::file::FileId;
38use mago_database::file::HasFileId;
39use mago_span::Position;
40use mago_syntax_core::float_exponent;
41use mago_syntax_core::float_separator;
42use mago_syntax_core::input::Input;
43use mago_syntax_core::number_sign;
44use mago_syntax_core::start_of_binary_number;
45use mago_syntax_core::start_of_float_number;
46use mago_syntax_core::start_of_hexadecimal_number;
47use mago_syntax_core::start_of_identifier;
48use mago_syntax_core::start_of_number;
49use mago_syntax_core::start_of_octal_number;
50use mago_syntax_core::start_of_octal_or_float_number;
51use mago_syntax_core::utils::is_part_of_identifier;
52use mago_syntax_core::utils::is_start_of_identifier;
53use mago_syntax_core::utils::read_digits_of_base;
54
55use crate::error::SyntaxError;
56use crate::lexer::internal::mode::HaltStage;
57use crate::lexer::internal::mode::Interpolation;
58use crate::lexer::internal::mode::LexerMode;
59use crate::lexer::internal::utils::NumberKind;
60use crate::settings::LexerSettings;
61use crate::token::DocumentKind;
62use crate::token::Token;
63use crate::token::TokenKind;
64
65mod internal;
66
/// Mode-driven lexer that turns an [`Input`] into a stream of [`Token`]s.
///
/// Tokens are produced one at a time by `advance`. The lexer keeps track of
/// the mode it is in (inline text, script, one of the string modes, or the
/// halted state) and switches between them as tokens are recognized.
67#[derive(Debug)]
79pub struct Lexer<'input> {
 /// The source being lexed, together with the current read position.
80 input: Input<'input>,
 /// Lexer configuration; `enable_short_tags` is consulted when scanning inline text.
81 settings: LexerSettings,
 /// The mode that decides how the next bytes are tokenized.
82 mode: LexerMode<'input>,
 /// `true` while `interpolation` is lexing an interpolated segment; while set,
 /// `advance` does not hand out buffered tokens.
83 interpolating: bool,
 /// `true` while the segment being interpolated is a brace interpolation; it
 /// re-enables `\` as a name separator during identifier scanning.
84 brace_interpolating: bool,
 /// Tokens collected by `interpolation`, handed out by later `advance` calls in FIFO order.
85 buffer: VecDeque<Token<'input>>,
87}
88
89impl<'input> Lexer<'input> {
 /// Number of token slots reserved up-front for the interpolation buffer by
 /// both constructors.
90 const BUFFER_INITIAL_CAPACITY: usize = 8;
93
94 pub fn new(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
105 Lexer {
106 input,
107 settings,
108 mode: LexerMode::Inline,
109 interpolating: false,
110 brace_interpolating: false,
111 buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
112 }
113 }
114
115 pub fn scripting(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
126 Lexer {
127 input,
128 settings,
129 mode: LexerMode::Script,
130 interpolating: false,
131 brace_interpolating: false,
132 buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
133 }
134 }
135
 /// Reports whether the underlying input has been fully consumed.
 ///
 /// This only reflects the state of the input: tokens that were buffered
 /// during interpolation may still be handed out by `advance` afterwards.
136 #[must_use]
140 pub fn has_reached_eof(&self) -> bool {
141 self.input.has_reached_eof()
142 }
143
 /// Returns the current position of the lexer within its input, as reported
 /// by the underlying input.
144 #[inline]
146 pub const fn current_position(&self) -> Position {
147 self.input.current_position()
148 }
149
150 #[inline]
183 pub fn advance(&mut self) -> Option<Result<Token<'input>, SyntaxError>> {
184 if !self.interpolating
186 && let Some(token) = self.buffer.pop_front()
187 {
188 return Some(Ok(token));
189 }
190
191 if self.input.has_reached_eof() {
192 return None;
193 }
194
195 match self.mode {
196 LexerMode::Inline => {
197 let start = self.input.current_position();
198 let offset = self.input.current_offset();
199
200 if offset == 0
202 && self.input.len() >= 2
203 && unsafe { *self.input.read_at_unchecked(0) } == b'#'
204 && unsafe { *self.input.read_at_unchecked(1) } == b'!'
205 {
206 let buffer = self.input.consume_through(b'\n');
207 let end = self.input.current_position();
208
209 return Some(Ok(self.token(TokenKind::InlineShebang, buffer, start, end)));
210 }
211
212 let bytes = self.input.read_remaining();
214
215 if self.settings.enable_short_tags {
216 if let Some(pos) = memchr::memmem::find(bytes, b"<?") {
217 if pos > 0 {
218 let buffer = self.input.consume(pos);
219 let end = self.input.current_position();
220
221 return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
222 }
223
224 if self.input.is_at(b"<?php", true) {
225 let buffer = self.input.consume(5);
226 self.mode = LexerMode::Script;
227 return Some(Ok(self.token(
228 TokenKind::OpenTag,
229 buffer,
230 start,
231 self.input.current_position(),
232 )));
233 }
234
235 if self.input.is_at(b"<?=", false) {
236 let buffer = self.input.consume(3);
237 self.mode = LexerMode::Script;
238 return Some(Ok(self.token(
239 TokenKind::EchoTag,
240 buffer,
241 start,
242 self.input.current_position(),
243 )));
244 }
245
246 let buffer = self.input.consume(2);
247 self.mode = LexerMode::Script;
248 return Some(Ok(self.token(
249 TokenKind::ShortOpenTag,
250 buffer,
251 start,
252 self.input.current_position(),
253 )));
254 }
255 } else {
256 let iter = memchr::memmem::find_iter(bytes, b"<?");
257
258 for pos in iter {
259 let candidate = unsafe { bytes.get_unchecked(pos..) };
261
262 if candidate.len() >= 5
263 && (unsafe { *candidate.get_unchecked(2) } | 0x20) == b'p'
264 && (unsafe { *candidate.get_unchecked(3) } | 0x20) == b'h'
265 && (unsafe { *candidate.get_unchecked(4) } | 0x20) == b'p'
266 {
267 if pos > 0 {
268 let buffer = self.input.consume(pos);
269 let end = self.input.current_position();
270 return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
271 }
272
273 let buffer = self.input.consume(5);
274 self.mode = LexerMode::Script;
275 return Some(Ok(self.token(
276 TokenKind::OpenTag,
277 buffer,
278 start,
279 self.input.current_position(),
280 )));
281 }
282
283 if candidate.len() >= 3 && unsafe { *candidate.get_unchecked(2) } == b'=' {
284 if pos > 0 {
285 let buffer = self.input.consume(pos);
286 let end = self.input.current_position();
287 return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
288 }
289
290 let buffer = self.input.consume(3);
291 self.mode = LexerMode::Script;
292 return Some(Ok(self.token(
293 TokenKind::EchoTag,
294 buffer,
295 start,
296 self.input.current_position(),
297 )));
298 }
299 }
300 }
301
302 if self.input.has_reached_eof() {
303 return None;
304 }
305
306 let buffer = self.input.consume_remaining();
307 let end = self.input.current_position();
308 Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)))
309 }
310 LexerMode::Script => {
311 let start = self.input.current_position();
312 let whitespaces = self.input.consume_whitespaces();
313 if !whitespaces.is_empty() {
314 return Some(Ok(self.token(
315 TokenKind::Whitespace,
316 whitespaces,
317 start,
318 self.input.current_position(),
319 )));
320 }
321
322 let first_byte = match self.input.read(1).first() {
323 Some(&b) => b,
324 None => {
325 unsafe { unreachable_unchecked() }
327 }
328 };
329
330 if let Some(kind) = SIMPLE_TOKEN_TABLE[first_byte as usize] {
331 let buffer = self.input.consume(1);
332 let end = self.input.current_position();
333 return Some(Ok(self.token(kind, buffer, start, end)));
334 }
335
336 if IDENT_START_TABLE[first_byte as usize] {
337 let (token_kind, len) = self.scan_identifier_or_keyword_info();
338
339 if token_kind == TokenKind::HaltCompiler {
340 self.mode = LexerMode::Halt(HaltStage::LookingForLeftParenthesis);
341 }
342
343 let buffer = self.input.consume(len);
344 let end = self.input.current_position();
345 return Some(Ok(self.token(token_kind, buffer, start, end)));
346 }
347
348 if first_byte == b'$'
349 && let Some(&next) = self.input.read(2).get(1)
350 && IDENT_START_TABLE[next as usize]
351 {
352 let (ident_len, _) = self.input.scan_identifier(1);
353 let buffer = self.input.consume(1 + ident_len);
354 let end = self.input.current_position();
355 return Some(Ok(self.token(TokenKind::Variable, buffer, start, end)));
356 }
357
358 let mut document_label: &[u8] = &[];
359
360 let (token_kind, len) = match self.input.read(3) {
361 [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
362 [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
363 [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
364 [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
365 [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
366 [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
367 [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
368 [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
369 [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
370 [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
371 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
372
373 document_label = self.input.peek(3 + whitespaces, label_length);
374
375 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
376 }
377 [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
378 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
379
380 document_label = self.input.peek(4 + whitespaces, label_length);
381
382 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
383 }
384 [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
385 let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
386
387 document_label = self.input.peek(4 + whitespaces, label_length);
388
389 (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
390 }
391 [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
392 [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
393 [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
394 [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
395 [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
396 [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
397 [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
398 [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
399 [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
400 [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
401 [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
402 [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
403 [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
404 [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
405 [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
406 [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
407 [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
408 [b'>', b'>', ..] => (TokenKind::RightShift, 2),
409 [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
410 [b':', b':', ..] => (TokenKind::ColonColon, 2),
411 [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
412 [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
413 [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
414 [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
415 [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
416 [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
417 [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
418 [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
419 [b'/', b'/', ..] => {
420 let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
421 let comment_len = scan_single_line_comment(remaining);
422 (TokenKind::SingleLineComment, 2 + comment_len)
423 }
424 [b'/', b'*', asterisk] => {
425 let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
426 match scan_multi_line_comment(remaining) {
427 Some(len) => {
428 let is_docblock = asterisk == &b'*' && len > 2;
429 if is_docblock {
430 (TokenKind::DocBlockComment, len + 2)
431 } else {
432 (TokenKind::MultiLineComment, len + 2)
433 }
434 }
435 None => {
436 self.input.consume(remaining.len() + 2);
437 return Some(Err(SyntaxError::UnexpectedEndOfFile(
438 self.file_id(),
439 self.input.current_position(),
440 )));
441 }
442 }
443 }
444 [b'\\', start_of_identifier!(), ..] => {
445 let mut length = 1;
446 loop {
447 let (ident_len, ends_with_ns) = self.input.scan_identifier(length);
448 length += ident_len;
449 if ends_with_ns {
450 length += 1; } else {
452 break;
453 }
454 }
455
456 (TokenKind::FullyQualifiedIdentifier, length)
457 }
458 [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
459 [b'$', ..] => (TokenKind::Dollar, 1),
460 [b'!', ..] => (TokenKind::Bang, 1),
461 [b'&', ..] => (TokenKind::Ampersand, 1),
462 [b'?', ..] => (TokenKind::Question, 1),
463 [b'=', ..] => (TokenKind::Equal, 1),
464 [b'`', ..] => (TokenKind::Backtick, 1),
465 [b'+', ..] => (TokenKind::Plus, 1),
466 [b'%', ..] => (TokenKind::Percent, 1),
467 [b'-', ..] => (TokenKind::Minus, 1),
468 [b'<', ..] => (TokenKind::LessThan, 1),
469 [b'>', ..] => (TokenKind::GreaterThan, 1),
470 [b':', ..] => (TokenKind::Colon, 1),
471 [b'|', ..] => (TokenKind::Pipe, 1),
472 [b'^', ..] => (TokenKind::Caret, 1),
473 [b'*', ..] => (TokenKind::Asterisk, 1),
474 [b'/', ..] => (TokenKind::Slash, 1),
475 [quote @ b'\'', ..] => read_literal_string(&self.input, *quote),
476 [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
477 read_literal_string(&self.input, *quote)
478 }
479 [b'"', ..] => (TokenKind::DoubleQuote, 1),
480 [b'(', ..] => 'parenthesis: {
481 let mut peek_offset = 1;
482 while let Some(&b) = self.input.read(peek_offset + 1).get(peek_offset) {
483 if b.is_ascii_whitespace() {
484 peek_offset += 1;
485 } else {
486 let lower = b | 0x20; if !matches!(lower, b'i' | b'b' | b'f' | b'd' | b'r' | b's' | b'a' | b'o' | b'u' | b'v')
489 {
490 break 'parenthesis (TokenKind::LeftParenthesis, 1);
491 }
492 break;
493 }
494 }
495
496 for (value, kind) in internal::consts::CAST_TYPES {
497 if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
498 break 'parenthesis (kind, length);
499 }
500 }
501
502 (TokenKind::LeftParenthesis, 1)
503 }
504 [b'#', ..] => {
505 let remaining = self.input.peek(1, self.input.len() - self.input.current_offset());
506 let comment_len = scan_single_line_comment(remaining);
507 (TokenKind::HashComment, 1 + comment_len)
508 }
509 [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
510 [b'.', start_of_number!(), ..] => {
511 let mut length = read_digits_of_base(&self.input, 2, 10);
512 if let float_exponent!() = self.input.peek(length, 1) {
513 let mut exp_length = length + 1;
514 if let number_sign!() = self.input.peek(exp_length, 1) {
515 exp_length += 1;
516 }
517
518 let after_exp = read_digits_of_base(&self.input, exp_length, 10);
519 if after_exp > exp_length {
520 length = after_exp;
521 }
522 }
523
524 (TokenKind::LiteralFloat, length)
525 }
526 [start_of_number!(), ..] => 'number: {
527 let mut length = 1;
528
529 let (base, kind): (u8, NumberKind) = match self.input.read(3) {
530 start_of_binary_number!() => {
531 length += 1;
532
533 (2, NumberKind::Integer)
534 }
535 start_of_octal_number!() => {
536 length += 1;
537
538 (8, NumberKind::Integer)
539 }
540 start_of_hexadecimal_number!() => {
541 length += 1;
542
543 (16, NumberKind::Integer)
544 }
545 start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
546 start_of_float_number!() => (10, NumberKind::Float),
547 _ => (10, NumberKind::IntegerOrFloat),
548 };
549
550 if kind != NumberKind::Float {
551 length = read_digits_of_base(&self.input, length, base);
552
553 if kind == NumberKind::Integer {
554 break 'number (TokenKind::LiteralInteger, length);
555 }
556 }
557
558 let is_float = matches!(self.input.peek(length, 3), float_separator!());
559
560 if !is_float {
561 break 'number (TokenKind::LiteralInteger, length);
562 }
563
564 if let [b'.'] = self.input.peek(length, 1) {
565 length += 1;
566 length = read_digits_of_base(&self.input, length, 10);
567 }
568
569 if let float_exponent!() = self.input.peek(length, 1) {
570 let mut exp_length = length + 1;
572 if let number_sign!() = self.input.peek(exp_length, 1) {
573 exp_length += 1;
574 }
575 let after_exp = read_digits_of_base(&self.input, exp_length, 10);
576 if after_exp > exp_length {
577 length = after_exp;
579 }
580 }
581
582 (TokenKind::LiteralFloat, length)
583 }
584 [b'.', ..] => (TokenKind::Dot, 1),
585 [unknown_byte, ..] => {
586 let position = self.input.current_position();
587 self.input.consume(1);
588
589 return Some(Err(SyntaxError::UnrecognizedToken(self.file_id(), *unknown_byte, position)));
590 }
591 [] => {
592 unreachable!()
595 }
596 };
597
598 self.mode = match token_kind {
599 TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
600 TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
601 TokenKind::CloseTag => LexerMode::Inline,
602 TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
603 TokenKind::DocumentStart(document_kind) => {
604 LexerMode::DocumentString(document_kind, document_label, Interpolation::None)
605 }
606 _ => LexerMode::Script,
607 };
608
609 let buffer = self.input.consume(len);
610 let end = self.input.current_position();
611
612 Some(Ok(self.token(token_kind, buffer, start, end)))
613 }
614 LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
615 Interpolation::None => {
616 let start = self.input.current_position();
617
618 let mut length = 0;
619 let mut last_was_slash = false;
620 let mut token_kind = TokenKind::StringPart;
621 loop {
622 match self.input.peek(length, 2) {
623 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
624 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
625
626 self.mode =
627 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
628
629 break;
630 }
631 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
632 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
633
634 self.mode = LexerMode::DoubleQuoteString(Interpolation::BraceUntil(
635 start.offset + until_offset,
636 ));
637
638 break;
639 }
640 [b'\\', ..] => {
641 length += 1;
642
643 last_was_slash = !last_was_slash;
644 }
645 [b'"', ..] if !last_was_slash => {
646 if length == 0 {
647 length += 1;
648 token_kind = TokenKind::DoubleQuote;
649
650 break;
651 }
652
653 break;
654 }
655 [_, ..] => {
656 length += 1;
657 last_was_slash = false;
658 }
659 [] => {
660 break;
661 }
662 }
663 }
664
665 let buffer = self.input.consume(length);
666 let end = self.input.current_position();
667
668 if TokenKind::DoubleQuote == token_kind {
669 self.mode = LexerMode::Script;
670 }
671
672 Some(Ok(self.token(token_kind, buffer, start, end)))
673 }
674 Interpolation::Until(offset) => {
675 self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None), false)
676 }
677 Interpolation::BraceUntil(offset) => {
678 self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None), true)
679 }
680 },
681 LexerMode::ShellExecuteString(interpolation) => match &interpolation {
682 Interpolation::None => {
683 let start = self.input.current_position();
684
685 let mut length = 0;
686 let mut last_was_slash = false;
687 let mut token_kind = TokenKind::StringPart;
688 loop {
689 match self.input.peek(length, 2) {
690 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
691 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
692
693 self.mode =
694 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
695
696 break;
697 }
698 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
699 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
700
701 self.mode = LexerMode::ShellExecuteString(Interpolation::BraceUntil(
702 start.offset + until_offset,
703 ));
704
705 break;
706 }
707 [b'\\', ..] => {
708 length += 1;
709 last_was_slash = true;
710 }
711 [b'`', ..] if !last_was_slash => {
712 if length == 0 {
713 length += 1;
714 token_kind = TokenKind::Backtick;
715
716 break;
717 }
718
719 break;
720 }
721 [_, ..] => {
722 length += 1;
723 last_was_slash = false;
724 }
725 [] => {
726 break;
727 }
728 }
729 }
730
731 let buffer = self.input.consume(length);
732 let end = self.input.current_position();
733
734 if TokenKind::Backtick == token_kind {
735 self.mode = LexerMode::Script;
736 }
737
738 Some(Ok(self.token(token_kind, buffer, start, end)))
739 }
740 Interpolation::Until(offset) => {
741 self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None), false)
742 }
743 Interpolation::BraceUntil(offset) => {
744 self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None), true)
745 }
746 },
747 LexerMode::DocumentString(kind, label, interpolation) => match &kind {
748 DocumentKind::Heredoc => match &interpolation {
749 Interpolation::None => {
750 let start = self.input.current_position();
751
752 let mut length = 0;
753 let mut last_was_slash = false;
754 let mut only_whitespaces = true;
755 let mut token_kind = TokenKind::StringPart;
756 loop {
757 match self.input.peek(length, 2) {
758 [b'\r', b'\n'] => {
759 length += 2;
760
761 break;
762 }
763 [b'\n' | b'\r', ..] => {
764 length += 1;
765
766 break;
767 }
768 [byte, ..] if byte.is_ascii_whitespace() => {
769 length += 1;
770 }
771 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
772 let until_offset =
773 read_until_end_of_variable_interpolation(&self.input, length + 2);
774
775 self.mode = LexerMode::DocumentString(
776 kind,
777 label,
778 Interpolation::Until(start.offset + until_offset),
779 );
780
781 break;
782 }
783 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
784 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
785
786 self.mode = LexerMode::DocumentString(
787 kind,
788 label,
789 Interpolation::BraceUntil(start.offset + until_offset),
790 );
791
792 break;
793 }
794 [b'\\', ..] => {
795 length += 1;
796 last_was_slash = true;
797 only_whitespaces = false;
798 }
799 [_, ..] => {
800 if only_whitespaces
801 && self.input.peek(length, label.len()) == label
802 && self
803 .input
804 .peek(length + label.len(), 1)
805 .first()
806 .is_none_or(|c| !c.is_ascii_alphanumeric())
807 {
808 length += label.len();
809 token_kind = TokenKind::DocumentEnd;
810
811 break;
812 }
813
814 length += 1;
815 last_was_slash = false;
816 only_whitespaces = false;
817 }
818 [] => {
819 break;
820 }
821 }
822 }
823
824 let buffer = self.input.consume(length);
825 let end = self.input.current_position();
826
827 if TokenKind::DocumentEnd == token_kind {
828 self.mode = LexerMode::Script;
829 }
830
831 Some(Ok(self.token(token_kind, buffer, start, end)))
832 }
833 Interpolation::Until(offset) => {
834 self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None), false)
835 }
836 Interpolation::BraceUntil(offset) => {
837 self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None), true)
838 }
839 },
840 DocumentKind::Nowdoc => {
841 let start = self.input.current_position();
842
843 let mut length = 0;
844 let mut terminated = false;
845 let mut only_whitespaces = true;
846
847 loop {
848 match self.input.peek(length, 2) {
849 [b'\r', b'\n'] => {
850 length += 2;
851
852 break;
853 }
854 [b'\n' | b'\r', ..] => {
855 length += 1;
856
857 break;
858 }
859 [byte, ..] if byte.is_ascii_whitespace() => {
860 length += 1;
861 }
862 [_, ..] => {
863 if only_whitespaces
864 && self.input.peek(length, label.len()) == label
865 && self
866 .input
867 .peek(length + label.len(), 1)
868 .first()
869 .is_none_or(|c| !c.is_ascii_alphanumeric())
870 {
871 length += label.len();
872 terminated = true;
873
874 break;
875 }
876
877 only_whitespaces = false;
878 length += 1;
879 }
880 [] => {
881 break;
882 }
883 }
884 }
885
886 let buffer = self.input.consume(length);
887 let end = self.input.current_position();
888
889 if terminated {
890 self.mode = LexerMode::Script;
891
892 return Some(Ok(self.token(TokenKind::DocumentEnd, buffer, start, end)));
893 }
894
895 Some(Ok(self.token(TokenKind::StringPart, buffer, start, end)))
896 }
897 },
898 LexerMode::Halt(stage) => 'halt: {
899 let start = self.input.current_position();
900 if let HaltStage::End = stage {
901 let buffer = self.input.consume_remaining();
902 let end = self.input.current_position();
903
904 break 'halt Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
905 }
906
907 let whitespaces = self.input.consume_whitespaces();
908 if !whitespaces.is_empty() {
909 let end = self.input.current_position();
910
911 break 'halt Some(Ok(self.token(TokenKind::Whitespace, whitespaces, start, end)));
912 }
913
914 match &stage {
915 HaltStage::LookingForLeftParenthesis => {
916 if self.input.is_at(b"(", false) {
917 let buffer = self.input.consume(1);
918 let end = self.input.current_position();
919
920 self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
921
922 Some(Ok(self.token(TokenKind::LeftParenthesis, buffer, start, end)))
923 } else {
924 let byte = self.input.read(1)[0];
925 let position = self.input.current_position();
926 self.input.consume(1);
928 Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
929 }
930 }
931 HaltStage::LookingForRightParenthesis => {
932 if self.input.is_at(b")", false) {
933 let buffer = self.input.consume(1);
934 let end = self.input.current_position();
935
936 self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
937
938 Some(Ok(self.token(TokenKind::RightParenthesis, buffer, start, end)))
939 } else {
940 let byte = self.input.read(1)[0];
941 let position = self.input.current_position();
942 self.input.consume(1);
943 Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
944 }
945 }
946 HaltStage::LookingForTerminator => {
947 if self.input.is_at(b";", false) {
948 let buffer = self.input.consume(1);
949 let end = self.input.current_position();
950
951 self.mode = LexerMode::Halt(HaltStage::End);
952
953 Some(Ok(self.token(TokenKind::Semicolon, buffer, start, end)))
954 } else if self.input.is_at(b"?>", false) {
955 let buffer = self.input.consume(2);
956 let end = self.input.current_position();
957
958 self.mode = LexerMode::Halt(HaltStage::End);
959
960 Some(Ok(self.token(TokenKind::CloseTag, buffer, start, end)))
961 } else {
962 let byte = self.input.read(1)[0];
963 let position = self.input.current_position();
964 self.input.consume(1);
965 Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
966 }
967 }
968 _ => unreachable!(),
969 }
970 }
971 }
972 }
973
974 #[inline]
978 fn scan_identifier_or_keyword_info(&self) -> (TokenKind, usize) {
979 let (mut length, ended_with_slash) = self.input.scan_identifier(0);
980
981 if !ended_with_slash {
982 match length {
983 6 if self.input.is_at(b"public(set)", true) => {
984 return (TokenKind::PublicSet, 11);
985 }
986 7 if self.input.is_at(b"private(set)", true) => {
987 return (TokenKind::PrivateSet, 12);
988 }
989 9 if self.input.is_at(b"protected(set)", true) => {
990 return (TokenKind::ProtectedSet, 14);
991 }
992 _ => {}
993 }
994 }
995
996 if !ended_with_slash && let Some(kind) = internal::keyword::lookup_keyword(self.input.read(length)) {
997 return (kind, length);
998 }
999
1000 let mut slashes = 0;
1001 let mut last_was_slash = false;
1002 loop {
1003 match self.input.peek(length, 1) {
1004 [b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF] if last_was_slash => {
1005 length += 1;
1006 last_was_slash = false;
1007 }
1008 [b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | 0x80..=0xFF] if !last_was_slash => {
1009 length += 1;
1010 }
1011 [b'\\'] if !self.interpolating || self.brace_interpolating => {
1012 if last_was_slash {
1013 length -= 1;
1014 slashes -= 1;
1015 last_was_slash = false;
1016 break;
1017 }
1018
1019 length += 1;
1020 slashes += 1;
1021 last_was_slash = true;
1022 }
1023 _ => {
1024 break;
1025 }
1026 }
1027 }
1028
1029 if last_was_slash {
1030 length -= 1;
1031 slashes -= 1;
1032 }
1033
1034 let kind = if slashes > 0 { TokenKind::QualifiedIdentifier } else { TokenKind::Identifier };
1035
1036 (kind, length)
1037 }
1038
1039 #[inline]
1040 fn token(&self, kind: TokenKind, v: &'input [u8], start: Position, _end: Position) -> Token<'input> {
1041 let value = unsafe { std::str::from_utf8_unchecked(v) };
1046
1047 Token { kind, start, value }
1048 }
1049
1050 #[inline]
1051 fn interpolation(
1052 &mut self,
1053 end_offset: u32,
1054 post_interpolation_mode: LexerMode<'input>,
1055 brace: bool,
1056 ) -> Option<Result<Token<'input>, SyntaxError>> {
1057 self.mode = LexerMode::Script;
1058
1059 let was_interpolating = self.interpolating;
1060 self.interpolating = true;
1061 let was_brace_interpolating = self.brace_interpolating;
1062 self.brace_interpolating = brace;
1064
1065 loop {
1066 let subsequent_token = self.advance()?.ok()?;
1067 let token_start = subsequent_token.start.offset;
1069 let token_end = token_start + subsequent_token.value.len() as u32;
1070 let is_final_token = token_start <= end_offset && end_offset <= token_end;
1071
1072 self.buffer.push_back(subsequent_token);
1073
1074 if is_final_token {
1075 break;
1076 }
1077 }
1078
1079 self.mode = post_interpolation_mode;
1080 self.interpolating = was_interpolating;
1081 self.brace_interpolating = was_brace_interpolating;
1082
1083 self.advance()
1084 }
1085}
1086
// Exposes the identifier of the file being lexed by delegating to the
// underlying input.
1087impl HasFileId for Lexer<'_> {
 /// Returns the file identifier reported by the lexer's input.
1088 #[inline]
1089 fn file_id(&self) -> FileId {
1090 self.input.file_id()
1091 }
1092}
1093
1094#[inline]
1095fn matches_start_of_heredoc_document(input: &Input) -> bool {
1096 let total = input.len();
1097 let base = input.current_offset();
1098
1099 let mut length = 3;
1101 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1103 length += 1;
1104 }
1105
1106 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1108 return false;
1109 }
1110 length += 1; loop {
1114 let pos = base + length;
1115 if pos >= total {
1116 return false; }
1118
1119 let byte = *input.read_at(pos);
1120 if byte == b'\n' {
1121 return true; } else if byte == b'\r' {
1123 return pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1125 } else if is_part_of_identifier(input.read_at(pos)) {
1126 length += 1;
1127 } else {
1128 return false; }
1130 }
1131}
1132
1133#[inline]
1134fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1135 let total = input.len();
1136 let base = input.current_offset();
1137
1138 let mut length = 3;
1140 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1141 length += 1;
1142 }
1143
1144 if base + length >= total || *input.read_at(base + length) != b'"' {
1146 return false;
1147 }
1148 length += 1;
1149
1150 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1152 return false;
1153 }
1154 length += 1;
1155
1156 let mut terminated = false;
1158 loop {
1159 let pos = base + length;
1160 if pos >= total {
1161 return false;
1162 }
1163 let byte = input.read_at(pos);
1164 if *byte == b'\n' {
1165 return terminated;
1167 } else if *byte == b'\r' {
1168 return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1170 } else if !terminated && is_part_of_identifier(byte) {
1171 length += 1;
1172 } else if !terminated && *byte == b'"' {
1173 terminated = true;
1174 length += 1;
1175 } else {
1176 return false;
1177 }
1178 }
1179}
1180
1181#[inline]
1182fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1183 let total = input.len();
1184 let base = input.current_offset();
1185
1186 let mut length = 3;
1188 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1189 length += 1;
1190 }
1191
1192 if base + length >= total || *input.read_at(base + length) != b'\'' {
1194 return false;
1195 }
1196 length += 1;
1197
1198 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1200 return false;
1201 }
1202 length += 1;
1203
1204 let mut terminated = false;
1206 loop {
1207 let pos = base + length;
1208 if pos >= total {
1209 return false;
1210 }
1211 let byte = *input.read_at(pos);
1212 if byte == b'\n' {
1213 return terminated;
1214 } else if byte == b'\r' {
1215 return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1216 } else if !terminated && is_part_of_identifier(&byte) {
1217 length += 1;
1218 } else if !terminated && byte == b'\'' {
1219 terminated = true;
1220 length += 1;
1221 } else {
1222 return false;
1223 }
1224 }
1225}
1226
1227#[inline]
1228fn matches_literal_double_quote_string(input: &Input) -> bool {
1229 let total = input.len();
1230 let base = input.current_offset();
1231
1232 let mut pos = base + 1;
1234 loop {
1235 if pos >= total {
1236 return true;
1238 }
1239 let byte = *input.read_at(pos);
1240 if byte == b'"' {
1241 return true;
1243 } else if byte == b'\\' {
1244 pos += 2;
1246 continue;
1247 }
1248
1249 if pos + 1 < total {
1252 let next = *input.read_at(pos + 1);
1253 if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1254 return false;
1255 }
1256 }
1257 pos += 1;
1258 }
1259}
1260
1261#[inline]
1262fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1263 let total = input.len();
1264 let base = input.current_offset();
1265
1266 let mut pos = base + 3;
1268 let mut whitespaces = 0;
1269 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1270 whitespaces += 1;
1271 pos += 1;
1272 }
1273
1274 let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1279
1280 let mut label_length = 1; let mut terminated = false; loop {
1283 let pos = base + length;
1284 if pos >= total {
1286 unreachable!("Unexpected end of input while reading heredoc label");
1287 }
1288
1289 let byte = *input.read_at(pos);
1290 if byte == b'\n' {
1291 length += 1;
1293 return (length, whitespaces, label_length);
1294 } else if byte == b'\r' {
1295 if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1297 length += 2;
1298 } else {
1299 length += 1;
1300 }
1301 return (length, whitespaces, label_length);
1302 } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1303 length += 1;
1306 label_length += 1;
1307 } else if double_quoted && !terminated && byte == b'"' {
1308 length += 1;
1310 terminated = true;
1311 } else {
1312 unreachable!("Unexpected character encountered in heredoc label");
1313 }
1314 }
1315}
1316
1317#[inline]
1318fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1319 let total = input.len();
1320 let base = input.current_offset();
1321
1322 let mut pos = base + 3;
1323 let mut whitespaces = 0;
1324 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1325 whitespaces += 1;
1326 pos += 1;
1327 }
1328
1329 let mut length = 3 + whitespaces + 2;
1331
1332 let mut label_length = 1;
1333 let mut terminated = false;
1334 loop {
1335 let pos = base + length;
1336 if pos >= total {
1337 unreachable!("Unexpected end of input while reading nowdoc label");
1338 }
1339 let byte = *input.read_at(pos);
1340
1341 if byte == b'\n' {
1342 length += 1;
1344 return (length, whitespaces, label_length);
1345 } else if byte == b'\r' {
1346 if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1348 length += 2;
1349 } else {
1350 length += 1;
1351 }
1352 return (length, whitespaces, label_length);
1353 } else if is_part_of_identifier(&byte) && !terminated {
1354 length += 1;
1356 label_length += 1;
1357 } else if !terminated && byte == b'\'' {
1358 length += 1;
1360 terminated = true;
1361 } else {
1362 unreachable!("Unexpected character encountered in nowdoc label");
1363 }
1364 }
1365}
1366
1367#[inline]
1368fn read_literal_string(input: &Input, quote: u8) -> (TokenKind, usize) {
1369 let total = input.len();
1370 let start = input.current_offset();
1371 let mut length = 1; let bytes = input.peek(length, total - start - length);
1374 loop {
1375 match memchr2(quote, b'\\', &bytes[length - 1..]) {
1376 Some(pos) => {
1377 let abs_pos = length - 1 + pos;
1378 let byte = bytes[abs_pos];
1379
1380 if byte == b'\\' {
1381 length = abs_pos + 2 + 1; if length > total - start {
1383 return (TokenKind::PartialLiteralString, total - start);
1384 }
1385 } else {
1386 length = abs_pos + 2; return (TokenKind::LiteralString, length);
1388 }
1389 }
1390 None => {
1391 return (TokenKind::PartialLiteralString, total - start);
1393 }
1394 }
1395 }
1396}
1397
1398#[inline]
1399fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> u32 {
1400 let total = input.len();
1401 let base = input.current_offset();
1402 let mut offset = from;
1404
1405 loop {
1406 let abs = base + offset;
1407 if abs >= total {
1408 break;
1410 }
1411
1412 if is_part_of_identifier(input.read_at(abs)) {
1414 offset += 1;
1415 continue;
1416 }
1417
1418 if *input.read_at(abs) == b'[' {
1420 offset += 1;
1421 let mut nesting = 0;
1422 loop {
1423 let abs_inner = base + offset;
1424 if abs_inner >= total {
1425 break;
1426 }
1427 let b = input.read_at(abs_inner);
1428 if *b == b']' {
1429 offset += 1;
1430 if nesting == 0 {
1431 break;
1432 }
1433
1434 nesting -= 1;
1435 } else if *b == b'[' {
1436 offset += 1;
1437 nesting += 1;
1438 } else if b.is_ascii_whitespace() {
1439 break;
1441 } else {
1442 offset += 1;
1443 }
1444 }
1445 break;
1447 }
1448
1449 if base + offset + 2 < total
1451 && *input.read_at(abs) == b'-'
1452 && *input.read_at(base + offset + 1) == b'>'
1453 && is_start_of_identifier(input.read_at(base + offset + 2))
1454 {
1455 offset += 3;
1456 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1458 offset += 1;
1459 }
1460 break;
1461 }
1462
1463 if base + offset + 3 < total
1465 && *input.read_at(abs) == b'?'
1466 && *input.read_at(base + offset + 1) == b'-'
1467 && *input.read_at(base + offset + 2) == b'>'
1468 && is_start_of_identifier(input.read_at(base + offset + 3))
1469 {
1470 offset += 4;
1471 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1472 offset += 1;
1473 }
1474 break;
1475 }
1476
1477 break;
1479 }
1480
1481 offset as u32
1482}
1483
1484#[inline]
1485fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> u32 {
1486 let total = input.len();
1487 let base = input.current_offset();
1488 let mut offset = from;
1489 let mut nesting = 0;
1490
1491 loop {
1492 let abs = base + offset;
1493 if abs >= total {
1494 break;
1495 }
1496 match input.read_at(abs) {
1497 b'}' => {
1498 offset += 1;
1499 if nesting == 0 {
1500 break;
1501 }
1502
1503 nesting -= 1;
1504 }
1505 b'{' => {
1506 offset += 1;
1507 nesting += 1;
1508 }
1509 _ => {
1510 offset += 1;
1511 }
1512 }
1513 }
1514
1515 offset as u32
1516}
1517
1518#[inline]
1521fn scan_multi_line_comment(bytes: &[u8]) -> Option<usize> {
1522 memmem::find(bytes, b"*/").map(|pos| pos + 2)
1524}
1525
1526#[inline]
1530fn scan_single_line_comment(bytes: &[u8]) -> usize {
1531 let mut pos = 0;
1532 while pos < bytes.len() {
1533 match memchr::memchr3(b'\n', b'\r', b'?', &bytes[pos..]) {
1534 Some(offset) => {
1535 let found_pos = pos + offset;
1536 match bytes[found_pos] {
1537 b'\n' | b'\r' => return found_pos,
1538 b'?' => {
1539 if found_pos + 1 < bytes.len() && bytes[found_pos + 1] == b'>' {
1541 if found_pos > 0 && bytes[found_pos - 1].is_ascii_whitespace() {
1543 return found_pos - 1;
1544 }
1545 return found_pos;
1546 }
1547 pos = found_pos + 1;
1549 }
1550 _ => unreachable!(),
1551 }
1552 }
1553 None => return bytes.len(),
1554 }
1555 }
1556
1557 bytes.len()
1558}