1use std::collections::VecDeque;
2use std::fmt::Debug;
3use std::hint::unreachable_unchecked;
4
5use memchr::memchr2;
6use memchr::memmem;
7
8const SIMPLE_TOKEN_TABLE: [Option<TokenKind>; 256] = {
12 let mut table: [Option<TokenKind>; 256] = [None; 256];
13 table[b';' as usize] = Some(TokenKind::Semicolon);
14 table[b',' as usize] = Some(TokenKind::Comma);
15 table[b')' as usize] = Some(TokenKind::RightParenthesis);
16 table[b'[' as usize] = Some(TokenKind::LeftBracket);
17 table[b']' as usize] = Some(TokenKind::RightBracket);
18 table[b'{' as usize] = Some(TokenKind::LeftBrace);
19 table[b'}' as usize] = Some(TokenKind::RightBrace);
20 table[b'~' as usize] = Some(TokenKind::Tilde);
21 table[b'@' as usize] = Some(TokenKind::At);
22 table
23};
24
/// Lookup table, indexed by byte value, telling whether a byte may start an identifier.
///
/// A byte qualifies when it is an ASCII letter, an underscore, or any byte in the
/// range `0x80..=0xFF`.
const IDENT_START_TABLE: [bool; 256] = {
    let mut lookup = [false; 256];

    // `for` loops are not available in const evaluation, hence the manual index loop.
    let mut index = 0usize;
    while index < lookup.len() {
        let byte = index as u8;
        lookup[index] = byte.is_ascii_alphabetic() || byte == b'_' || byte >= 0x80;
        index += 1;
    }

    lookup
};
36
37use mago_database::file::FileId;
38use mago_database::file::HasFileId;
39use mago_span::Position;
40use mago_syntax_core::float_exponent;
41use mago_syntax_core::float_separator;
42use mago_syntax_core::input::Input;
43use mago_syntax_core::number_sign;
44use mago_syntax_core::start_of_binary_number;
45use mago_syntax_core::start_of_float_number;
46use mago_syntax_core::start_of_hexadecimal_number;
47use mago_syntax_core::start_of_identifier;
48use mago_syntax_core::start_of_number;
49use mago_syntax_core::start_of_octal_number;
50use mago_syntax_core::start_of_octal_or_float_number;
51use mago_syntax_core::utils::is_part_of_identifier;
52use mago_syntax_core::utils::is_start_of_identifier;
53use mago_syntax_core::utils::read_digits_of_base;
54
55use crate::error::SyntaxError;
56use crate::lexer::internal::mode::HaltStage;
57use crate::lexer::internal::mode::Interpolation;
58use crate::lexer::internal::mode::LexerMode;
59use crate::lexer::internal::utils::NumberKind;
60use crate::settings::LexerSettings;
61use crate::token::DocumentKind;
62use crate::token::Token;
63use crate::token::TokenKind;
64
65mod internal;
66
/// A lexer that turns an [`Input`] into a stream of [`Token`]s.
///
/// The lexer is mode-driven: the current [`LexerMode`] decides how the next bytes are
/// interpreted (inline text, script code, string bodies, heredoc/nowdoc bodies, or the
/// stages following a halt-compiler keyword).
#[derive(Debug)]
pub struct Lexer<'input> {
    /// The input being tokenized, together with the current read position.
    input: Input<'input>,
    /// Lexer configuration; `enable_short_tags` is consulted while scanning inline text.
    settings: LexerSettings,
    /// The mode that decides how the next token is scanned.
    mode: LexerMode<'input>,
    /// `true` while tokens inside a string interpolation are being collected.
    interpolating: bool,
    /// `true` while the interpolation being collected is a brace interpolation.
    brace_interpolating: bool,
    /// Tokens collected during an interpolation, handed out once it has finished.
    buffer: VecDeque<Token<'input>>,
}
88
89impl<'input> Lexer<'input> {
    /// Number of token slots reserved up front for the interpolation token buffer.
    const BUFFER_INITIAL_CAPACITY: usize = 8;
93
94 pub fn new(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
105 Lexer {
106 input,
107 settings,
108 mode: LexerMode::Inline,
109 interpolating: false,
110 brace_interpolating: false,
111 buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
112 }
113 }
114
115 pub fn scripting(input: Input<'input>, settings: LexerSettings) -> Lexer<'input> {
126 Lexer {
127 input,
128 settings,
129 mode: LexerMode::Script,
130 interpolating: false,
131 brace_interpolating: false,
132 buffer: VecDeque::with_capacity(Self::BUFFER_INITIAL_CAPACITY),
133 }
134 }
135
    /// Returns `true` when the underlying input reports that it has reached its end.
    #[must_use]
    pub fn has_reached_eof(&self) -> bool {
        self.input.has_reached_eof()
    }
143
    /// Returns the current position of the lexer within the underlying input.
    #[inline]
    pub const fn current_position(&self) -> Position {
        self.input.current_position()
    }
149
150 #[inline]
183 pub fn advance(&mut self) -> Option<Result<Token<'input>, SyntaxError>> {
184 if !self.interpolating
186 && let Some(token) = self.buffer.pop_front()
187 {
188 return Some(Ok(token));
189 }
190
191 if self.input.has_reached_eof() {
192 return None;
193 }
194
195 match self.mode {
196 LexerMode::Inline => {
197 let start = self.input.current_position();
198 let offset = self.input.current_offset();
199
200 if offset == 0
202 && self.input.len() >= 2
203 && unsafe { *self.input.read_at_unchecked(0) } == b'#'
204 && unsafe { *self.input.read_at_unchecked(1) } == b'!'
205 {
206 let buffer = self.input.consume_through(b'\n');
207 let end = self.input.current_position();
208
209 return Some(Ok(self.token(TokenKind::InlineShebang, buffer, start, end)));
210 }
211
212 let bytes = self.input.read_remaining();
214
215 if self.settings.enable_short_tags {
216 if let Some(pos) = memchr::memmem::find(bytes, b"<?") {
217 if pos > 0 {
218 let buffer = self.input.consume(pos);
219 let end = self.input.current_position();
220
221 return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
222 }
223
224 if self.input.is_at(b"<?php", true) {
225 let buffer = self.input.consume(5);
226 self.mode = LexerMode::Script;
227 return Some(Ok(self.token(
228 TokenKind::OpenTag,
229 buffer,
230 start,
231 self.input.current_position(),
232 )));
233 }
234
235 if self.input.is_at(b"<?=", false) {
236 let buffer = self.input.consume(3);
237 self.mode = LexerMode::Script;
238 return Some(Ok(self.token(
239 TokenKind::EchoTag,
240 buffer,
241 start,
242 self.input.current_position(),
243 )));
244 }
245
246 let buffer = self.input.consume(2);
247 self.mode = LexerMode::Script;
248 return Some(Ok(self.token(
249 TokenKind::ShortOpenTag,
250 buffer,
251 start,
252 self.input.current_position(),
253 )));
254 }
255 } else {
256 let iter = memchr::memmem::find_iter(bytes, b"<?");
257
258 for pos in iter {
259 let candidate = unsafe { bytes.get_unchecked(pos..) };
261
262 if candidate.len() >= 5
263 && (unsafe { *candidate.get_unchecked(2) } | 0x20) == b'p'
264 && (unsafe { *candidate.get_unchecked(3) } | 0x20) == b'h'
265 && (unsafe { *candidate.get_unchecked(4) } | 0x20) == b'p'
266 {
267 if pos > 0 {
268 let buffer = self.input.consume(pos);
269 let end = self.input.current_position();
270 return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
271 }
272
273 let buffer = self.input.consume(5);
274 self.mode = LexerMode::Script;
275 return Some(Ok(self.token(
276 TokenKind::OpenTag,
277 buffer,
278 start,
279 self.input.current_position(),
280 )));
281 }
282
283 if candidate.len() >= 3 && unsafe { *candidate.get_unchecked(2) } == b'=' {
284 if pos > 0 {
285 let buffer = self.input.consume(pos);
286 let end = self.input.current_position();
287 return Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
288 }
289
290 let buffer = self.input.consume(3);
291 self.mode = LexerMode::Script;
292 return Some(Ok(self.token(
293 TokenKind::EchoTag,
294 buffer,
295 start,
296 self.input.current_position(),
297 )));
298 }
299 }
300 }
301
302 if self.input.has_reached_eof() {
303 return None;
304 }
305
306 let buffer = self.input.consume_remaining();
307 let end = self.input.current_position();
308 Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)))
309 }
310 LexerMode::Script => {
311 let start = self.input.current_position();
312 let whitespaces = self.input.consume_whitespaces();
313 if !whitespaces.is_empty() {
314 return Some(Ok(self.token(
315 TokenKind::Whitespace,
316 whitespaces,
317 start,
318 self.input.current_position(),
319 )));
320 }
321
322 let first_byte = match self.input.read(1).first() {
323 Some(&b) => b,
324 None => {
325 unsafe { unreachable_unchecked() }
327 }
328 };
329
330 if let Some(kind) = SIMPLE_TOKEN_TABLE[first_byte as usize] {
331 let buffer = self.input.consume(1);
332 let end = self.input.current_position();
333 return Some(Ok(self.token(kind, buffer, start, end)));
334 }
335
336 if IDENT_START_TABLE[first_byte as usize] {
337 let (token_kind, len) = self.scan_identifier_or_keyword_info();
338
339 if token_kind == TokenKind::HaltCompiler {
340 self.mode = LexerMode::Halt(HaltStage::LookingForLeftParenthesis);
341 }
342
343 let buffer = self.input.consume(len);
344 let end = self.input.current_position();
345 return Some(Ok(self.token(token_kind, buffer, start, end)));
346 }
347
348 if first_byte == b'$'
349 && let Some(&next) = self.input.read(2).get(1)
350 && IDENT_START_TABLE[next as usize]
351 {
352 let (ident_len, _) = self.input.scan_identifier(1);
353 let buffer = self.input.consume(1 + ident_len);
354 let end = self.input.current_position();
355 return Some(Ok(self.token(TokenKind::Variable, buffer, start, end)));
356 }
357
358 let mut document_label: &[u8] = &[];
359
360 let (token_kind, len) = match self.input.read(3) {
361 [b'!', b'=', b'='] => (TokenKind::BangEqualEqual, 3),
362 [b'?', b'?', b'='] => (TokenKind::QuestionQuestionEqual, 3),
363 [b'?', b'-', b'>'] => (TokenKind::QuestionMinusGreaterThan, 3),
364 [b'=', b'=', b'='] => (TokenKind::EqualEqualEqual, 3),
365 [b'.', b'.', b'.'] => (TokenKind::DotDotDot, 3),
366 [b'<', b'=', b'>'] => (TokenKind::LessThanEqualGreaterThan, 3),
367 [b'<', b'<', b'='] => (TokenKind::LeftShiftEqual, 3),
368 [b'>', b'>', b'='] => (TokenKind::RightShiftEqual, 3),
369 [b'*', b'*', b'='] => (TokenKind::AsteriskAsteriskEqual, 3),
370 [b'<', b'<', b'<'] if matches_start_of_heredoc_document(&self.input) => {
371 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, false);
372
373 document_label = self.input.peek(3 + whitespaces, label_length);
374
375 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
376 }
377 [b'<', b'<', b'<'] if matches_start_of_double_quote_heredoc_document(&self.input) => {
378 let (length, whitespaces, label_length) = read_start_of_heredoc_document(&self.input, true);
379
380 document_label = self.input.peek(4 + whitespaces, label_length);
381
382 (TokenKind::DocumentStart(DocumentKind::Heredoc), length)
383 }
384 [b'<', b'<', b'<'] if matches_start_of_nowdoc_document(&self.input) => {
385 let (length, whitespaces, label_length) = read_start_of_nowdoc_document(&self.input);
386
387 document_label = self.input.peek(4 + whitespaces, label_length);
388
389 (TokenKind::DocumentStart(DocumentKind::Nowdoc), length)
390 }
391 [b'!', b'=', ..] => (TokenKind::BangEqual, 2),
392 [b'&', b'&', ..] => (TokenKind::AmpersandAmpersand, 2),
393 [b'&', b'=', ..] => (TokenKind::AmpersandEqual, 2),
394 [b'.', b'=', ..] => (TokenKind::DotEqual, 2),
395 [b'?', b'?', ..] => (TokenKind::QuestionQuestion, 2),
396 [b'?', b'>', ..] => (TokenKind::CloseTag, 2),
397 [b'=', b'>', ..] => (TokenKind::EqualGreaterThan, 2),
398 [b'=', b'=', ..] => (TokenKind::EqualEqual, 2),
399 [b'+', b'+', ..] => (TokenKind::PlusPlus, 2),
400 [b'+', b'=', ..] => (TokenKind::PlusEqual, 2),
401 [b'%', b'=', ..] => (TokenKind::PercentEqual, 2),
402 [b'-', b'-', ..] => (TokenKind::MinusMinus, 2),
403 [b'-', b'>', ..] => (TokenKind::MinusGreaterThan, 2),
404 [b'-', b'=', ..] => (TokenKind::MinusEqual, 2),
405 [b'<', b'<', ..] => (TokenKind::LeftShift, 2),
406 [b'<', b'=', ..] => (TokenKind::LessThanEqual, 2),
407 [b'<', b'>', ..] => (TokenKind::LessThanGreaterThan, 2),
408 [b'>', b'>', ..] => (TokenKind::RightShift, 2),
409 [b'>', b'=', ..] => (TokenKind::GreaterThanEqual, 2),
410 [b':', b':', ..] => (TokenKind::ColonColon, 2),
411 [b'#', b'[', ..] => (TokenKind::HashLeftBracket, 2),
412 [b'|', b'=', ..] => (TokenKind::PipeEqual, 2),
413 [b'|', b'|', ..] => (TokenKind::PipePipe, 2),
414 [b'/', b'=', ..] => (TokenKind::SlashEqual, 2),
415 [b'^', b'=', ..] => (TokenKind::CaretEqual, 2),
416 [b'*', b'*', ..] => (TokenKind::AsteriskAsterisk, 2),
417 [b'*', b'=', ..] => (TokenKind::AsteriskEqual, 2),
418 [b'|', b'>', ..] => (TokenKind::PipeGreaterThan, 2),
419 [b'/', b'/', ..] => {
420 let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
421 let comment_len = scan_single_line_comment(remaining);
422 (TokenKind::SingleLineComment, 2 + comment_len)
423 }
424 [b'/', b'*', asterisk] => {
425 let remaining = self.input.peek(2, self.input.len() - self.input.current_offset());
426 match scan_multi_line_comment(remaining) {
427 Some(len) => {
428 let is_docblock = asterisk == &b'*' && len > 2;
429 if is_docblock {
430 (TokenKind::DocBlockComment, len + 2)
431 } else {
432 (TokenKind::MultiLineComment, len + 2)
433 }
434 }
435 None => {
436 self.input.consume(remaining.len() + 2);
437 return Some(Err(SyntaxError::UnexpectedEndOfFile(
438 self.file_id(),
439 self.input.current_position(),
440 )));
441 }
442 }
443 }
444 [b'\\', start_of_identifier!(), ..] => {
445 let mut length = 1;
446 loop {
447 let (ident_len, ends_with_ns) = self.input.scan_identifier(length);
448 length += ident_len;
449 if ends_with_ns {
450 length += 1; } else {
452 break;
453 }
454 }
455
456 (TokenKind::FullyQualifiedIdentifier, length)
457 }
458 [b'$', b'{', ..] => (TokenKind::DollarLeftBrace, 2),
459 [b'$', ..] => (TokenKind::Dollar, 1),
460 [b'!', ..] => (TokenKind::Bang, 1),
461 [b'&', ..] => (TokenKind::Ampersand, 1),
462 [b'?', ..] => (TokenKind::Question, 1),
463 [b'=', ..] => (TokenKind::Equal, 1),
464 [b'`', ..] => (TokenKind::Backtick, 1),
465 [b'+', ..] => (TokenKind::Plus, 1),
466 [b'%', ..] => (TokenKind::Percent, 1),
467 [b'-', ..] => (TokenKind::Minus, 1),
468 [b'<', ..] => (TokenKind::LessThan, 1),
469 [b'>', ..] => (TokenKind::GreaterThan, 1),
470 [b':', ..] => (TokenKind::Colon, 1),
471 [b'|', ..] => (TokenKind::Pipe, 1),
472 [b'^', ..] => (TokenKind::Caret, 1),
473 [b'*', ..] => (TokenKind::Asterisk, 1),
474 [b'/', ..] => (TokenKind::Slash, 1),
475 [quote @ b'\'', ..] => read_literal_string(&self.input, *quote),
476 [quote @ b'"', ..] if matches_literal_double_quote_string(&self.input) => {
477 read_literal_string(&self.input, *quote)
478 }
479 [b'"', ..] => (TokenKind::DoubleQuote, 1),
480 [b'(', ..] => 'parenthesis: {
481 let mut peek_offset = 1;
482 while let Some(&b) = self.input.read(peek_offset + 1).get(peek_offset) {
483 if b.is_ascii_whitespace() {
484 peek_offset += 1;
485 } else {
486 let lower = b | 0x20; if !matches!(lower, b'i' | b'b' | b'f' | b'd' | b'r' | b's' | b'a' | b'o' | b'u' | b'v')
489 {
490 break 'parenthesis (TokenKind::LeftParenthesis, 1);
491 }
492 break;
493 }
494 }
495
496 for (value, kind) in internal::consts::CAST_TYPES {
497 if let Some(length) = self.input.match_sequence_ignore_whitespace(value, true) {
498 break 'parenthesis (kind, length);
499 }
500 }
501
502 (TokenKind::LeftParenthesis, 1)
503 }
504 [b'#', ..] => {
505 let remaining = self.input.peek(1, self.input.len() - self.input.current_offset());
506 let comment_len = scan_single_line_comment(remaining);
507 (TokenKind::HashComment, 1 + comment_len)
508 }
509 [b'\\', ..] => (TokenKind::NamespaceSeparator, 1),
510 [b'.', start_of_number!(), ..] => {
511 let mut length = read_digits_of_base(&self.input, 2, 10);
512 if let float_exponent!() = self.input.peek(length, 1) {
513 let mut exp_length = length + 1;
514 if let number_sign!() = self.input.peek(exp_length, 1) {
515 exp_length += 1;
516 }
517
518 let after_exp = read_digits_of_base(&self.input, exp_length, 10);
519 if after_exp > exp_length {
520 length = after_exp;
521 }
522 }
523
524 (TokenKind::LiteralFloat, length)
525 }
526 [start_of_number!(), ..] => 'number: {
527 let mut length = 1;
528
529 let (base, kind): (u8, NumberKind) = match self.input.read(3) {
530 start_of_binary_number!() => {
531 length += 1;
532
533 (2, NumberKind::Integer)
534 }
535 start_of_octal_number!() => {
536 length += 1;
537
538 (8, NumberKind::Integer)
539 }
540 start_of_hexadecimal_number!() => {
541 length += 1;
542
543 (16, NumberKind::Integer)
544 }
545 start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
546 start_of_float_number!() => (10, NumberKind::Float),
547 _ => (10, NumberKind::IntegerOrFloat),
548 };
549
550 if kind != NumberKind::Float {
551 length = read_digits_of_base(&self.input, length, base);
552
553 if kind == NumberKind::Integer {
554 break 'number (TokenKind::LiteralInteger, length);
555 }
556 }
557
558 let is_float = matches!(self.input.peek(length, 3), float_separator!());
559
560 if !is_float {
561 break 'number (TokenKind::LiteralInteger, length);
562 }
563
564 if let [b'.'] = self.input.peek(length, 1) {
565 length += 1;
566 length = read_digits_of_base(&self.input, length, 10);
567 }
568
569 if let float_exponent!() = self.input.peek(length, 1) {
570 let mut exp_length = length + 1;
572 if let number_sign!() = self.input.peek(exp_length, 1) {
573 exp_length += 1;
574 }
575 let after_exp = read_digits_of_base(&self.input, exp_length, 10);
576 if after_exp > exp_length {
577 length = after_exp;
579 }
580 }
581
582 (TokenKind::LiteralFloat, length)
583 }
584 [b'.', ..] => (TokenKind::Dot, 1),
585 [unknown_byte, ..] => {
586 let position = self.input.current_position();
587 self.input.consume(1);
588
589 return Some(Err(SyntaxError::UnrecognizedToken(self.file_id(), *unknown_byte, position)));
590 }
591 [] => {
592 unreachable!()
595 }
596 };
597
598 self.mode = match token_kind {
599 TokenKind::DoubleQuote => LexerMode::DoubleQuoteString(Interpolation::None),
600 TokenKind::Backtick => LexerMode::ShellExecuteString(Interpolation::None),
601 TokenKind::CloseTag => LexerMode::Inline,
602 TokenKind::HaltCompiler => LexerMode::Halt(HaltStage::LookingForLeftParenthesis),
603 TokenKind::DocumentStart(document_kind) => {
604 LexerMode::DocumentString(document_kind, document_label, Interpolation::None)
605 }
606 _ => LexerMode::Script,
607 };
608
609 let buffer = self.input.consume(len);
610 let end = self.input.current_position();
611
612 Some(Ok(self.token(token_kind, buffer, start, end)))
613 }
614 LexerMode::DoubleQuoteString(interpolation) => match &interpolation {
615 Interpolation::None => {
616 let start = self.input.current_position();
617
618 let mut length = 0;
619 let mut last_was_slash = false;
620 let mut token_kind = TokenKind::StringPart;
621 loop {
622 match self.input.peek(length, 2) {
623 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
624 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
625
626 self.mode =
627 LexerMode::DoubleQuoteString(Interpolation::Until(start.offset + until_offset));
628
629 break;
630 }
631 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
632 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
633
634 self.mode = LexerMode::DoubleQuoteString(Interpolation::BraceUntil(
635 start.offset + until_offset,
636 ));
637
638 break;
639 }
640 [b'\\', ..] => {
641 length += 1;
642
643 last_was_slash = !last_was_slash;
644 }
645 [b'"', ..] if !last_was_slash => {
646 if length == 0 {
647 length += 1;
648 token_kind = TokenKind::DoubleQuote;
649
650 break;
651 }
652
653 break;
654 }
655 [_, ..] => {
656 length += 1;
657 last_was_slash = false;
658 }
659 [] => {
660 break;
661 }
662 }
663 }
664
665 let buffer = self.input.consume(length);
666 let end = self.input.current_position();
667
668 if TokenKind::DoubleQuote == token_kind {
669 self.mode = LexerMode::Script;
670 }
671
672 Some(Ok(self.token(token_kind, buffer, start, end)))
673 }
674 Interpolation::Until(offset) => {
675 self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None), false)
676 }
677 Interpolation::BraceUntil(offset) => {
678 self.interpolation(*offset, LexerMode::DoubleQuoteString(Interpolation::None), true)
679 }
680 },
681 LexerMode::ShellExecuteString(interpolation) => match &interpolation {
682 Interpolation::None => {
683 let start = self.input.current_position();
684
685 let mut length = 0;
686 let mut last_was_slash = false;
687 let mut token_kind = TokenKind::StringPart;
688 loop {
689 match self.input.peek(length, 2) {
690 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
691 let until_offset = read_until_end_of_variable_interpolation(&self.input, length + 2);
692
693 self.mode =
694 LexerMode::ShellExecuteString(Interpolation::Until(start.offset + until_offset));
695
696 break;
697 }
698 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
699 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
700
701 self.mode = LexerMode::ShellExecuteString(Interpolation::BraceUntil(
702 start.offset + until_offset,
703 ));
704
705 break;
706 }
707 [b'\\', ..] => {
708 length += 1;
709 last_was_slash = true;
710 }
711 [b'`', ..] if !last_was_slash => {
712 if length == 0 {
713 length += 1;
714 token_kind = TokenKind::Backtick;
715
716 break;
717 }
718
719 break;
720 }
721 [_, ..] => {
722 length += 1;
723 last_was_slash = false;
724 }
725 [] => {
726 break;
727 }
728 }
729 }
730
731 let buffer = self.input.consume(length);
732 let end = self.input.current_position();
733
734 if TokenKind::Backtick == token_kind {
735 self.mode = LexerMode::Script;
736 }
737
738 Some(Ok(self.token(token_kind, buffer, start, end)))
739 }
740 Interpolation::Until(offset) => {
741 self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None), false)
742 }
743 Interpolation::BraceUntil(offset) => {
744 self.interpolation(*offset, LexerMode::ShellExecuteString(Interpolation::None), true)
745 }
746 },
747 LexerMode::DocumentString(kind, label, interpolation) => match &kind {
748 DocumentKind::Heredoc => match &interpolation {
749 Interpolation::None => {
750 let start = self.input.current_position();
751
752 let mut length = 0;
753 let mut last_was_slash = false;
754 let mut only_whitespaces = true;
755 let mut token_kind = TokenKind::StringPart;
756 loop {
757 match self.input.peek(length, 2) {
758 [b'\r', b'\n'] => {
759 length += 2;
760
761 break;
762 }
763 [b'\n' | b'\r', ..] => {
764 length += 1;
765
766 break;
767 }
768 [byte, ..] if byte.is_ascii_whitespace() => {
769 length += 1;
770 }
771 [b'$', start_of_identifier!(), ..] if !last_was_slash => {
772 let until_offset =
773 read_until_end_of_variable_interpolation(&self.input, length + 2);
774
775 self.mode = LexerMode::DocumentString(
776 kind,
777 label,
778 Interpolation::Until(start.offset + until_offset),
779 );
780
781 break;
782 }
783 [b'{', b'$', ..] | [b'$', b'{', ..] if !last_was_slash => {
784 let until_offset = read_until_end_of_brace_interpolation(&self.input, length + 2);
785
786 self.mode = LexerMode::DocumentString(
787 kind,
788 label,
789 Interpolation::BraceUntil(start.offset + until_offset),
790 );
791
792 break;
793 }
794 [b'\\', ..] => {
795 length += 1;
796 last_was_slash = true;
797 only_whitespaces = false;
798 }
799 [_, ..] => {
800 if only_whitespaces
801 && self.input.peek(length, label.len()) == label
802 && self
803 .input
804 .peek(length + label.len(), 1)
805 .first()
806 .is_none_or(|c| !c.is_ascii_alphanumeric())
807 {
808 length += label.len();
809 token_kind = TokenKind::DocumentEnd;
810
811 break;
812 }
813
814 length += 1;
815 last_was_slash = false;
816 only_whitespaces = false;
817 }
818 [] => {
819 break;
820 }
821 }
822 }
823
824 let buffer = self.input.consume(length);
825 let end = self.input.current_position();
826
827 if TokenKind::DocumentEnd == token_kind {
828 self.mode = LexerMode::Script;
829 }
830
831 Some(Ok(self.token(token_kind, buffer, start, end)))
832 }
833 Interpolation::Until(offset) => {
834 self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None), false)
835 }
836 Interpolation::BraceUntil(offset) => {
837 self.interpolation(*offset, LexerMode::DocumentString(kind, label, Interpolation::None), true)
838 }
839 },
840 DocumentKind::Nowdoc => {
841 let start = self.input.current_position();
842
843 let mut length = 0;
844 let mut terminated = false;
845 let mut only_whitespaces = true;
846
847 loop {
848 match self.input.peek(length, 2) {
849 [b'\r', b'\n'] => {
850 length += 2;
851
852 break;
853 }
854 [b'\n' | b'\r', ..] => {
855 length += 1;
856
857 break;
858 }
859 [byte, ..] if byte.is_ascii_whitespace() => {
860 length += 1;
861 }
862 [_, ..] => {
863 if only_whitespaces
864 && self.input.peek(length, label.len()) == label
865 && self
866 .input
867 .peek(length + label.len(), 1)
868 .first()
869 .is_none_or(|c| !c.is_ascii_alphanumeric())
870 {
871 length += label.len();
872 terminated = true;
873
874 break;
875 }
876
877 only_whitespaces = false;
878 length += 1;
879 }
880 [] => {
881 break;
882 }
883 }
884 }
885
886 let buffer = self.input.consume(length);
887 let end = self.input.current_position();
888
889 if terminated {
890 self.mode = LexerMode::Script;
891
892 return Some(Ok(self.token(TokenKind::DocumentEnd, buffer, start, end)));
893 }
894
895 Some(Ok(self.token(TokenKind::StringPart, buffer, start, end)))
896 }
897 },
898 LexerMode::Halt(stage) => 'halt: {
899 let start = self.input.current_position();
900 if let HaltStage::End = stage {
901 let buffer = self.input.consume_remaining();
902 let end = self.input.current_position();
903
904 break 'halt Some(Ok(self.token(TokenKind::InlineText, buffer, start, end)));
905 }
906
907 let whitespaces = self.input.consume_whitespaces();
908 if !whitespaces.is_empty() {
909 let end = self.input.current_position();
910
911 break 'halt Some(Ok(self.token(TokenKind::Whitespace, whitespaces, start, end)));
912 }
913
914 match &stage {
915 HaltStage::LookingForLeftParenthesis => {
916 if self.input.is_at(b"(", false) {
917 let buffer = self.input.consume(1);
918 let end = self.input.current_position();
919
920 self.mode = LexerMode::Halt(HaltStage::LookingForRightParenthesis);
921
922 Some(Ok(self.token(TokenKind::LeftParenthesis, buffer, start, end)))
923 } else {
924 let byte = self.input.read(1)[0];
925 let position = self.input.current_position();
926 self.input.consume(1);
928 Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
929 }
930 }
931 HaltStage::LookingForRightParenthesis => {
932 if self.input.is_at(b")", false) {
933 let buffer = self.input.consume(1);
934 let end = self.input.current_position();
935
936 self.mode = LexerMode::Halt(HaltStage::LookingForTerminator);
937
938 Some(Ok(self.token(TokenKind::RightParenthesis, buffer, start, end)))
939 } else {
940 let byte = self.input.read(1)[0];
941 let position = self.input.current_position();
942 self.input.consume(1);
943 Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
944 }
945 }
946 HaltStage::LookingForTerminator => {
947 if self.input.is_at(b";", false) {
948 let buffer = self.input.consume(1);
949 let end = self.input.current_position();
950
951 self.mode = LexerMode::Halt(HaltStage::End);
952
953 Some(Ok(self.token(TokenKind::Semicolon, buffer, start, end)))
954 } else if self.input.is_at(b"?>", false) {
955 let buffer = self.input.consume(2);
956 let end = self.input.current_position();
957
958 self.mode = LexerMode::Halt(HaltStage::End);
959
960 Some(Ok(self.token(TokenKind::CloseTag, buffer, start, end)))
961 } else {
962 let byte = self.input.read(1)[0];
963 let position = self.input.current_position();
964 self.input.consume(1);
965 Some(Err(SyntaxError::UnexpectedToken(self.file_id(), byte, position)))
966 }
967 }
968 _ => unreachable!(),
969 }
970 }
971 }
972 }
973
974 #[inline]
978 fn scan_identifier_or_keyword_info(&self) -> (TokenKind, usize) {
979 let (mut length, ended_with_slash) = self.input.scan_identifier(0);
980
981 if !ended_with_slash {
982 match length {
983 6 => {
984 if self.input.is_at(b"public(set)", true) {
985 return (TokenKind::PublicSet, 11);
986 }
987 }
988 7 => {
989 if self.input.is_at(b"private(set)", true) {
990 return (TokenKind::PrivateSet, 12);
991 }
992 }
993 9 => {
994 if self.input.is_at(b"protected(set)", true) {
995 return (TokenKind::ProtectedSet, 14);
996 }
997 }
998 _ => {}
999 }
1000 }
1001
1002 if !ended_with_slash && let Some(kind) = internal::keyword::lookup_keyword(self.input.read(length)) {
1003 return (kind, length);
1004 }
1005
1006 let mut slashes = 0;
1007 let mut last_was_slash = false;
1008 loop {
1009 match self.input.peek(length, 1) {
1010 [b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF] if last_was_slash => {
1011 length += 1;
1012 last_was_slash = false;
1013 }
1014 [b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | 0x80..=0xFF] if !last_was_slash => {
1015 length += 1;
1016 }
1017 [b'\\'] if !self.interpolating || self.brace_interpolating => {
1018 if last_was_slash {
1019 length -= 1;
1020 slashes -= 1;
1021 last_was_slash = false;
1022 break;
1023 }
1024
1025 length += 1;
1026 slashes += 1;
1027 last_was_slash = true;
1028 }
1029 _ => {
1030 break;
1031 }
1032 }
1033 }
1034
1035 if last_was_slash {
1036 length -= 1;
1037 slashes -= 1;
1038 }
1039
1040 let kind = if slashes > 0 { TokenKind::QualifiedIdentifier } else { TokenKind::Identifier };
1041
1042 (kind, length)
1043 }
1044
1045 #[inline]
1046 fn token(&self, kind: TokenKind, v: &'input [u8], start: Position, _end: Position) -> Token<'input> {
1047 let value = unsafe { std::str::from_utf8_unchecked(v) };
1052
1053 Token { kind, start, value }
1054 }
1055
1056 #[inline]
1057 fn interpolation(
1058 &mut self,
1059 end_offset: u32,
1060 post_interpolation_mode: LexerMode<'input>,
1061 brace: bool,
1062 ) -> Option<Result<Token<'input>, SyntaxError>> {
1063 self.mode = LexerMode::Script;
1064
1065 let was_interpolating = self.interpolating;
1066 self.interpolating = true;
1067 let was_brace_interpolating = self.brace_interpolating;
1068 self.brace_interpolating = brace;
1070
1071 loop {
1072 let subsequent_token = self.advance()?.ok()?;
1073 let token_start = subsequent_token.start.offset;
1075 let token_end = token_start + subsequent_token.value.len() as u32;
1076 let is_final_token = token_start <= end_offset && end_offset <= token_end;
1077
1078 self.buffer.push_back(subsequent_token);
1079
1080 if is_final_token {
1081 break;
1082 }
1083 }
1084
1085 self.mode = post_interpolation_mode;
1086 self.interpolating = was_interpolating;
1087 self.brace_interpolating = was_brace_interpolating;
1088
1089 self.advance()
1090 }
1091}
1092
/// The lexer reports the file identifier of the input it is reading.
impl HasFileId for Lexer<'_> {
    /// Returns the file identifier of the underlying input.
    #[inline]
    fn file_id(&self) -> FileId {
        self.input.file_id()
    }
}
1099
1100#[inline]
1101fn matches_start_of_heredoc_document(input: &Input) -> bool {
1102 let total = input.len();
1103 let base = input.current_offset();
1104
1105 let mut length = 3;
1107 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1109 length += 1;
1110 }
1111
1112 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1114 return false;
1115 }
1116 length += 1; loop {
1120 let pos = base + length;
1121 if pos >= total {
1122 return false; }
1124
1125 let byte = *input.read_at(pos);
1126 if byte == b'\n' {
1127 return true; } else if byte == b'\r' {
1129 return pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1131 } else if is_part_of_identifier(input.read_at(pos)) {
1132 length += 1;
1133 } else {
1134 return false; }
1136 }
1137}
1138
1139#[inline]
1140fn matches_start_of_double_quote_heredoc_document(input: &Input) -> bool {
1141 let total = input.len();
1142 let base = input.current_offset();
1143
1144 let mut length = 3;
1146 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1147 length += 1;
1148 }
1149
1150 if base + length >= total || *input.read_at(base + length) != b'"' {
1152 return false;
1153 }
1154 length += 1;
1155
1156 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1158 return false;
1159 }
1160 length += 1;
1161
1162 let mut terminated = false;
1164 loop {
1165 let pos = base + length;
1166 if pos >= total {
1167 return false;
1168 }
1169 let byte = input.read_at(pos);
1170 if *byte == b'\n' {
1171 return terminated;
1173 } else if *byte == b'\r' {
1174 return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1176 } else if !terminated && is_part_of_identifier(byte) {
1177 length += 1;
1178 } else if !terminated && *byte == b'"' {
1179 terminated = true;
1180 length += 1;
1181 } else {
1182 return false;
1183 }
1184 }
1185}
1186
1187#[inline]
1188fn matches_start_of_nowdoc_document(input: &Input) -> bool {
1189 let total = input.len();
1190 let base = input.current_offset();
1191
1192 let mut length = 3;
1194 while base + length < total && input.read_at(base + length).is_ascii_whitespace() {
1195 length += 1;
1196 }
1197
1198 if base + length >= total || *input.read_at(base + length) != b'\'' {
1200 return false;
1201 }
1202 length += 1;
1203
1204 if base + length >= total || !is_start_of_identifier(input.read_at(base + length)) {
1206 return false;
1207 }
1208 length += 1;
1209
1210 let mut terminated = false;
1212 loop {
1213 let pos = base + length;
1214 if pos >= total {
1215 return false;
1216 }
1217 let byte = *input.read_at(pos);
1218 if byte == b'\n' {
1219 return terminated;
1220 } else if byte == b'\r' {
1221 return terminated && pos + 1 < total && *input.read_at(pos + 1) == b'\n';
1222 } else if !terminated && is_part_of_identifier(&byte) {
1223 length += 1;
1224 } else if !terminated && byte == b'\'' {
1225 terminated = true;
1226 length += 1;
1227 } else {
1228 return false;
1229 }
1230 }
1231}
1232
1233#[inline]
1234fn matches_literal_double_quote_string(input: &Input) -> bool {
1235 let total = input.len();
1236 let base = input.current_offset();
1237
1238 let mut pos = base + 1;
1240 loop {
1241 if pos >= total {
1242 return true;
1244 }
1245 let byte = *input.read_at(pos);
1246 if byte == b'"' {
1247 return true;
1249 } else if byte == b'\\' {
1250 pos += 2;
1252 continue;
1253 }
1254
1255 if pos + 1 < total {
1258 let next = *input.read_at(pos + 1);
1259 if (byte == b'$' && (is_start_of_identifier(&next) || next == b'{')) || (byte == b'{' && next == b'$') {
1260 return false;
1261 }
1262 }
1263 pos += 1;
1264 }
1265}
1266
1267#[inline]
1268fn read_start_of_heredoc_document(input: &Input, double_quoted: bool) -> (usize, usize, usize) {
1269 let total = input.len();
1270 let base = input.current_offset();
1271
1272 let mut pos = base + 3;
1274 let mut whitespaces = 0;
1275 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1276 whitespaces += 1;
1277 pos += 1;
1278 }
1279
1280 let mut length = 3 + whitespaces + if double_quoted { 2 } else { 1 };
1285
1286 let mut label_length = 1; let mut terminated = false; loop {
1289 let pos = base + length;
1290 if pos >= total {
1292 unreachable!("Unexpected end of input while reading heredoc label");
1293 }
1294
1295 let byte = *input.read_at(pos);
1296 if byte == b'\n' {
1297 length += 1;
1299 return (length, whitespaces, label_length);
1300 } else if byte == b'\r' {
1301 if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1303 length += 2;
1304 } else {
1305 length += 1;
1306 }
1307 return (length, whitespaces, label_length);
1308 } else if is_part_of_identifier(&byte) && (!double_quoted || !terminated) {
1309 length += 1;
1312 label_length += 1;
1313 } else if double_quoted && !terminated && byte == b'"' {
1314 length += 1;
1316 terminated = true;
1317 } else {
1318 unreachable!("Unexpected character encountered in heredoc label");
1319 }
1320 }
1321}
1322
1323#[inline]
1324fn read_start_of_nowdoc_document(input: &Input) -> (usize, usize, usize) {
1325 let total = input.len();
1326 let base = input.current_offset();
1327
1328 let mut pos = base + 3;
1329 let mut whitespaces = 0;
1330 while pos < total && input.read_at(pos).is_ascii_whitespace() {
1331 whitespaces += 1;
1332 pos += 1;
1333 }
1334
1335 let mut length = 3 + whitespaces + 2;
1337
1338 let mut label_length = 1;
1339 let mut terminated = false;
1340 loop {
1341 let pos = base + length;
1342 if pos >= total {
1343 unreachable!("Unexpected end of input while reading nowdoc label");
1344 }
1345 let byte = *input.read_at(pos);
1346
1347 if byte == b'\n' {
1348 length += 1;
1350 return (length, whitespaces, label_length);
1351 } else if byte == b'\r' {
1352 if pos + 1 < total && *input.read_at(pos + 1) == b'\n' {
1354 length += 2;
1355 } else {
1356 length += 1;
1357 }
1358 return (length, whitespaces, label_length);
1359 } else if is_part_of_identifier(&byte) && !terminated {
1360 length += 1;
1362 label_length += 1;
1363 } else if !terminated && byte == b'\'' {
1364 length += 1;
1366 terminated = true;
1367 } else {
1368 unreachable!("Unexpected character encountered in nowdoc label");
1369 }
1370 }
1371}
1372
1373#[inline]
1374fn read_literal_string(input: &Input, quote: u8) -> (TokenKind, usize) {
1375 let total = input.len();
1376 let start = input.current_offset();
1377 let mut length = 1; let bytes = input.peek(length, total - start - length);
1380 loop {
1381 match memchr2(quote, b'\\', &bytes[length - 1..]) {
1382 Some(pos) => {
1383 let abs_pos = length - 1 + pos;
1384 let byte = bytes[abs_pos];
1385
1386 if byte == b'\\' {
1387 length = abs_pos + 2 + 1; if length > total - start {
1389 return (TokenKind::PartialLiteralString, total - start);
1390 }
1391 } else {
1392 length = abs_pos + 2; return (TokenKind::LiteralString, length);
1394 }
1395 }
1396 None => {
1397 return (TokenKind::PartialLiteralString, total - start);
1399 }
1400 }
1401 }
1402}
1403
1404#[inline]
1405fn read_until_end_of_variable_interpolation(input: &Input, from: usize) -> u32 {
1406 let total = input.len();
1407 let base = input.current_offset();
1408 let mut offset = from;
1410
1411 loop {
1412 let abs = base + offset;
1413 if abs >= total {
1414 break;
1416 }
1417
1418 if is_part_of_identifier(input.read_at(abs)) {
1420 offset += 1;
1421 continue;
1422 }
1423
1424 if *input.read_at(abs) == b'[' {
1426 offset += 1;
1427 let mut nesting = 0;
1428 loop {
1429 let abs_inner = base + offset;
1430 if abs_inner >= total {
1431 break;
1432 }
1433 let b = input.read_at(abs_inner);
1434 if *b == b']' {
1435 offset += 1;
1436 if nesting == 0 {
1437 break;
1438 }
1439
1440 nesting -= 1;
1441 } else if *b == b'[' {
1442 offset += 1;
1443 nesting += 1;
1444 } else if b.is_ascii_whitespace() {
1445 break;
1447 } else {
1448 offset += 1;
1449 }
1450 }
1451 break;
1453 }
1454
1455 if base + offset + 2 < total
1457 && *input.read_at(abs) == b'-'
1458 && *input.read_at(base + offset + 1) == b'>'
1459 && is_start_of_identifier(input.read_at(base + offset + 2))
1460 {
1461 offset += 3;
1462 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1464 offset += 1;
1465 }
1466 break;
1467 }
1468
1469 if base + offset + 3 < total
1471 && *input.read_at(abs) == b'?'
1472 && *input.read_at(base + offset + 1) == b'-'
1473 && *input.read_at(base + offset + 2) == b'>'
1474 && is_start_of_identifier(input.read_at(base + offset + 3))
1475 {
1476 offset += 4;
1477 while base + offset < total && is_part_of_identifier(input.read_at(base + offset)) {
1478 offset += 1;
1479 }
1480 break;
1481 }
1482
1483 break;
1485 }
1486
1487 offset as u32
1488}
1489
1490#[inline]
1491fn read_until_end_of_brace_interpolation(input: &Input, from: usize) -> u32 {
1492 let total = input.len();
1493 let base = input.current_offset();
1494 let mut offset = from;
1495 let mut nesting = 0;
1496
1497 loop {
1498 let abs = base + offset;
1499 if abs >= total {
1500 break;
1501 }
1502 match input.read_at(abs) {
1503 b'}' => {
1504 offset += 1;
1505 if nesting == 0 {
1506 break;
1507 }
1508
1509 nesting -= 1;
1510 }
1511 b'{' => {
1512 offset += 1;
1513 nesting += 1;
1514 }
1515 _ => {
1516 offset += 1;
1517 }
1518 }
1519 }
1520
1521 offset as u32
1522}
1523
1524#[inline]
1527fn scan_multi_line_comment(bytes: &[u8]) -> Option<usize> {
1528 memmem::find(bytes, b"*/").map(|pos| pos + 2)
1530}
1531
1532#[inline]
1536fn scan_single_line_comment(bytes: &[u8]) -> usize {
1537 let mut pos = 0;
1538 while pos < bytes.len() {
1539 match memchr::memchr3(b'\n', b'\r', b'?', &bytes[pos..]) {
1540 Some(offset) => {
1541 let found_pos = pos + offset;
1542 match bytes[found_pos] {
1543 b'\n' | b'\r' => return found_pos,
1544 b'?' => {
1545 if found_pos + 1 < bytes.len() && bytes[found_pos + 1] == b'>' {
1547 if found_pos > 0 && bytes[found_pos - 1].is_ascii_whitespace() {
1549 return found_pos - 1;
1550 }
1551 return found_pos;
1552 }
1553 pos = found_pos + 1;
1555 }
1556 _ => unreachable!(),
1557 }
1558 }
1559 None => return bytes.len(),
1560 }
1561 }
1562
1563 bytes.len()
1564}