1pub mod token;
2
3use crate::span::Span;
4use memchr::{memchr, memchr2, memchr3};
5use token::{Token, TokenKind};
6
/// Mode selector that callers can install on a lexer through `Lexer::set_mode`.
///
/// A freshly constructed lexer starts in [`LexerMode::Standard`].
// `Eq` is derived alongside `PartialEq`: the enum is fieldless, so equality is
// total, and deriving it lets the type be used wherever an `Eq` bound is needed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexerMode {
    /// Default mode used by `Lexer::new`.
    Standard,
    /// NOTE(review): presumably requests property-name lexing — confirm against the parser.
    LookingForProperty,
    /// NOTE(review): presumably requests variable-name lexing — confirm against the parser.
    LookingForVarName,
}
13
/// Internal lexer state; the lexer keeps a stack of these and the top entry
/// decides which sub-lexer produces the next token.
// `Eq` is derived alongside `PartialEq`: every payload (`Vec<u8>`) is `Eq`, so
// equality is total.
#[derive(Debug, Clone, PartialEq, Eq)]
enum LexerState {
    /// Outside PHP tags: input is scanned for `<?php` / `<?=` and emitted as inline HTML.
    Initial,
    /// Regular PHP code.
    Scripting,
    /// Inside a double-quoted string that contains interpolation.
    DoubleQuotes,
    /// Inside a backtick string.
    Backquote,
    /// Inside a heredoc body; the payload is the closing label.
    Heredoc(Vec<u8>),
    /// Inside a nowdoc body; the payload is the closing label.
    Nowdoc(Vec<u8>),
    /// Lexing the `(`, `)`, `;` that follow `__halt_compiler`.
    HaltCompiler,
    /// Everything after `__halt_compiler();` — the rest of the input is one raw token.
    RawData,
    /// Inside `[...]` after a simple interpolated variable (`"$a[0]"`).
    VarOffset,
    /// Inside `[...]` after a `${name` interpolation.
    VarOffsetDollarCurly,
    /// After a simple interpolated variable that is followed by `->name`.
    LookingForProperty,
    /// Directly after `${` inside an interpolated string.
    LookingForVarName,
}
29
30fn keyword_lookup(text: &[u8]) -> TokenKind {
31 match text {
32 b"or" => TokenKind::LogicalOr,
33 b"and" => TokenKind::LogicalAnd,
34 b"xor" => TokenKind::LogicalXor,
35 b"bool" => TokenKind::TypeBool,
36 b"int" => TokenKind::TypeInt,
37 b"float" => TokenKind::TypeFloat,
38 b"string" => TokenKind::TypeString,
39 b"mixed" => TokenKind::TypeMixed,
40 b"never" => TokenKind::TypeNever,
41 b"null" => TokenKind::TypeNull,
42 b"false" => TokenKind::TypeFalse,
43 b"true" => TokenKind::TypeTrue,
44 b"exit" => TokenKind::Exit,
45 b"die" => TokenKind::Die,
46 b"function" => TokenKind::Function,
47 b"fn" => TokenKind::Fn,
48 b"const" => TokenKind::Const,
49 b"return" => TokenKind::Return,
50 b"yield" => TokenKind::Yield,
51 b"try" => TokenKind::Try,
52 b"catch" => TokenKind::Catch,
53 b"finally" => TokenKind::Finally,
54 b"throw" => TokenKind::Throw,
55 b"if" => TokenKind::If,
56 b"elseif" => TokenKind::ElseIf,
57 b"endif" => TokenKind::EndIf,
58 b"else" => TokenKind::Else,
59 b"insteadof" => TokenKind::Insteadof,
60 b"while" => TokenKind::While,
61 b"endwhile" => TokenKind::EndWhile,
62 b"do" => TokenKind::Do,
63 b"for" => TokenKind::For,
64 b"endfor" => TokenKind::EndFor,
65 b"foreach" => TokenKind::Foreach,
66 b"endforeach" => TokenKind::EndForeach,
67 b"declare" => TokenKind::Declare,
68 b"enddeclare" => TokenKind::EndDeclare,
69 b"instanceof" => TokenKind::InstanceOf,
70 b"as" => TokenKind::As,
71 b"switch" => TokenKind::Switch,
72 b"endswitch" => TokenKind::EndSwitch,
73 b"case" => TokenKind::Case,
74 b"default" => TokenKind::Default,
75 b"break" => TokenKind::Break,
76 b"continue" => TokenKind::Continue,
77 b"goto" => TokenKind::Goto,
78 b"echo" => TokenKind::Echo,
79 b"print" => TokenKind::Print,
80 b"enum" => TokenKind::Enum,
81 b"class" => TokenKind::Class,
82 b"interface" => TokenKind::Interface,
83 b"trait" => TokenKind::Trait,
84 b"extends" => TokenKind::Extends,
85 b"implements" => TokenKind::Implements,
86 b"new" => TokenKind::New,
87 b"clone" => TokenKind::Clone,
88 b"var" => TokenKind::Public,
89 b"public" => TokenKind::Public,
90 b"protected" => TokenKind::Protected,
91 b"private" => TokenKind::Private,
92 b"final" => TokenKind::Final,
93 b"abstract" => TokenKind::Abstract,
94 b"static" => TokenKind::Static,
95 b"readonly" => TokenKind::Readonly,
96 b"namespace" => TokenKind::Namespace,
97 b"use" => TokenKind::Use,
98 b"global" => TokenKind::Global,
99 b"isset" => TokenKind::Isset,
100 b"empty" => TokenKind::Empty,
101 b"__halt_compiler" => TokenKind::HaltCompiler,
102 b"__class__" => TokenKind::ClassC,
103 b"__trait__" => TokenKind::TraitC,
104 b"__function__" => TokenKind::FuncC,
105 b"__method__" => TokenKind::MethodC,
106 b"__line__" => TokenKind::Line,
107 b"__file__" => TokenKind::File,
108 b"__dir__" => TokenKind::Dir,
109 b"__namespace__" => TokenKind::NsC,
110 b"__property__" => TokenKind::PropertyC,
111 b"array" => TokenKind::Array,
112 b"callable" => TokenKind::TypeCallable,
113 b"iterable" => TokenKind::TypeIterable,
114 b"void" => TokenKind::TypeVoid,
115 b"object" => TokenKind::TypeObject,
116 b"match" => TokenKind::Match,
117 b"list" => TokenKind::List,
118 b"include" => TokenKind::Include,
119 b"include_once" => TokenKind::IncludeOnce,
120 b"require" => TokenKind::Require,
121 b"require_once" => TokenKind::RequireOnce,
122 b"eval" => TokenKind::Eval,
123 b"unset" => TokenKind::Unset,
124 _ => TokenKind::Identifier,
125 }
126}
127
128#[derive(Debug, Clone)]
129pub struct Lexer<'src> {
130 input: &'src [u8],
131 cursor: usize,
132 state_stack: Vec<LexerState>,
133 mode: LexerMode,
134}
135
136impl<'src> Lexer<'src> {
137 pub fn new(input: &'src [u8]) -> Self {
138 let mut cursor = 0;
139 if input.starts_with(b"#!") {
140 if let Some(pos) = memchr(b'\n', input) {
141 cursor = pos + 1;
142 } else {
143 cursor = input.len();
144 }
145 }
146
147 Self {
148 input,
149 cursor,
150 state_stack: vec![LexerState::Initial],
151 mode: LexerMode::Standard,
152 }
153 }
154
155 pub fn set_mode(&mut self, mode: LexerMode) {
156 self.mode = mode;
157 }
158
159 pub fn slice(&self, span: Span) -> &'src [u8] {
160 &self.input[span.start..span.end]
161 }
162
163 fn peek(&self) -> Option<u8> {
164 if self.cursor < self.input.len() {
165 Some(self.input[self.cursor])
166 } else {
167 None
168 }
169 }
170
171 fn advance(&mut self) {
172 self.cursor += 1;
173 }
174
175 fn advance_n(&mut self, n: usize) {
176 self.cursor += n;
177 }
178
179 fn skip_whitespace(&mut self) {
180 while self.cursor < self.input.len() {
181 if self.input[self.cursor].is_ascii_whitespace() {
182 self.cursor += 1;
183 } else {
184 break;
185 }
186 }
187 }
188
189 fn read_identifier(&mut self) {
190 while self.cursor < self.input.len() {
191 let c = self.input[self.cursor];
192 if c.is_ascii_alphanumeric() || c == b'_' || c >= 0x80 {
193 self.cursor += 1;
194 } else {
195 break;
196 }
197 }
198 }
199
200 fn read_number(&mut self) -> TokenKind {
201 let mut is_float = false;
202
203 if self.peek() == Some(b'0') {
205 self.advance();
206 if let Some(c) = self.peek() {
207 if c == b'x' || c == b'X' {
208 self.advance();
209 while let Some(c) = self.peek() {
210 if c.is_ascii_hexdigit() || c == b'_' {
211 self.advance();
212 } else {
213 break;
214 }
215 }
216 return TokenKind::LNumber;
217 } else if c == b'b' || c == b'B' {
218 self.advance();
219 while let Some(c) = self.peek() {
220 if c == b'0' || c == b'1' || c == b'_' {
221 self.advance();
222 } else {
223 break;
224 }
225 }
226 return TokenKind::LNumber;
227 } else if c == b'o' || c == b'O' {
228 self.advance();
229 while let Some(c) = self.peek() {
230 if (b'0'..=b'7').contains(&c) || c == b'_' {
231 self.advance();
232 } else {
233 break;
234 }
235 }
236 return TokenKind::LNumber;
237 }
238 }
239 }
240
241 while let Some(c) = self.peek() {
242 if c.is_ascii_digit() || c == b'_' {
243 self.advance();
244 } else if c == b'.' {
245 if is_float {
246 break; }
248 is_float = true;
249 self.advance();
250 } else if c == b'e' || c == b'E' {
251 is_float = true;
252 self.advance();
253 if let Some(next) = self.peek()
254 && (next == b'+' || next == b'-')
255 {
256 self.advance();
257 }
258 } else {
259 break;
260 }
261 }
262
263 if is_float {
264 TokenKind::DNumber
265 } else {
266 TokenKind::LNumber
267 }
268 }
269
270 fn consume_single_line_comment(&mut self) -> TokenKind {
271 while self.cursor < self.input.len() {
272 let remaining = &self.input[self.cursor..];
273 match memchr3(b'\n', b'\r', b'?', remaining) {
274 Some(pos) => {
275 self.cursor += pos;
276 let c = self.input[self.cursor];
277 if c == b'?' {
278 if self.input.get(self.cursor + 1) == Some(&b'>') {
279 break;
280 } else {
281 self.cursor += 1;
282 }
283 } else {
284 break;
285 }
286 }
287 None => {
288 self.cursor = self.input.len();
289 break;
290 }
291 }
292 }
293 TokenKind::Comment
294 }
295
296 fn consume_multi_line_comment(&mut self) -> TokenKind {
297 let is_doc = if self.peek() == Some(b'*') && self.input.get(self.cursor + 1) != Some(&b'/')
298 {
299 self.advance();
300 true
301 } else {
302 false
303 };
304
305 while self.cursor < self.input.len() {
306 let remaining = &self.input[self.cursor..];
307 match memchr(b'*', remaining) {
308 Some(pos) => {
309 self.cursor += pos;
310 self.advance(); if self.peek() == Some(b'/') {
312 self.advance();
313 return if is_doc {
314 TokenKind::DocComment
315 } else {
316 TokenKind::Comment
317 };
318 }
319 }
320 None => {
321 self.cursor = self.input.len();
322 break;
323 }
324 }
325 }
326
327 TokenKind::Error }
329
330 fn next_in_looking_for_property(&mut self) -> Option<Token> {
331 let start = self.cursor;
332 if self.cursor >= self.input.len() {
333 return Some(Token {
334 kind: TokenKind::Error,
335 span: Span::new(start, start),
336 });
337 }
338
339 let c = self.input[self.cursor];
340
341 if c == b'-' && self.input.get(self.cursor + 1) == Some(&b'>') {
342 self.advance_n(2);
343 return Some(Token {
344 kind: TokenKind::Arrow,
345 span: Span::new(start, self.cursor),
346 });
347 }
348
349 if c.is_ascii_alphabetic() || c == b'_' || c >= 0x80 {
350 self.read_identifier();
351 self.state_stack.pop(); return Some(Token {
353 kind: TokenKind::Identifier,
354 span: Span::new(start, self.cursor),
355 });
356 }
357
358 self.state_stack.pop();
365 Some(Token {
368 kind: TokenKind::Error,
369 span: Span::new(start, self.cursor),
370 })
371 }
372
373 fn next_in_looking_for_var_name(&mut self) -> Option<Token> {
374 let start = self.cursor;
375 if self.cursor >= self.input.len() {
376 return Some(Token {
377 kind: TokenKind::Error,
378 span: Span::new(start, start),
379 });
380 }
381
382 let c = self.input[self.cursor];
383
384 if c.is_ascii_alphabetic() || c == b'_' || c >= 0x80 {
385 self.read_identifier();
386 return Some(Token {
387 kind: TokenKind::StringVarname,
388 span: Span::new(start, self.cursor),
389 });
390 }
391
392 if c == b'[' {
393 self.advance();
394 self.state_stack.push(LexerState::VarOffsetDollarCurly);
395 return Some(Token {
396 kind: TokenKind::OpenBracket,
397 span: Span::new(start, self.cursor),
398 });
399 }
400
401 if c == b'}' {
402 self.advance();
403 self.state_stack.pop();
404 return Some(Token {
405 kind: TokenKind::CloseBrace,
406 span: Span::new(start, self.cursor),
407 });
408 }
409
410 self.advance();
411 Some(Token {
412 kind: TokenKind::Error,
413 span: Span::new(start, self.cursor),
414 })
415 }
416
417 fn next_in_var_offset(&mut self, is_num_string: bool) -> Option<Token> {
418 let start = self.cursor;
419 if self.cursor >= self.input.len() {
420 return Some(Token {
421 kind: TokenKind::Error,
422 span: Span::new(start, start),
423 });
424 }
425
426 let c = self.input[self.cursor];
427
428 if c == b']' {
429 self.advance();
430 self.state_stack.pop();
431 return Some(Token {
432 kind: TokenKind::CloseBracket,
433 span: Span::new(start, self.cursor),
434 });
435 }
436
437 if c == b'$' {
438 self.advance();
439 if let Some(next) = self.peek()
440 && (next.is_ascii_alphabetic() || next == b'_')
441 {
442 let var_start = self.cursor - 1;
443 self.read_identifier();
444 return Some(Token {
445 kind: TokenKind::Variable,
446 span: Span::new(var_start, self.cursor),
447 });
448 }
449 }
453
454 if c.is_ascii_digit() {
455 if is_num_string {
456 while let Some(c) = self.peek() {
458 if c.is_ascii_digit() {
459 self.advance();
460 } else {
461 break;
462 }
463 }
464 return Some(Token {
465 kind: TokenKind::NumString,
466 span: Span::new(start, self.cursor),
467 });
468 } else {
469 let kind = self.read_number();
470 return Some(Token {
471 kind,
472 span: Span::new(start, self.cursor),
473 });
474 }
475 }
476
477 if c.is_ascii_alphabetic() || c == b'_' || c >= 0x80 {
478 self.read_identifier();
479 return Some(Token {
480 kind: TokenKind::Identifier,
481 span: Span::new(start, self.cursor),
482 });
483 }
484
485 if c == b'-' {
486 self.advance();
487 return Some(Token {
488 kind: TokenKind::Minus,
489 span: Span::new(start, self.cursor),
490 });
491 }
492
493 self.advance();
495
496 if c == b'[' {
519 return Some(Token {
520 kind: TokenKind::OpenBracket,
521 span: Span::new(start, self.cursor),
522 });
523 }
524
525 Some(Token {
526 kind: TokenKind::Error,
527 span: Span::new(start, self.cursor),
528 })
529 }
530
531 fn next_in_double_quotes(&mut self) -> Option<Token> {
532 let start = self.cursor;
533 if self.cursor >= self.input.len() {
534 return Some(Token {
535 kind: TokenKind::Error,
536 span: Span::new(start, start),
537 });
538 }
539
540 let char = self.input[self.cursor];
541
542 match char {
543 b'"' => {
544 if let Some(LexerState::DoubleQuotes) = self.state_stack.last() {
545 self.advance();
546 self.state_stack.pop();
547 return Some(Token {
548 kind: TokenKind::DoubleQuote,
549 span: Span::new(start, self.cursor),
550 });
551 }
552 }
553 b'`' => {
554 if let Some(LexerState::Backquote) = self.state_stack.last() {
555 self.advance();
556 self.state_stack.pop();
557 return Some(Token {
558 kind: TokenKind::Backtick,
559 span: Span::new(start, self.cursor),
560 });
561 }
562 }
563 b'$' => {
564 self.advance();
565 if let Some(c) = self.peek() {
566 if c.is_ascii_alphabetic() || c == b'_' {
567 let var_start = self.cursor - 1;
572 self.read_identifier();
573
574 if self.peek() == Some(b'[') {
576 self.state_stack.push(LexerState::VarOffset);
577 } else if self.peek() == Some(b'-')
578 && self.input.get(self.cursor + 1) == Some(&b'>')
579 && let Some(next_next) = self.input.get(self.cursor + 2)
580 && (next_next.is_ascii_alphabetic() || *next_next == b'_')
581 {
582 self.state_stack.push(LexerState::LookingForProperty);
583 }
584
585 return Some(Token {
586 kind: TokenKind::Variable,
587 span: Span::new(var_start, self.cursor),
588 });
589 } else if c == b'{' {
590 self.advance(); self.state_stack.push(LexerState::LookingForVarName);
592 return Some(Token {
593 kind: TokenKind::DollarOpenCurlyBraces,
594 span: Span::new(start, self.cursor),
595 });
596 }
597 }
598 }
600 b'{' => {
601 if self.input.get(self.cursor + 1) == Some(&b'$') {
602 self.advance();
603 self.state_stack.push(LexerState::Scripting);
605 return Some(Token {
606 kind: TokenKind::CurlyOpen,
607 span: Span::new(start, self.cursor),
608 });
609 }
610 }
611 _ => {}
612 }
613
614 while let Some(c) = self.peek() {
616 if c == b'"' && matches!(self.state_stack.last(), Some(LexerState::DoubleQuotes)) {
617 break;
618 }
619 if c == b'`' && matches!(self.state_stack.last(), Some(LexerState::Backquote)) {
620 break;
621 }
622 if c == b'$'
623 && let Some(next) = self.input.get(self.cursor + 1)
624 && (next.is_ascii_alphabetic() || *next == b'_' || *next == b'{')
625 {
626 break;
627 }
628 if c == b'{' && self.input.get(self.cursor + 1) == Some(&b'$') {
629 break;
630 }
631
632 if c == b'\\' {
633 self.advance();
634 if self.peek().is_some() {
635 self.advance();
636 }
637 } else {
638 self.advance();
639 }
640 }
641
642 if self.cursor > start {
643 Some(Token {
644 kind: TokenKind::EncapsedAndWhitespace,
645 span: Span::new(start, self.cursor),
646 })
647 } else {
648 Some(Token {
674 kind: TokenKind::EncapsedAndWhitespace,
675 span: Span::new(start, self.cursor),
676 })
677 }
678 }
679
680 fn read_single_quoted(&mut self) -> TokenKind {
681 while self.cursor < self.input.len() {
682 let remaining = &self.input[self.cursor..];
683 match memchr2(b'\'', b'\\', remaining) {
684 Some(pos) => {
685 self.cursor += pos;
686 let c = self.input[self.cursor];
687 self.advance(); if c == b'\'' {
689 return TokenKind::StringLiteral;
690 } else {
691 if self.cursor < self.input.len() {
693 self.advance(); }
695 }
696 }
697 None => {
698 self.cursor = self.input.len();
699 break;
700 }
701 }
702 }
703 TokenKind::Error
704 }
705
706 fn read_double_quoted(&mut self, quote: u8, start_pos: usize) -> TokenKind {
707 while let Some(c) = self.peek() {
708 if c == quote {
709 self.advance();
710 return TokenKind::StringLiteral;
711 } else if c == b'\\' {
712 self.advance();
713 if self.peek().is_some() {
714 self.advance();
715 }
716 } else if c == b'$' {
717 if let Some(next) = self.input.get(self.cursor + 1)
718 && (next.is_ascii_alphabetic() || *next == b'_' || *next == b'{')
719 {
720 self.cursor = start_pos + 1;
721 self.state_stack.push(if quote == b'"' {
722 LexerState::DoubleQuotes
723 } else {
724 LexerState::Backquote
725 });
726 return if quote == b'"' {
727 TokenKind::DoubleQuote
728 } else {
729 TokenKind::Backtick
730 };
731 }
732 self.advance();
733 } else if c == b'{' {
734 if self.input.get(self.cursor + 1) == Some(&b'$') {
735 self.cursor = start_pos + 1;
736 self.state_stack.push(if quote == b'"' {
737 LexerState::DoubleQuotes
738 } else {
739 LexerState::Backquote
740 });
741 return if quote == b'"' {
742 TokenKind::DoubleQuote
743 } else {
744 TokenKind::Backtick
745 };
746 }
747 self.advance();
748 } else {
749 self.advance();
750 }
751 }
752 TokenKind::Error
753 }
754
755 fn read_heredoc_start(&mut self, start: usize) -> Token {
756 while let Some(c) = self.peek() {
757 if c == b' ' || c == b'\t' {
758 self.advance();
759 } else {
760 break;
761 }
762 }
763
764 let quote = self.peek();
765 let is_quoted = quote == Some(b'\'') || quote == Some(b'"');
766 let is_nowdoc = quote == Some(b'\'');
767
768 if is_quoted {
769 self.advance();
770 }
771
772 let label_start = self.cursor;
773 self.read_identifier();
774 let label = self.input[label_start..self.cursor].to_vec();
775
776 if is_quoted && self.peek() == quote {
777 self.advance();
778 }
779
780 if let Some(c) = self.peek() {
782 if c == b'\n' {
783 self.advance();
784 } else if c == b'\r' {
785 self.advance();
786 if self.peek() == Some(b'\n') {
787 self.advance();
788 }
789 }
790 }
791
792 if is_nowdoc {
793 self.state_stack.push(LexerState::Nowdoc(label));
794 } else {
795 self.state_stack.push(LexerState::Heredoc(label));
796 }
797
798 Token {
799 kind: TokenKind::StartHeredoc,
800 span: Span::new(start, self.cursor),
801 }
802 }
803
804 fn check_heredoc_end(&self, label: &[u8]) -> Option<usize> {
805 let mut current = self.cursor;
806 while current < self.input.len() {
807 let c = self.input[current];
808 if c == b' ' || c == b'\t' {
809 current += 1;
810 } else {
811 break;
812 }
813 }
814
815 if current + label.len() > self.input.len() {
816 return None;
817 }
818
819 if &self.input[current..current + label.len()] == label {
820 let after = current + label.len();
822 if after >= self.input.len() {
823 return Some(after - self.cursor);
824 }
825 let c = self.input[after];
826 if !c.is_ascii_alphanumeric() && c != b'_' && c < 0x80 {
827 return Some(after - self.cursor);
828 }
829 }
830 None
831 }
832
833 fn is_followed_by_var_or_vararg(&self) -> bool {
834 let mut cursor = self.cursor;
835 while cursor < self.input.len() {
836 let c = self.input[cursor];
837 if c.is_ascii_whitespace() {
838 cursor += 1;
839 continue;
840 }
841
842 if c == b'#' {
844 while cursor < self.input.len() && self.input[cursor] != b'\n' {
846 cursor += 1;
847 }
848 continue;
849 }
850 if c == b'/' && cursor + 1 < self.input.len() {
851 if self.input[cursor + 1] == b'/' {
852 while cursor < self.input.len() && self.input[cursor] != b'\n' {
854 cursor += 1;
855 }
856 continue;
857 } else if self.input[cursor + 1] == b'*' {
858 cursor += 2;
860 while cursor < self.input.len() {
861 if self.input[cursor] == b'*'
862 && cursor + 1 < self.input.len()
863 && self.input[cursor + 1] == b'/'
864 {
865 cursor += 2;
866 break;
867 }
868 cursor += 1;
869 }
870 continue;
871 }
872 }
873
874 if c == b'$' && cursor + 1 < self.input.len() {
876 let next = self.input[cursor + 1];
877 if next.is_ascii_alphabetic() || next == b'_' || next >= 0x80 {
878 return true;
879 }
880 }
881
882 if c == b'.'
884 && cursor + 2 < self.input.len()
885 && self.input[cursor + 1] == b'.'
886 && self.input[cursor + 2] == b'.'
887 {
888 return true;
889 }
890
891 return false;
892 }
893 false
894 }
895
896 fn check_set_visibility(&mut self, normal: TokenKind, set: TokenKind) -> TokenKind {
897 let mut look = self.cursor;
898
899 while let Some(b) = self.input.get(look) {
901 if matches!(b, b' ' | b'\t' | b'\r' | b'\n' | b'\x0b' | b'\x0c') {
902 look += 1;
903 } else {
904 break;
905 }
906 }
907
908 if self.input.get(look) != Some(&b'(') {
909 return normal;
910 }
911 look += 1;
912
913 while let Some(b) = self.input.get(look) {
915 if matches!(b, b' ' | b'\t' | b'\r' | b'\n' | b'\x0b' | b'\x0c') {
916 look += 1;
917 } else {
918 break;
919 }
920 }
921
922 let set_kw = b"set";
923 let is_set = self
924 .input
925 .get(look..look + set_kw.len())
926 .map(|s| s.eq_ignore_ascii_case(set_kw))
927 .unwrap_or(false);
928
929 if !is_set {
930 return normal;
931 }
932 look += set_kw.len();
933
934 while let Some(b) = self.input.get(look) {
936 if matches!(b, b' ' | b'\t' | b'\r' | b'\n' | b'\x0b' | b'\x0c') {
937 look += 1;
938 } else {
939 break;
940 }
941 }
942
943 if self.input.get(look) != Some(&b')') {
944 return normal;
945 }
946 look += 1;
947
948 self.cursor = look;
949 set
950 }
951
952 fn next_in_nowdoc(&mut self) -> Option<Token> {
953 let label = if let Some(LexerState::Nowdoc(label)) = self.state_stack.last() {
954 label.clone()
955 } else {
956 return None;
957 };
958
959 if self.cursor >= self.input.len() {
960 return Some(Token {
961 kind: TokenKind::Error,
962 span: Span::new(self.cursor, self.cursor),
963 });
964 }
965
966 let start = self.cursor;
967
968 if let Some(len) = self.check_heredoc_end(&label) {
970 self.advance_n(len);
971 self.state_stack.pop();
972
973 return Some(Token {
974 kind: TokenKind::EndHeredoc,
975 span: Span::new(start, self.cursor),
976 });
977 }
978
979 while let Some(c) = self.peek() {
981 self.advance();
982 if c == b'\n' {
983 if self.check_heredoc_end(&label).is_some() {
985 break;
986 }
987 }
988 }
989
990 Some(Token {
991 kind: TokenKind::EncapsedAndWhitespace,
992 span: Span::new(start, self.cursor),
993 })
994 }
995
996 fn next_in_heredoc(&mut self) -> Option<Token> {
997 let label = if let Some(LexerState::Heredoc(label)) = self.state_stack.last() {
998 label.clone()
999 } else {
1000 return None;
1001 };
1002
1003 if self.cursor >= self.input.len() {
1004 return Some(Token {
1005 kind: TokenKind::Error,
1006 span: Span::new(self.cursor, self.cursor),
1007 });
1008 }
1009
1010 let start = self.cursor;
1011
1012 if let Some(len) = self.check_heredoc_end(&label) {
1014 self.advance_n(len);
1015 self.state_stack.pop();
1016
1017 return Some(Token {
1018 kind: TokenKind::EndHeredoc,
1019 span: Span::new(start, self.cursor),
1020 });
1021 }
1022
1023 if let Some(c) = self.peek() {
1025 if c == b'$' {
1026 self.advance();
1027 if let Some(next) = self.peek() {
1028 if next.is_ascii_alphabetic() || next == b'_' {
1029 let var_start = self.cursor - 1;
1030 self.read_identifier();
1031
1032 if self.peek() == Some(b'[') {
1034 self.state_stack.push(LexerState::VarOffset);
1035 } else if self.peek() == Some(b'-')
1036 && self.input.get(self.cursor + 1) == Some(&b'>')
1037 && let Some(next_next) = self.input.get(self.cursor + 2)
1038 && (next_next.is_ascii_alphabetic() || *next_next == b'_')
1039 {
1040 self.state_stack.push(LexerState::LookingForProperty);
1041 }
1042
1043 return Some(Token {
1044 kind: TokenKind::Variable,
1045 span: Span::new(var_start, self.cursor),
1046 });
1047 } else if next == b'{' {
1048 self.advance();
1049 self.state_stack.push(LexerState::LookingForVarName);
1050 return Some(Token {
1051 kind: TokenKind::DollarOpenCurlyBraces,
1052 span: Span::new(start, self.cursor),
1053 });
1054 }
1055 }
1056 } else if c == b'{' && self.input.get(self.cursor + 1) == Some(&b'$') {
1057 self.advance();
1058 self.state_stack.push(LexerState::Scripting);
1059 return Some(Token {
1060 kind: TokenKind::CurlyOpen,
1061 span: Span::new(start, self.cursor),
1062 });
1063 }
1064 }
1065
1066 while let Some(c) = self.peek() {
1068 if c == b'$'
1069 && let Some(next) = self.input.get(self.cursor + 1)
1070 && (next.is_ascii_alphabetic() || *next == b'_' || *next == b'{')
1071 {
1072 break;
1073 }
1074 if c == b'{' && self.input.get(self.cursor + 1) == Some(&b'$') {
1075 break;
1076 }
1077
1078 self.advance();
1079 if c == b'\n' && self.check_heredoc_end(&label).is_some() {
1080 break;
1081 }
1082
1083 if c == b'\\' && self.peek().is_some() {
1084 self.advance();
1085 }
1086 }
1087
1088 if self.cursor > start {
1089 Some(Token {
1090 kind: TokenKind::EncapsedAndWhitespace,
1091 span: Span::new(start, self.cursor),
1092 })
1093 } else {
1094 Some(Token {
1096 kind: TokenKind::EncapsedAndWhitespace,
1097 span: Span::new(start, self.cursor),
1098 })
1099 }
1100 }
1101
1102 fn next_in_halt_compiler(&mut self) -> Option<Token> {
1103 self.skip_whitespace();
1104
1105 if self.cursor >= self.input.len() {
1106 return Some(Token {
1107 kind: TokenKind::Eof,
1108 span: Span::new(self.cursor, self.cursor),
1109 });
1110 }
1111
1112 let start = self.cursor;
1113 let c = self.input[self.cursor];
1114 self.advance();
1115
1116 let kind = match c {
1117 b'(' => TokenKind::OpenParen,
1118 b')' => TokenKind::CloseParen,
1119 b';' => {
1120 self.state_stack.pop();
1121 self.state_stack.push(LexerState::RawData);
1122 TokenKind::SemiColon
1123 }
1124 b'#' => self.consume_single_line_comment(),
1125 b'/' => {
1126 if self.peek() == Some(b'/') {
1127 self.advance();
1128 self.consume_single_line_comment()
1129 } else if self.peek() == Some(b'*') {
1130 self.advance();
1131 self.consume_multi_line_comment()
1132 } else {
1133 TokenKind::Error
1134 }
1135 }
1136 _ => TokenKind::Error,
1137 };
1138
1139 Some(Token {
1140 kind,
1141 span: Span::new(start, self.cursor),
1142 })
1143 }
1144
1145 pub fn input_slice(&self, span: Span) -> &'src [u8] {
1146 &self.input[span.start..span.end]
1147 }
1148}
1149
1150impl<'src> Iterator for Lexer<'src> {
1151 type Item = Token;
1152
1153 fn next(&mut self) -> Option<Self::Item> {
1154 if let Some(LexerState::Initial) = self.state_stack.last() {
1156 let start = self.cursor;
1157 while self.cursor < self.input.len() {
1158 if self.input[self.cursor] != b'<' {
1159 let remaining = &self.input[self.cursor..];
1160 match memchr(b'<', remaining) {
1161 Some(pos) => self.cursor += pos,
1162 None => {
1163 self.cursor = self.input.len();
1164 break;
1165 }
1166 }
1167 }
1168
1169 if self.input[self.cursor..].starts_with(b"<?php") {
1170 if self.cursor > start {
1171 return Some(Token {
1172 kind: TokenKind::InlineHtml,
1173 span: Span::new(start, self.cursor),
1174 });
1175 }
1176
1177 let tag_start = self.cursor;
1178 self.state_stack.pop();
1179 self.state_stack.push(LexerState::Scripting);
1180 self.advance_n(5);
1181
1182 if self.peek().is_some_and(|c| c.is_ascii_whitespace()) {
1184 self.advance();
1185 }
1186
1187 return Some(Token {
1188 kind: TokenKind::OpenTag,
1189 span: Span::new(tag_start, self.cursor),
1190 });
1191 } else if self.input[self.cursor..].starts_with(b"<?=") {
1192 if self.cursor > start {
1193 return Some(Token {
1194 kind: TokenKind::InlineHtml,
1195 span: Span::new(start, self.cursor),
1196 });
1197 }
1198 let tag_start = self.cursor;
1199 self.state_stack.pop();
1200 self.state_stack.push(LexerState::Scripting);
1201 self.advance_n(3);
1202 return Some(Token {
1203 kind: TokenKind::OpenTagEcho,
1204 span: Span::new(tag_start, self.cursor),
1205 });
1206 }
1207 self.advance();
1208 }
1209
1210 if self.cursor > start {
1211 return Some(Token {
1212 kind: TokenKind::InlineHtml,
1213 span: Span::new(start, self.cursor),
1214 });
1215 }
1216
1217 return Some(Token {
1218 kind: TokenKind::Eof,
1219 span: Span::new(self.cursor, self.cursor),
1220 });
1221 }
1222
1223 if let Some(LexerState::DoubleQuotes) | Some(LexerState::Backquote) =
1225 self.state_stack.last()
1226 {
1227 return self.next_in_double_quotes();
1228 }
1229
1230 if let Some(LexerState::Heredoc(_)) = self.state_stack.last() {
1231 return self.next_in_heredoc();
1232 }
1233
1234 if let Some(LexerState::Nowdoc(_)) = self.state_stack.last() {
1235 return self.next_in_nowdoc();
1236 }
1237
1238 if let Some(LexerState::HaltCompiler) = self.state_stack.last() {
1239 return self.next_in_halt_compiler();
1240 }
1241
1242 if let Some(LexerState::VarOffset) = self.state_stack.last() {
1243 return self.next_in_var_offset(true);
1244 }
1245
1246 if let Some(LexerState::VarOffsetDollarCurly) = self.state_stack.last() {
1247 return self.next_in_var_offset(false);
1248 }
1249
1250 if let Some(LexerState::LookingForProperty) = self.state_stack.last() {
1251 return self.next_in_looking_for_property();
1252 }
1253
1254 if let Some(LexerState::LookingForVarName) = self.state_stack.last() {
1255 return self.next_in_looking_for_var_name();
1256 }
1257
1258 if let Some(LexerState::RawData) = self.state_stack.last() {
1259 if self.cursor >= self.input.len() {
1260 return Some(Token {
1261 kind: TokenKind::Eof,
1262 span: Span::new(self.cursor, self.cursor),
1263 });
1264 }
1265 let start = self.cursor;
1266 self.cursor = self.input.len(); return Some(Token {
1268 kind: TokenKind::InlineHtml,
1269 span: Span::new(start, self.cursor),
1270 });
1271 }
1272
1273 self.skip_whitespace();
1274
1275 if self.cursor >= self.input.len() {
1276 return Some(Token {
1277 kind: TokenKind::Eof,
1278 span: Span::new(self.cursor, self.cursor),
1279 });
1280 }
1281
1282 let start = self.cursor;
1283 let char = self.input[self.cursor];
1284 self.advance();
1285
1286 let kind = match char {
1287 b'$' => {
1288 if let Some(c) = self.peek() {
1289 if c.is_ascii_alphabetic() || c == b'_' || c >= 0x80 {
1290 self.read_identifier();
1291 TokenKind::Variable
1292 } else {
1293 TokenKind::Dollar
1294 }
1295 } else {
1296 TokenKind::Dollar
1297 }
1298 }
1299 b'\\' => TokenKind::NsSeparator,
1300 b'\'' => self.read_single_quoted(),
1301 b'"' => self.read_double_quoted(b'"', start),
1302 b'`' => {
1303 self.state_stack.push(LexerState::Backquote);
1304 TokenKind::Backtick
1305 }
1306 b'#' => {
1307 if self.peek() == Some(b'[') {
1308 self.advance();
1309 TokenKind::Attribute
1310 } else {
1311 self.consume_single_line_comment()
1312 }
1313 }
1314 b';' => TokenKind::SemiColon,
1315 b':' => {
1316 if self.peek() == Some(b':') {
1317 self.advance();
1318 TokenKind::DoubleColon
1319 } else {
1320 TokenKind::Colon
1321 }
1322 }
1323 b',' => TokenKind::Comma,
1324 b'{' => {
1325 self.state_stack.push(LexerState::Scripting);
1326 TokenKind::OpenBrace
1327 }
1328 b'}' => {
1329 if self.state_stack.len() > 1 {
1330 self.state_stack.pop();
1331 }
1332 TokenKind::CloseBrace
1333 }
1334 b'(' => {
1335 let saved_cursor = self.cursor;
1337 self.skip_whitespace();
1338
1339 let start_ident = self.cursor;
1340 self.read_identifier();
1341 let ident_len = self.cursor - start_ident;
1342
1343 if ident_len > 0 {
1344 let ident = &self.input[start_ident..self.cursor];
1345 self.skip_whitespace();
1346 if self.peek() == Some(b')') {
1347 let cast_kind = match ident.to_ascii_lowercase().as_slice() {
1348 b"int" | b"integer" => Some(TokenKind::IntCast),
1349 b"bool" | b"boolean" => Some(TokenKind::BoolCast),
1350 b"float" | b"double" | b"real" => Some(TokenKind::FloatCast),
1351 b"string" | b"binary" => Some(TokenKind::StringCast),
1352 b"array" => Some(TokenKind::ArrayCast),
1353 b"object" => Some(TokenKind::ObjectCast),
1354 b"unset" => Some(TokenKind::UnsetCast),
1355 b"void" => Some(TokenKind::VoidCast),
1356 _ => None,
1357 };
1358
1359 if let Some(k) = cast_kind {
1360 self.advance(); k
1362 } else {
1363 self.cursor = saved_cursor;
1364 TokenKind::OpenParen
1365 }
1366 } else {
1367 self.cursor = saved_cursor;
1368 TokenKind::OpenParen
1369 }
1370 } else {
1371 self.cursor = saved_cursor;
1372 TokenKind::OpenParen
1373 }
1374 }
1375 b')' => TokenKind::CloseParen,
1376 b'[' => TokenKind::OpenBracket,
1377 b']' => TokenKind::CloseBracket,
1378 b'+' => {
1379 if self.peek() == Some(b'+') {
1380 self.advance();
1381 TokenKind::Inc
1382 } else if self.peek() == Some(b'=') {
1383 self.advance();
1384 TokenKind::PlusEq
1385 } else {
1386 TokenKind::Plus
1387 }
1388 }
1389 b'-' => {
1390 if self.peek() == Some(b'>') {
1391 self.advance();
1392 TokenKind::Arrow
1393 } else if self.peek() == Some(b'-') {
1394 self.advance();
1395 TokenKind::Dec
1396 } else if self.peek() == Some(b'=') {
1397 self.advance();
1398 TokenKind::MinusEq
1399 } else {
1400 TokenKind::Minus
1401 }
1402 }
1403 b'*' => {
1404 if self.peek() == Some(b'*') {
1405 self.advance();
1406 if self.peek() == Some(b'=') {
1407 self.advance();
1408 TokenKind::PowEq
1409 } else {
1410 TokenKind::Pow
1411 }
1412 } else if self.peek() == Some(b'=') {
1413 self.advance();
1414 TokenKind::MulEq
1415 } else {
1416 TokenKind::Asterisk
1417 }
1418 }
1419 b'/' => {
1420 if self.peek() == Some(b'/') {
1421 self.advance();
1422 self.consume_single_line_comment()
1423 } else if self.peek() == Some(b'*') {
1424 self.advance();
1425 self.consume_multi_line_comment()
1426 } else if self.peek() == Some(b'=') {
1427 self.advance();
1428 TokenKind::DivEq
1429 } else {
1430 TokenKind::Slash
1431 }
1432 }
1433 b'%' => {
1434 if self.peek() == Some(b'=') {
1435 self.advance();
1436 TokenKind::ModEq
1437 } else {
1438 TokenKind::Percent
1439 }
1440 }
1441 b'.' => {
1442 if self.peek() == Some(b'=') {
1443 self.advance();
1444 TokenKind::ConcatEq
1445 } else if self.peek() == Some(b'.') {
1446 self.advance();
1447 if self.peek() == Some(b'.') {
1448 self.advance();
1449 TokenKind::Ellipsis
1450 } else {
1451 TokenKind::Dot
1452 }
1453 } else if let Some(c) = self.peek()
1454 && c.is_ascii_digit()
1455 {
1456 self.cursor -= 1;
1457 self.read_number()
1458 } else {
1459 TokenKind::Dot
1460 }
1461 }
1462 b'=' => {
1463 if self.peek() == Some(b'=') {
1464 self.advance();
1465 if self.peek() == Some(b'=') {
1466 self.advance();
1467 TokenKind::EqEqEq
1468 } else {
1469 TokenKind::EqEq
1470 }
1471 } else if self.peek() == Some(b'>') {
1472 self.advance();
1473 TokenKind::DoubleArrow
1474 } else {
1475 TokenKind::Eq
1476 }
1477 }
1478 b'!' => {
1479 if self.peek() == Some(b'=') {
1480 self.advance();
1481 if self.peek() == Some(b'=') {
1482 self.advance();
1483 TokenKind::BangEqEq
1484 } else {
1485 TokenKind::BangEq
1486 }
1487 } else {
1488 TokenKind::Bang
1489 }
1490 }
1491 b'<' => {
1492 if self.peek() == Some(b'<') && self.input.get(self.cursor + 1) == Some(&b'<') {
1493 self.advance(); self.advance(); return Some(self.read_heredoc_start(start));
1496 } else if self.peek() == Some(b'=') {
1497 self.advance();
1498 if self.peek() == Some(b'>') {
1499 self.advance();
1500 TokenKind::Spaceship
1501 } else {
1502 TokenKind::LtEq
1503 }
1504 } else if self.peek() == Some(b'<') {
1505 self.advance();
1506 if self.peek() == Some(b'=') {
1507 self.advance();
1508 TokenKind::SlEq
1509 } else {
1510 TokenKind::Sl
1511 }
1512 } else if self.peek() == Some(b'>') {
1513 self.advance();
1514 TokenKind::BangEq
1515 } else {
1516 TokenKind::Lt
1517 }
1518 }
1519 b'>' => {
1520 if self.peek() == Some(b'=') {
1521 self.advance();
1522 TokenKind::GtEq
1523 } else if self.peek() == Some(b'>') {
1524 self.advance();
1525 if self.peek() == Some(b'=') {
1526 self.advance();
1527 TokenKind::SrEq
1528 } else {
1529 TokenKind::Sr
1530 }
1531 } else {
1532 TokenKind::Gt
1533 }
1534 }
1535 b'&' => {
1536 if self.peek() == Some(b'&') {
1537 self.advance();
1538 TokenKind::AmpersandAmpersand
1539 } else if self.peek() == Some(b'=') {
1540 self.advance();
1541 TokenKind::AndEq
1542 } else if self.is_followed_by_var_or_vararg() {
1543 TokenKind::AmpersandFollowedByVarOrVararg
1544 } else {
1545 TokenKind::AmpersandNotFollowedByVarOrVararg
1546 }
1547 }
1548 b'|' => {
1549 if self.peek() == Some(b'|') {
1550 self.advance();
1551 TokenKind::PipePipe
1552 } else if self.peek() == Some(b'=') {
1553 self.advance();
1554 TokenKind::OrEq
1555 } else {
1556 TokenKind::Pipe
1557 }
1558 }
1559 b'^' => {
1560 if self.peek() == Some(b'=') {
1561 self.advance();
1562 TokenKind::XorEq
1563 } else {
1564 TokenKind::Caret
1565 }
1566 }
1567 b'~' => TokenKind::BitNot,
1568 b'@' => TokenKind::At,
1569 b'?' => {
1570 if self.peek() == Some(b'>') {
1571 self.advance();
1572 self.state_stack.pop();
1573 self.state_stack.push(LexerState::Initial);
1574 TokenKind::CloseTag
1575 } else if self.peek() == Some(b'?') {
1576 self.advance();
1577 if self.peek() == Some(b'=') {
1578 self.advance();
1579 TokenKind::CoalesceEq
1580 } else {
1581 TokenKind::Coalesce
1582 }
1583 } else if self.peek() == Some(b'-')
1584 && self.input.get(self.cursor + 1) == Some(&b'>')
1585 {
1586 self.advance();
1587 self.advance();
1588 TokenKind::NullSafeArrow
1589 } else {
1590 TokenKind::Question
1591 }
1592 }
1593 c if c.is_ascii_digit() => {
1594 self.cursor -= 1;
1595 self.read_number()
1596 }
1597 c if c.is_ascii_alphabetic() || c == b'_' || c >= 0x80 => {
1598 if (c == b'b' || c == b'B')
1600 && let Some(next) = self.peek()
1601 {
1602 if next == b'\'' {
1603 self.advance(); return Some(Token {
1605 kind: self.read_single_quoted(),
1606 span: Span::new(start, self.cursor),
1607 });
1608 } else if next == b'"' {
1609 let quote_pos = self.cursor;
1610 self.advance(); return Some(Token {
1612 kind: self.read_double_quoted(b'"', quote_pos),
1613 span: Span::new(start, self.cursor),
1614 });
1615 }
1616 }
1617
1618 self.read_identifier();
1619 let text = &self.input[start..self.cursor];
1620
1621 if self.mode == LexerMode::LookingForProperty {
1622 self.mode = LexerMode::Standard;
1623 TokenKind::Identifier
1624 } else {
1625 let is_all_lowercase = text.iter().all(|c| !c.is_ascii_uppercase());
1626
1627 let mut kind = if is_all_lowercase {
1628 keyword_lookup(text)
1629 } else {
1630 keyword_lookup(&text.to_ascii_lowercase())
1631 };
1632
1633 match kind {
1634 TokenKind::Yield => {
1635 let mut look = self.cursor;
1636 while let Some(b) = self.input.get(look) {
1637 if matches!(b, b' ' | b'\t' | b'\r' | b'\n' | b'\x0b' | b'\x0c') {
1638 look += 1;
1639 } else {
1640 break;
1641 }
1642 }
1643 let from_kw = b"from";
1644 let is_from = self
1645 .input
1646 .get(look..look + from_kw.len())
1647 .map(|s| {
1648 s.iter()
1649 .zip(from_kw.iter())
1650 .all(|(c, k)| c.to_ascii_lowercase() == *k)
1651 })
1652 .unwrap_or(false)
1653 && !self
1654 .input
1655 .get(look + from_kw.len())
1656 .map(|c| c.is_ascii_alphanumeric() || *c == b'_' || *c >= 0x80)
1657 .unwrap_or(false);
1658
1659 if is_from {
1660 self.cursor = look + from_kw.len();
1661 kind = TokenKind::YieldFrom;
1662 }
1663 }
1664 TokenKind::Public => {
1665 if text[0].eq_ignore_ascii_case(&b'p') {
1666 kind = self
1667 .check_set_visibility(TokenKind::Public, TokenKind::PublicSet);
1668 }
1669 }
1670 TokenKind::Protected => {
1671 kind = self.check_set_visibility(
1672 TokenKind::Protected,
1673 TokenKind::ProtectedSet,
1674 );
1675 }
1676 TokenKind::Private => {
1677 kind = self
1678 .check_set_visibility(TokenKind::Private, TokenKind::PrivateSet);
1679 }
1680 TokenKind::HaltCompiler => {
1681 self.state_stack.pop();
1682 self.state_stack.push(LexerState::HaltCompiler);
1683 }
1684 _ => {}
1685 }
1686 kind
1687 }
1688 }
1689 _ => TokenKind::Error,
1690 };
1691
1692 Some(Token {
1693 kind,
1694 span: Span::new(start, self.cursor),
1695 })
1696 }
1697}