1pub mod token;
2
3use crate::span::Span;
4use memchr::{memchr, memchr2, memchr3};
5use token::{Token, TokenKind};
6
/// Lexing mode that callers can request through `Lexer::set_mode`.
///
/// `Lexer::new` installs `Standard`.
///
/// NOTE(review): the code that reads `Lexer::mode` is not visible in this part
/// of the file, so the precise effect of each variant should be confirmed there.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum LexerMode {
    /// Default mode.
    Standard,
    /// Presumably: the next token should be lexed as a property name — TODO confirm.
    LookingForProperty,
    /// Presumably: the next token should be lexed as a variable name — TODO confirm.
    LookingForVarName,
}
13
/// Internal lexer states, kept on `Lexer::state_stack`.
///
/// The state on top of the stack decides which scanning routine produces the
/// next token.
#[derive(Debug, Clone, PartialEq)]
enum LexerState {
    /// Outside PHP code: bytes are inline HTML until `<?php` or `<?=` is found.
    Initial,
    /// Regular PHP code (entered after an open tag, after `{`, or after `{$`
    /// inside an interpolated string).
    Scripting,
    /// Inside a double-quoted string that contains interpolation.
    DoubleQuotes,
    /// Inside a backtick string.
    Backquote,
    /// Inside a heredoc body; the payload is the closing label.
    Heredoc(Vec<u8>),
    /// Inside a nowdoc body; the payload is the closing label.
    Nowdoc(Vec<u8>),
    /// Scanned by `next_in_halt_compiler`; a `;` switches to `RawData`.
    /// NOTE(review): the code that enters this state is not visible here.
    HaltCompiler,
    /// Everything that remains in the input is emitted as one `InlineHtml` token.
    RawData,
    /// Inside `[...]` after a simple interpolated variable (`"$a[...]"`);
    /// digit runs are lexed as `NumString`.
    VarOffset,
    /// Inside `[...]` that follows a `${name` interpolation; digit runs are
    /// lexed with `read_number`.
    VarOffsetDollarCurly,
    /// After an interpolated variable that is followed by `->name`.
    LookingForProperty,
    /// After `${` inside an interpolated string or heredoc.
    LookingForVarName,
}
29
/// Maps the bytes of a word to its keyword token kind.
///
/// Returns `TokenKind::Identifier` when `text` is not one of the listed words.
///
/// The comparison is byte-exact against lowercase spellings.
/// NOTE(review): PHP keywords are case-insensitive, so callers presumably
/// lowercase `text` before calling — verify at the call site.
fn keyword_lookup(text: &[u8]) -> TokenKind {
    match text {
        // Word-form logical operators.
        b"or" => TokenKind::LogicalOr,
        b"and" => TokenKind::LogicalAnd,
        b"xor" => TokenKind::LogicalXor,
        // Scalar type names and literal-like words.
        b"bool" => TokenKind::TypeBool,
        b"int" => TokenKind::TypeInt,
        b"float" => TokenKind::TypeFloat,
        b"string" => TokenKind::TypeString,
        b"mixed" => TokenKind::TypeMixed,
        b"never" => TokenKind::TypeNever,
        b"null" => TokenKind::TypeNull,
        b"false" => TokenKind::TypeFalse,
        b"true" => TokenKind::TypeTrue,
        b"exit" => TokenKind::Exit,
        b"die" => TokenKind::Die,
        // Functions, constants and exception handling.
        b"function" => TokenKind::Function,
        b"fn" => TokenKind::Fn,
        b"const" => TokenKind::Const,
        b"return" => TokenKind::Return,
        b"yield" => TokenKind::Yield,
        b"try" => TokenKind::Try,
        b"catch" => TokenKind::Catch,
        b"finally" => TokenKind::Finally,
        b"throw" => TokenKind::Throw,
        // Control flow.
        b"if" => TokenKind::If,
        b"elseif" => TokenKind::ElseIf,
        b"endif" => TokenKind::EndIf,
        b"else" => TokenKind::Else,
        b"insteadof" => TokenKind::Insteadof,
        b"while" => TokenKind::While,
        b"endwhile" => TokenKind::EndWhile,
        b"do" => TokenKind::Do,
        b"for" => TokenKind::For,
        b"endfor" => TokenKind::EndFor,
        b"foreach" => TokenKind::Foreach,
        b"endforeach" => TokenKind::EndForeach,
        b"declare" => TokenKind::Declare,
        b"enddeclare" => TokenKind::EndDeclare,
        b"instanceof" => TokenKind::InstanceOf,
        b"as" => TokenKind::As,
        b"switch" => TokenKind::Switch,
        b"endswitch" => TokenKind::EndSwitch,
        b"case" => TokenKind::Case,
        b"default" => TokenKind::Default,
        b"break" => TokenKind::Break,
        b"continue" => TokenKind::Continue,
        b"goto" => TokenKind::Goto,
        b"echo" => TokenKind::Echo,
        b"print" => TokenKind::Print,
        // Class-like declarations and modifiers.
        b"enum" => TokenKind::Enum,
        b"class" => TokenKind::Class,
        b"interface" => TokenKind::Interface,
        b"trait" => TokenKind::Trait,
        b"extends" => TokenKind::Extends,
        b"implements" => TokenKind::Implements,
        b"new" => TokenKind::New,
        b"clone" => TokenKind::Clone,
        // NOTE(review): `var` yields the same kind as `public`; presumably
        // intentional (legacy alias) — confirm no dedicated token is expected.
        b"var" => TokenKind::Public,
        b"public" => TokenKind::Public,
        b"protected" => TokenKind::Protected,
        b"private" => TokenKind::Private,
        b"final" => TokenKind::Final,
        b"abstract" => TokenKind::Abstract,
        b"static" => TokenKind::Static,
        b"readonly" => TokenKind::Readonly,
        b"namespace" => TokenKind::Namespace,
        b"use" => TokenKind::Use,
        b"global" => TokenKind::Global,
        b"isset" => TokenKind::Isset,
        b"empty" => TokenKind::Empty,
        // Double-underscore words.
        b"__halt_compiler" => TokenKind::HaltCompiler,
        b"__class__" => TokenKind::ClassC,
        b"__trait__" => TokenKind::TraitC,
        b"__function__" => TokenKind::FuncC,
        b"__method__" => TokenKind::MethodC,
        b"__line__" => TokenKind::Line,
        b"__file__" => TokenKind::File,
        b"__dir__" => TokenKind::Dir,
        b"__namespace__" => TokenKind::NsC,
        b"__property__" => TokenKind::PropertyC,
        // Remaining type names and language constructs.
        b"array" => TokenKind::Array,
        b"callable" => TokenKind::TypeCallable,
        b"iterable" => TokenKind::TypeIterable,
        b"void" => TokenKind::TypeVoid,
        b"object" => TokenKind::TypeObject,
        b"match" => TokenKind::Match,
        b"list" => TokenKind::List,
        b"include" => TokenKind::Include,
        b"include_once" => TokenKind::IncludeOnce,
        b"require" => TokenKind::Require,
        b"require_once" => TokenKind::RequireOnce,
        b"eval" => TokenKind::Eval,
        b"unset" => TokenKind::Unset,
        // Anything else is an ordinary identifier.
        _ => TokenKind::Identifier,
    }
}
127
/// Byte-oriented lexer over a borrowed source buffer.
///
/// Tokens carry spans (byte offsets into `input`) rather than copies of the
/// text; use `slice` / `input_slice` to recover the bytes of a span.
#[derive(Debug, Clone)]
pub struct Lexer<'src> {
    /// The complete source being lexed.
    input: &'src [u8],
    /// Byte offset of the next unread byte in `input`.
    cursor: usize,
    /// Stack of lexer states; the top entry selects the active scanning routine.
    state_stack: Vec<LexerState>,
    /// Mode most recently requested through `set_mode`.
    mode: LexerMode,
}
135
136impl<'src> Lexer<'src> {
137 pub fn new(input: &'src [u8]) -> Self {
138 Self {
139 input,
140 cursor: 0,
141 state_stack: vec![LexerState::Initial],
142 mode: LexerMode::Standard,
143 }
144 }
145
    /// Stores `mode` as the lexer's current mode, replacing the previous one.
    pub fn set_mode(&mut self, mode: LexerMode) {
        self.mode = mode;
    }
149
150 pub fn slice(&self, span: Span) -> &'src [u8] {
151 &self.input[span.start..span.end]
152 }
153
154 fn peek(&self) -> Option<u8> {
155 if self.cursor < self.input.len() {
156 Some(self.input[self.cursor])
157 } else {
158 None
159 }
160 }
161
162 fn advance(&mut self) {
163 self.cursor += 1;
164 }
165
    /// Moves the cursor forward by `n` bytes. No bounds check is performed, so
    /// callers must know that `n` bytes are available.
    fn advance_n(&mut self, n: usize) {
        self.cursor += n;
    }
169
170 fn skip_whitespace(&mut self) {
171 while self.cursor < self.input.len() {
172 if self.input[self.cursor].is_ascii_whitespace() {
173 self.cursor += 1;
174 } else {
175 break;
176 }
177 }
178 }
179
180 fn read_identifier(&mut self) {
181 while self.cursor < self.input.len() {
182 let c = self.input[self.cursor];
183 if c.is_ascii_alphanumeric() || c == b'_' || c >= 0x80 {
184 self.cursor += 1;
185 } else {
186 break;
187 }
188 }
189 }
190
191 fn read_number(&mut self) -> TokenKind {
192 let mut is_float = false;
193
194 if self.peek() == Some(b'0') {
196 self.advance();
197 if let Some(c) = self.peek() {
198 if c == b'x' || c == b'X' {
199 self.advance();
200 while let Some(c) = self.peek() {
201 if c.is_ascii_hexdigit() || c == b'_' {
202 self.advance();
203 } else {
204 break;
205 }
206 }
207 return TokenKind::LNumber;
208 } else if c == b'b' || c == b'B' {
209 self.advance();
210 while let Some(c) = self.peek() {
211 if c == b'0' || c == b'1' || c == b'_' {
212 self.advance();
213 } else {
214 break;
215 }
216 }
217 return TokenKind::LNumber;
218 } else if c == b'o' || c == b'O' {
219 self.advance();
220 while let Some(c) = self.peek() {
221 if (b'0'..=b'7').contains(&c) || c == b'_' {
222 self.advance();
223 } else {
224 break;
225 }
226 }
227 return TokenKind::LNumber;
228 }
229 }
230 }
231
232 while let Some(c) = self.peek() {
233 if c.is_ascii_digit() || c == b'_' {
234 self.advance();
235 } else if c == b'.' {
236 if is_float {
237 break; }
239 is_float = true;
240 self.advance();
241 } else if c == b'e' || c == b'E' {
242 is_float = true;
243 self.advance();
244 if let Some(next) = self.peek()
245 && (next == b'+' || next == b'-')
246 {
247 self.advance();
248 }
249 } else {
250 break;
251 }
252 }
253
254 if is_float {
255 TokenKind::DNumber
256 } else {
257 TokenKind::LNumber
258 }
259 }
260
261 fn consume_single_line_comment(&mut self) -> TokenKind {
262 while self.cursor < self.input.len() {
263 let remaining = &self.input[self.cursor..];
264 match memchr3(b'\n', b'\r', b'?', remaining) {
265 Some(pos) => {
266 self.cursor += pos;
267 let c = self.input[self.cursor];
268 if c == b'?' {
269 if self.input.get(self.cursor + 1) == Some(&b'>') {
270 break;
271 } else {
272 self.cursor += 1;
273 }
274 } else {
275 break;
276 }
277 }
278 None => {
279 self.cursor = self.input.len();
280 break;
281 }
282 }
283 }
284 TokenKind::Comment
285 }
286
287 fn consume_multi_line_comment(&mut self) -> TokenKind {
288 let is_doc = if self.peek() == Some(b'*') && self.input.get(self.cursor + 1) != Some(&b'/')
289 {
290 self.advance();
291 true
292 } else {
293 false
294 };
295
296 while self.cursor < self.input.len() {
297 let remaining = &self.input[self.cursor..];
298 match memchr(b'*', remaining) {
299 Some(pos) => {
300 self.cursor += pos;
301 self.advance(); if self.peek() == Some(b'/') {
303 self.advance();
304 return if is_doc {
305 TokenKind::DocComment
306 } else {
307 TokenKind::Comment
308 };
309 }
310 }
311 None => {
312 self.cursor = self.input.len();
313 break;
314 }
315 }
316 }
317
318 TokenKind::Error }
320
321 fn next_in_looking_for_property(&mut self) -> Option<Token> {
322 let start = self.cursor;
323 if self.cursor >= self.input.len() {
324 return Some(Token {
325 kind: TokenKind::Error,
326 span: Span::new(start, start),
327 });
328 }
329
330 let c = self.input[self.cursor];
331
332 if c == b'-' && self.input.get(self.cursor + 1) == Some(&b'>') {
333 self.advance_n(2);
334 return Some(Token {
335 kind: TokenKind::Arrow,
336 span: Span::new(start, self.cursor),
337 });
338 }
339
340 if c.is_ascii_alphabetic() || c == b'_' || c >= 0x80 {
341 self.read_identifier();
342 self.state_stack.pop(); return Some(Token {
344 kind: TokenKind::Identifier,
345 span: Span::new(start, self.cursor),
346 });
347 }
348
349 self.state_stack.pop();
356 Some(Token {
359 kind: TokenKind::Error,
360 span: Span::new(start, self.cursor),
361 })
362 }
363
364 fn next_in_looking_for_var_name(&mut self) -> Option<Token> {
365 let start = self.cursor;
366 if self.cursor >= self.input.len() {
367 return Some(Token {
368 kind: TokenKind::Error,
369 span: Span::new(start, start),
370 });
371 }
372
373 let c = self.input[self.cursor];
374
375 if c.is_ascii_alphabetic() || c == b'_' || c >= 0x80 {
376 self.read_identifier();
377 return Some(Token {
378 kind: TokenKind::StringVarname,
379 span: Span::new(start, self.cursor),
380 });
381 }
382
383 if c == b'[' {
384 self.advance();
385 self.state_stack.push(LexerState::VarOffsetDollarCurly);
386 return Some(Token {
387 kind: TokenKind::OpenBracket,
388 span: Span::new(start, self.cursor),
389 });
390 }
391
392 if c == b'}' {
393 self.advance();
394 self.state_stack.pop();
395 return Some(Token {
396 kind: TokenKind::CloseBrace,
397 span: Span::new(start, self.cursor),
398 });
399 }
400
401 self.advance();
402 Some(Token {
403 kind: TokenKind::Error,
404 span: Span::new(start, self.cursor),
405 })
406 }
407
408 fn next_in_var_offset(&mut self, is_num_string: bool) -> Option<Token> {
409 let start = self.cursor;
410 if self.cursor >= self.input.len() {
411 return Some(Token {
412 kind: TokenKind::Error,
413 span: Span::new(start, start),
414 });
415 }
416
417 let c = self.input[self.cursor];
418
419 if c == b']' {
420 self.advance();
421 self.state_stack.pop();
422 return Some(Token {
423 kind: TokenKind::CloseBracket,
424 span: Span::new(start, self.cursor),
425 });
426 }
427
428 if c == b'$' {
429 self.advance();
430 if let Some(next) = self.peek()
431 && (next.is_ascii_alphabetic() || next == b'_')
432 {
433 let var_start = self.cursor - 1;
434 self.read_identifier();
435 return Some(Token {
436 kind: TokenKind::Variable,
437 span: Span::new(var_start, self.cursor),
438 });
439 }
440 }
444
445 if c.is_ascii_digit() {
446 if is_num_string {
447 while let Some(c) = self.peek() {
449 if c.is_ascii_digit() {
450 self.advance();
451 } else {
452 break;
453 }
454 }
455 return Some(Token {
456 kind: TokenKind::NumString,
457 span: Span::new(start, self.cursor),
458 });
459 } else {
460 let kind = self.read_number();
461 return Some(Token {
462 kind,
463 span: Span::new(start, self.cursor),
464 });
465 }
466 }
467
468 if c.is_ascii_alphabetic() || c == b'_' || c >= 0x80 {
469 self.read_identifier();
470 return Some(Token {
471 kind: TokenKind::Identifier,
472 span: Span::new(start, self.cursor),
473 });
474 }
475
476 if c == b'-' {
477 self.advance();
478 return Some(Token {
479 kind: TokenKind::Minus,
480 span: Span::new(start, self.cursor),
481 });
482 }
483
484 self.advance();
486
487 if c == b'[' {
510 return Some(Token {
511 kind: TokenKind::OpenBracket,
512 span: Span::new(start, self.cursor),
513 });
514 }
515
516 Some(Token {
517 kind: TokenKind::Error,
518 span: Span::new(start, self.cursor),
519 })
520 }
521
522 fn next_in_double_quotes(&mut self) -> Option<Token> {
523 let start = self.cursor;
524 if self.cursor >= self.input.len() {
525 return Some(Token {
526 kind: TokenKind::Error,
527 span: Span::new(start, start),
528 });
529 }
530
531 let char = self.input[self.cursor];
532
533 match char {
534 b'"' => {
535 if let Some(LexerState::DoubleQuotes) = self.state_stack.last() {
536 self.advance();
537 self.state_stack.pop();
538 return Some(Token {
539 kind: TokenKind::DoubleQuote,
540 span: Span::new(start, self.cursor),
541 });
542 }
543 }
544 b'`' => {
545 if let Some(LexerState::Backquote) = self.state_stack.last() {
546 self.advance();
547 self.state_stack.pop();
548 return Some(Token {
549 kind: TokenKind::Backtick,
550 span: Span::new(start, self.cursor),
551 });
552 }
553 }
554 b'$' => {
555 self.advance();
556 if let Some(c) = self.peek() {
557 if c.is_ascii_alphabetic() || c == b'_' {
558 let var_start = self.cursor - 1;
563 self.read_identifier();
564
565 if self.peek() == Some(b'[') {
567 self.state_stack.push(LexerState::VarOffset);
568 } else if self.peek() == Some(b'-')
569 && self.input.get(self.cursor + 1) == Some(&b'>')
570 && let Some(next_next) = self.input.get(self.cursor + 2)
571 && (next_next.is_ascii_alphabetic() || *next_next == b'_')
572 {
573 self.state_stack.push(LexerState::LookingForProperty);
574 }
575
576 return Some(Token {
577 kind: TokenKind::Variable,
578 span: Span::new(var_start, self.cursor),
579 });
580 } else if c == b'{' {
581 self.advance(); self.state_stack.push(LexerState::LookingForVarName);
583 return Some(Token {
584 kind: TokenKind::DollarOpenCurlyBraces,
585 span: Span::new(start, self.cursor),
586 });
587 }
588 }
589 }
591 b'{' => {
592 if self.input.get(self.cursor + 1) == Some(&b'$') {
593 self.advance();
594 self.state_stack.push(LexerState::Scripting);
596 return Some(Token {
597 kind: TokenKind::CurlyOpen,
598 span: Span::new(start, self.cursor),
599 });
600 }
601 }
602 _ => {}
603 }
604
605 while let Some(c) = self.peek() {
607 if c == b'"' && matches!(self.state_stack.last(), Some(LexerState::DoubleQuotes)) {
608 break;
609 }
610 if c == b'`' && matches!(self.state_stack.last(), Some(LexerState::Backquote)) {
611 break;
612 }
613 if c == b'$'
614 && let Some(next) = self.input.get(self.cursor + 1)
615 && (next.is_ascii_alphabetic() || *next == b'_' || *next == b'{')
616 {
617 break;
618 }
619 if c == b'{' && self.input.get(self.cursor + 1) == Some(&b'$') {
620 break;
621 }
622
623 if c == b'\\' {
624 self.advance();
625 if self.peek().is_some() {
626 self.advance();
627 }
628 } else {
629 self.advance();
630 }
631 }
632
633 if self.cursor > start {
634 Some(Token {
635 kind: TokenKind::EncapsedAndWhitespace,
636 span: Span::new(start, self.cursor),
637 })
638 } else {
639 Some(Token {
665 kind: TokenKind::EncapsedAndWhitespace,
666 span: Span::new(start, self.cursor),
667 })
668 }
669 }
670
671 fn read_single_quoted(&mut self) -> TokenKind {
672 while self.cursor < self.input.len() {
673 let remaining = &self.input[self.cursor..];
674 match memchr2(b'\'', b'\\', remaining) {
675 Some(pos) => {
676 self.cursor += pos;
677 let c = self.input[self.cursor];
678 self.advance(); if c == b'\'' {
680 return TokenKind::StringLiteral;
681 } else {
682 if self.cursor < self.input.len() {
684 self.advance(); }
686 }
687 }
688 None => {
689 self.cursor = self.input.len();
690 break;
691 }
692 }
693 }
694 TokenKind::Error
695 }
696
697 fn read_double_quoted(&mut self, quote: u8, start_pos: usize) -> TokenKind {
698 while let Some(c) = self.peek() {
699 if c == quote {
700 self.advance();
701 return TokenKind::StringLiteral;
702 } else if c == b'\\' {
703 self.advance();
704 if self.peek().is_some() {
705 self.advance();
706 }
707 } else if c == b'$' {
708 if let Some(next) = self.input.get(self.cursor + 1)
709 && (next.is_ascii_alphabetic() || *next == b'_' || *next == b'{')
710 {
711 self.cursor = start_pos + 1;
712 self.state_stack.push(if quote == b'"' {
713 LexerState::DoubleQuotes
714 } else {
715 LexerState::Backquote
716 });
717 return if quote == b'"' {
718 TokenKind::DoubleQuote
719 } else {
720 TokenKind::Backtick
721 };
722 }
723 self.advance();
724 } else if c == b'{' {
725 if self.input.get(self.cursor + 1) == Some(&b'$') {
726 self.cursor = start_pos + 1;
727 self.state_stack.push(if quote == b'"' {
728 LexerState::DoubleQuotes
729 } else {
730 LexerState::Backquote
731 });
732 return if quote == b'"' {
733 TokenKind::DoubleQuote
734 } else {
735 TokenKind::Backtick
736 };
737 }
738 self.advance();
739 } else {
740 self.advance();
741 }
742 }
743 TokenKind::Error
744 }
745
746 fn read_heredoc_start(&mut self, start: usize) -> Token {
747 while let Some(c) = self.peek() {
748 if c == b' ' || c == b'\t' {
749 self.advance();
750 } else {
751 break;
752 }
753 }
754
755 let quote = self.peek();
756 let is_quoted = quote == Some(b'\'') || quote == Some(b'"');
757 let is_nowdoc = quote == Some(b'\'');
758
759 if is_quoted {
760 self.advance();
761 }
762
763 let label_start = self.cursor;
764 self.read_identifier();
765 let label = self.input[label_start..self.cursor].to_vec();
766
767 if is_quoted && self.peek() == quote {
768 self.advance();
769 }
770
771 if let Some(c) = self.peek() {
773 if c == b'\n' {
774 self.advance();
775 } else if c == b'\r' {
776 self.advance();
777 if self.peek() == Some(b'\n') {
778 self.advance();
779 }
780 }
781 }
782
783 if is_nowdoc {
784 self.state_stack.push(LexerState::Nowdoc(label));
785 } else {
786 self.state_stack.push(LexerState::Heredoc(label));
787 }
788
789 Token {
790 kind: TokenKind::StartHeredoc,
791 span: Span::new(start, self.cursor),
792 }
793 }
794
795 fn check_heredoc_end(&self, label: &[u8]) -> Option<usize> {
796 let mut current = self.cursor;
797 while current < self.input.len() {
798 let c = self.input[current];
799 if c == b' ' || c == b'\t' {
800 current += 1;
801 } else {
802 break;
803 }
804 }
805
806 if current + label.len() > self.input.len() {
807 return None;
808 }
809
810 if &self.input[current..current + label.len()] == label {
811 let after = current + label.len();
813 if after >= self.input.len() {
814 return Some(after - self.cursor);
815 }
816 let c = self.input[after];
817 if !c.is_ascii_alphanumeric() && c != b'_' && c < 0x80 {
818 return Some(after - self.cursor);
819 }
820 }
821 None
822 }
823
824 fn is_followed_by_var_or_vararg(&self) -> bool {
825 let mut cursor = self.cursor;
826 while cursor < self.input.len() {
827 let c = self.input[cursor];
828 if c.is_ascii_whitespace() {
829 cursor += 1;
830 continue;
831 }
832
833 if c == b'#' {
835 while cursor < self.input.len() && self.input[cursor] != b'\n' {
837 cursor += 1;
838 }
839 continue;
840 }
841 if c == b'/' && cursor + 1 < self.input.len() {
842 if self.input[cursor + 1] == b'/' {
843 while cursor < self.input.len() && self.input[cursor] != b'\n' {
845 cursor += 1;
846 }
847 continue;
848 } else if self.input[cursor + 1] == b'*' {
849 cursor += 2;
851 while cursor < self.input.len() {
852 if self.input[cursor] == b'*'
853 && cursor + 1 < self.input.len()
854 && self.input[cursor + 1] == b'/'
855 {
856 cursor += 2;
857 break;
858 }
859 cursor += 1;
860 }
861 continue;
862 }
863 }
864
865 if c == b'$' && cursor + 1 < self.input.len() {
867 let next = self.input[cursor + 1];
868 if next.is_ascii_alphabetic() || next == b'_' || next >= 0x80 {
869 return true;
870 }
871 }
872
873 if c == b'.'
875 && cursor + 2 < self.input.len()
876 && self.input[cursor + 1] == b'.'
877 && self.input[cursor + 2] == b'.'
878 {
879 return true;
880 }
881
882 return false;
883 }
884 false
885 }
886
887 fn check_set_visibility(&mut self, normal: TokenKind, set: TokenKind) -> TokenKind {
888 let mut look = self.cursor;
889
890 while let Some(b) = self.input.get(look) {
892 if matches!(b, b' ' | b'\t' | b'\r' | b'\n' | b'\x0b' | b'\x0c') {
893 look += 1;
894 } else {
895 break;
896 }
897 }
898
899 if self.input.get(look) != Some(&b'(') {
900 return normal;
901 }
902 look += 1;
903
904 while let Some(b) = self.input.get(look) {
906 if matches!(b, b' ' | b'\t' | b'\r' | b'\n' | b'\x0b' | b'\x0c') {
907 look += 1;
908 } else {
909 break;
910 }
911 }
912
913 let set_kw = b"set";
914 let is_set = self
915 .input
916 .get(look..look + set_kw.len())
917 .map(|s| s.eq_ignore_ascii_case(set_kw))
918 .unwrap_or(false);
919
920 if !is_set {
921 return normal;
922 }
923 look += set_kw.len();
924
925 while let Some(b) = self.input.get(look) {
927 if matches!(b, b' ' | b'\t' | b'\r' | b'\n' | b'\x0b' | b'\x0c') {
928 look += 1;
929 } else {
930 break;
931 }
932 }
933
934 if self.input.get(look) != Some(&b')') {
935 return normal;
936 }
937 look += 1;
938
939 self.cursor = look;
940 set
941 }
942
943 fn next_in_nowdoc(&mut self) -> Option<Token> {
944 let label = if let Some(LexerState::Nowdoc(label)) = self.state_stack.last() {
945 label.clone()
946 } else {
947 return None;
948 };
949
950 if self.cursor >= self.input.len() {
951 return Some(Token {
952 kind: TokenKind::Error,
953 span: Span::new(self.cursor, self.cursor),
954 });
955 }
956
957 let start = self.cursor;
958
959 if let Some(len) = self.check_heredoc_end(&label) {
961 self.advance_n(len);
962 self.state_stack.pop();
963
964 return Some(Token {
965 kind: TokenKind::EndHeredoc,
966 span: Span::new(start, self.cursor),
967 });
968 }
969
970 while let Some(c) = self.peek() {
972 self.advance();
973 if c == b'\n' {
974 if self.check_heredoc_end(&label).is_some() {
976 break;
977 }
978 }
979 }
980
981 Some(Token {
982 kind: TokenKind::EncapsedAndWhitespace,
983 span: Span::new(start, self.cursor),
984 })
985 }
986
987 fn next_in_heredoc(&mut self) -> Option<Token> {
988 let label = if let Some(LexerState::Heredoc(label)) = self.state_stack.last() {
989 label.clone()
990 } else {
991 return None;
992 };
993
994 if self.cursor >= self.input.len() {
995 return Some(Token {
996 kind: TokenKind::Error,
997 span: Span::new(self.cursor, self.cursor),
998 });
999 }
1000
1001 let start = self.cursor;
1002
1003 if let Some(len) = self.check_heredoc_end(&label) {
1005 self.advance_n(len);
1006 self.state_stack.pop();
1007
1008 return Some(Token {
1009 kind: TokenKind::EndHeredoc,
1010 span: Span::new(start, self.cursor),
1011 });
1012 }
1013
1014 if let Some(c) = self.peek() {
1016 if c == b'$' {
1017 self.advance();
1018 if let Some(next) = self.peek() {
1019 if next.is_ascii_alphabetic() || next == b'_' {
1020 let var_start = self.cursor - 1;
1021 self.read_identifier();
1022
1023 if self.peek() == Some(b'[') {
1025 self.state_stack.push(LexerState::VarOffset);
1026 } else if self.peek() == Some(b'-')
1027 && self.input.get(self.cursor + 1) == Some(&b'>')
1028 && let Some(next_next) = self.input.get(self.cursor + 2)
1029 && (next_next.is_ascii_alphabetic() || *next_next == b'_')
1030 {
1031 self.state_stack.push(LexerState::LookingForProperty);
1032 }
1033
1034 return Some(Token {
1035 kind: TokenKind::Variable,
1036 span: Span::new(var_start, self.cursor),
1037 });
1038 } else if next == b'{' {
1039 self.advance();
1040 self.state_stack.push(LexerState::LookingForVarName);
1041 return Some(Token {
1042 kind: TokenKind::DollarOpenCurlyBraces,
1043 span: Span::new(start, self.cursor),
1044 });
1045 }
1046 }
1047 } else if c == b'{' && self.input.get(self.cursor + 1) == Some(&b'$') {
1048 self.advance();
1049 self.state_stack.push(LexerState::Scripting);
1050 return Some(Token {
1051 kind: TokenKind::CurlyOpen,
1052 span: Span::new(start, self.cursor),
1053 });
1054 }
1055 }
1056
1057 while let Some(c) = self.peek() {
1059 if c == b'$'
1060 && let Some(next) = self.input.get(self.cursor + 1)
1061 && (next.is_ascii_alphabetic() || *next == b'_' || *next == b'{')
1062 {
1063 break;
1064 }
1065 if c == b'{' && self.input.get(self.cursor + 1) == Some(&b'$') {
1066 break;
1067 }
1068
1069 self.advance();
1070 if c == b'\n' && self.check_heredoc_end(&label).is_some() {
1071 break;
1072 }
1073
1074 if c == b'\\' && self.peek().is_some() {
1075 self.advance();
1076 }
1077 }
1078
1079 if self.cursor > start {
1080 Some(Token {
1081 kind: TokenKind::EncapsedAndWhitespace,
1082 span: Span::new(start, self.cursor),
1083 })
1084 } else {
1085 Some(Token {
1087 kind: TokenKind::EncapsedAndWhitespace,
1088 span: Span::new(start, self.cursor),
1089 })
1090 }
1091 }
1092
1093 fn next_in_halt_compiler(&mut self) -> Option<Token> {
1094 self.skip_whitespace();
1095
1096 if self.cursor >= self.input.len() {
1097 return Some(Token {
1098 kind: TokenKind::Eof,
1099 span: Span::new(self.cursor, self.cursor),
1100 });
1101 }
1102
1103 let start = self.cursor;
1104 let c = self.input[self.cursor];
1105 self.advance();
1106
1107 let kind = match c {
1108 b'(' => TokenKind::OpenParen,
1109 b')' => TokenKind::CloseParen,
1110 b';' => {
1111 self.state_stack.pop();
1112 self.state_stack.push(LexerState::RawData);
1113 TokenKind::SemiColon
1114 }
1115 b'#' => self.consume_single_line_comment(),
1116 b'/' => {
1117 if self.peek() == Some(b'/') {
1118 self.advance();
1119 self.consume_single_line_comment()
1120 } else if self.peek() == Some(b'*') {
1121 self.advance();
1122 self.consume_multi_line_comment()
1123 } else {
1124 TokenKind::Error
1125 }
1126 }
1127 _ => TokenKind::Error,
1128 };
1129
1130 Some(Token {
1131 kind,
1132 span: Span::new(start, self.cursor),
1133 })
1134 }
1135
1136 pub fn input_slice(&self, span: Span) -> &'src [u8] {
1137 &self.input[span.start..span.end]
1138 }
1139}
1140
1141impl<'src> Iterator for Lexer<'src> {
1142 type Item = Token;
1143
1144 fn next(&mut self) -> Option<Self::Item> {
1145 if let Some(LexerState::Initial) = self.state_stack.last() {
1147 let start = self.cursor;
1148 while self.cursor < self.input.len() {
1149 if self.input[self.cursor] != b'<' {
1150 let remaining = &self.input[self.cursor..];
1151 match memchr(b'<', remaining) {
1152 Some(pos) => self.cursor += pos,
1153 None => {
1154 self.cursor = self.input.len();
1155 break;
1156 }
1157 }
1158 }
1159
1160 if self.input[self.cursor..].starts_with(b"<?php") {
1161 if self.cursor > start {
1162 return Some(Token {
1163 kind: TokenKind::InlineHtml,
1164 span: Span::new(start, self.cursor),
1165 });
1166 }
1167
1168 let tag_start = self.cursor;
1169 self.state_stack.pop();
1170 self.state_stack.push(LexerState::Scripting);
1171 self.advance_n(5);
1172
1173 if self.peek().is_some_and(|c| c.is_ascii_whitespace()) {
1175 self.advance();
1176 }
1177
1178 return Some(Token {
1179 kind: TokenKind::OpenTag,
1180 span: Span::new(tag_start, self.cursor),
1181 });
1182 } else if self.input[self.cursor..].starts_with(b"<?=") {
1183 if self.cursor > start {
1184 return Some(Token {
1185 kind: TokenKind::InlineHtml,
1186 span: Span::new(start, self.cursor),
1187 });
1188 }
1189 let tag_start = self.cursor;
1190 self.state_stack.pop();
1191 self.state_stack.push(LexerState::Scripting);
1192 self.advance_n(3);
1193 return Some(Token {
1194 kind: TokenKind::OpenTagEcho,
1195 span: Span::new(tag_start, self.cursor),
1196 });
1197 }
1198 self.advance();
1199 }
1200
1201 if self.cursor > start {
1202 return Some(Token {
1203 kind: TokenKind::InlineHtml,
1204 span: Span::new(start, self.cursor),
1205 });
1206 }
1207
1208 return Some(Token {
1209 kind: TokenKind::Eof,
1210 span: Span::new(self.cursor, self.cursor),
1211 });
1212 }
1213
1214 if let Some(LexerState::DoubleQuotes) | Some(LexerState::Backquote) =
1216 self.state_stack.last()
1217 {
1218 return self.next_in_double_quotes();
1219 }
1220
1221 if let Some(LexerState::Heredoc(_)) = self.state_stack.last() {
1222 return self.next_in_heredoc();
1223 }
1224
1225 if let Some(LexerState::Nowdoc(_)) = self.state_stack.last() {
1226 return self.next_in_nowdoc();
1227 }
1228
1229 if let Some(LexerState::HaltCompiler) = self.state_stack.last() {
1230 return self.next_in_halt_compiler();
1231 }
1232
1233 if let Some(LexerState::VarOffset) = self.state_stack.last() {
1234 return self.next_in_var_offset(true);
1235 }
1236
1237 if let Some(LexerState::VarOffsetDollarCurly) = self.state_stack.last() {
1238 return self.next_in_var_offset(false);
1239 }
1240
1241 if let Some(LexerState::LookingForProperty) = self.state_stack.last() {
1242 return self.next_in_looking_for_property();
1243 }
1244
1245 if let Some(LexerState::LookingForVarName) = self.state_stack.last() {
1246 return self.next_in_looking_for_var_name();
1247 }
1248
1249 if let Some(LexerState::RawData) = self.state_stack.last() {
1250 if self.cursor >= self.input.len() {
1251 return Some(Token {
1252 kind: TokenKind::Eof,
1253 span: Span::new(self.cursor, self.cursor),
1254 });
1255 }
1256 let start = self.cursor;
1257 self.cursor = self.input.len(); return Some(Token {
1259 kind: TokenKind::InlineHtml,
1260 span: Span::new(start, self.cursor),
1261 });
1262 }
1263
1264 self.skip_whitespace();
1265
1266 if self.cursor >= self.input.len() {
1267 return Some(Token {
1268 kind: TokenKind::Eof,
1269 span: Span::new(self.cursor, self.cursor),
1270 });
1271 }
1272
1273 let start = self.cursor;
1274 let char = self.input[self.cursor];
1275 self.advance();
1276
1277 let kind = match char {
1278 b'$' => {
1279 if let Some(c) = self.peek() {
1280 if c.is_ascii_alphabetic() || c == b'_' || c >= 0x80 {
1281 self.read_identifier();
1282 TokenKind::Variable
1283 } else {
1284 TokenKind::Dollar
1285 }
1286 } else {
1287 TokenKind::Dollar
1288 }
1289 }
1290 b'\\' => TokenKind::NsSeparator,
1291 b'\'' => self.read_single_quoted(),
1292 b'"' => self.read_double_quoted(b'"', start),
1293 b'`' => {
1294 self.state_stack.push(LexerState::Backquote);
1295 TokenKind::Backtick
1296 }
1297 b'#' => {
1298 if self.peek() == Some(b'[') {
1299 self.advance();
1300 TokenKind::Attribute
1301 } else {
1302 self.consume_single_line_comment()
1303 }
1304 }
1305 b';' => TokenKind::SemiColon,
1306 b':' => {
1307 if self.peek() == Some(b':') {
1308 self.advance();
1309 TokenKind::DoubleColon
1310 } else {
1311 TokenKind::Colon
1312 }
1313 }
1314 b',' => TokenKind::Comma,
1315 b'{' => {
1316 self.state_stack.push(LexerState::Scripting);
1317 TokenKind::OpenBrace
1318 }
1319 b'}' => {
1320 if self.state_stack.len() > 1 {
1321 self.state_stack.pop();
1322 }
1323 TokenKind::CloseBrace
1324 }
1325 b'(' => {
1326 let saved_cursor = self.cursor;
1328 self.skip_whitespace();
1329
1330 let start_ident = self.cursor;
1331 self.read_identifier();
1332 let ident_len = self.cursor - start_ident;
1333
1334 if ident_len > 0 {
1335 let ident = &self.input[start_ident..self.cursor];
1336 self.skip_whitespace();
1337 if self.peek() == Some(b')') {
1338 let cast_kind = match ident.to_ascii_lowercase().as_slice() {
1339 b"int" | b"integer" => Some(TokenKind::IntCast),
1340 b"bool" | b"boolean" => Some(TokenKind::BoolCast),
1341 b"float" | b"double" | b"real" => Some(TokenKind::FloatCast),
1342 b"string" | b"binary" => Some(TokenKind::StringCast),
1343 b"array" => Some(TokenKind::ArrayCast),
1344 b"object" => Some(TokenKind::ObjectCast),
1345 b"unset" => Some(TokenKind::UnsetCast),
1346 b"void" => Some(TokenKind::VoidCast),
1347 _ => None,
1348 };
1349
1350 if let Some(k) = cast_kind {
1351 self.advance(); k
1353 } else {
1354 self.cursor = saved_cursor;
1355 TokenKind::OpenParen
1356 }
1357 } else {
1358 self.cursor = saved_cursor;
1359 TokenKind::OpenParen
1360 }
1361 } else {
1362 self.cursor = saved_cursor;
1363 TokenKind::OpenParen
1364 }
1365 }
1366 b')' => TokenKind::CloseParen,
1367 b'[' => TokenKind::OpenBracket,
1368 b']' => TokenKind::CloseBracket,
1369 b'+' => {
1370 if self.peek() == Some(b'+') {
1371 self.advance();
1372 TokenKind::Inc
1373 } else if self.peek() == Some(b'=') {
1374 self.advance();
1375 TokenKind::PlusEq
1376 } else {
1377 TokenKind::Plus
1378 }
1379 }
1380 b'-' => {
1381 if self.peek() == Some(b'>') {
1382 self.advance();
1383 TokenKind::Arrow
1384 } else if self.peek() == Some(b'-') {
1385 self.advance();
1386 TokenKind::Dec
1387 } else if self.peek() == Some(b'=') {
1388 self.advance();
1389 TokenKind::MinusEq
1390 } else {
1391 TokenKind::Minus
1392 }
1393 }
1394 b'*' => {
1395 if self.peek() == Some(b'*') {
1396 self.advance();
1397 if self.peek() == Some(b'=') {
1398 self.advance();
1399 TokenKind::PowEq
1400 } else {
1401 TokenKind::Pow
1402 }
1403 } else if self.peek() == Some(b'=') {
1404 self.advance();
1405 TokenKind::MulEq
1406 } else {
1407 TokenKind::Asterisk
1408 }
1409 }
1410 b'/' => {
1411 if self.peek() == Some(b'/') {
1412 self.advance();
1413 self.consume_single_line_comment()
1414 } else if self.peek() == Some(b'*') {
1415 self.advance();
1416 self.consume_multi_line_comment()
1417 } else if self.peek() == Some(b'=') {
1418 self.advance();
1419 TokenKind::DivEq
1420 } else {
1421 TokenKind::Slash
1422 }
1423 }
1424 b'%' => {
1425 if self.peek() == Some(b'=') {
1426 self.advance();
1427 TokenKind::ModEq
1428 } else {
1429 TokenKind::Percent
1430 }
1431 }
1432 b'.' => {
1433 if self.peek() == Some(b'=') {
1434 self.advance();
1435 TokenKind::ConcatEq
1436 } else if self.peek() == Some(b'.') {
1437 self.advance();
1438 if self.peek() == Some(b'.') {
1439 self.advance();
1440 TokenKind::Ellipsis
1441 } else {
1442 TokenKind::Dot
1443 }
1444 } else if let Some(c) = self.peek()
1445 && c.is_ascii_digit()
1446 {
1447 self.cursor -= 1;
1448 self.read_number()
1449 } else {
1450 TokenKind::Dot
1451 }
1452 }
1453 b'=' => {
1454 if self.peek() == Some(b'=') {
1455 self.advance();
1456 if self.peek() == Some(b'=') {
1457 self.advance();
1458 TokenKind::EqEqEq
1459 } else {
1460 TokenKind::EqEq
1461 }
1462 } else if self.peek() == Some(b'>') {
1463 self.advance();
1464 TokenKind::DoubleArrow
1465 } else {
1466 TokenKind::Eq
1467 }
1468 }
1469 b'!' => {
1470 if self.peek() == Some(b'=') {
1471 self.advance();
1472 if self.peek() == Some(b'=') {
1473 self.advance();
1474 TokenKind::BangEqEq
1475 } else {
1476 TokenKind::BangEq
1477 }
1478 } else {
1479 TokenKind::Bang
1480 }
1481 }
1482 b'<' => {
1483 if self.peek() == Some(b'<') && self.input.get(self.cursor + 1) == Some(&b'<') {
1484 self.advance(); self.advance(); return Some(self.read_heredoc_start(start));
1487 } else if self.peek() == Some(b'=') {
1488 self.advance();
1489 if self.peek() == Some(b'>') {
1490 self.advance();
1491 TokenKind::Spaceship
1492 } else {
1493 TokenKind::LtEq
1494 }
1495 } else if self.peek() == Some(b'<') {
1496 self.advance();
1497 if self.peek() == Some(b'=') {
1498 self.advance();
1499 TokenKind::SlEq
1500 } else {
1501 TokenKind::Sl
1502 }
1503 } else if self.peek() == Some(b'>') {
1504 self.advance();
1505 TokenKind::BangEq
1506 } else {
1507 TokenKind::Lt
1508 }
1509 }
1510 b'>' => {
1511 if self.peek() == Some(b'=') {
1512 self.advance();
1513 TokenKind::GtEq
1514 } else if self.peek() == Some(b'>') {
1515 self.advance();
1516 if self.peek() == Some(b'=') {
1517 self.advance();
1518 TokenKind::SrEq
1519 } else {
1520 TokenKind::Sr
1521 }
1522 } else {
1523 TokenKind::Gt
1524 }
1525 }
1526 b'&' => {
1527 if self.peek() == Some(b'&') {
1528 self.advance();
1529 TokenKind::AmpersandAmpersand
1530 } else if self.peek() == Some(b'=') {
1531 self.advance();
1532 TokenKind::AndEq
1533 } else if self.is_followed_by_var_or_vararg() {
1534 TokenKind::AmpersandFollowedByVarOrVararg
1535 } else {
1536 TokenKind::AmpersandNotFollowedByVarOrVararg
1537 }
1538 }
1539 b'|' => {
1540 if self.peek() == Some(b'|') {
1541 self.advance();
1542 TokenKind::PipePipe
1543 } else if self.peek() == Some(b'=') {
1544 self.advance();
1545 TokenKind::OrEq
1546 } else {
1547 TokenKind::Pipe
1548 }
1549 }
1550 b'^' => {
1551 if self.peek() == Some(b'=') {
1552 self.advance();
1553 TokenKind::XorEq
1554 } else {
1555 TokenKind::Caret
1556 }
1557 }
1558 b'~' => TokenKind::BitNot,
1559 b'@' => TokenKind::At,
1560 b'?' => {
1561 if self.peek() == Some(b'>') {
1562 self.advance();
1563 self.state_stack.pop();
1564 self.state_stack.push(LexerState::Initial);
1565 TokenKind::CloseTag
1566 } else if self.peek() == Some(b'?') {
1567 self.advance();
1568 if self.peek() == Some(b'=') {
1569 self.advance();
1570 TokenKind::CoalesceEq
1571 } else {
1572 TokenKind::Coalesce
1573 }
1574 } else if self.peek() == Some(b'-')
1575 && self.input.get(self.cursor + 1) == Some(&b'>')
1576 {
1577 self.advance();
1578 self.advance();
1579 TokenKind::NullSafeArrow
1580 } else {
1581 TokenKind::Question
1582 }
1583 }
1584 c if c.is_ascii_digit() => {
1585 self.cursor -= 1;
1586 self.read_number()
1587 }
1588 c if c.is_ascii_alphabetic() || c == b'_' || c >= 0x80 => {
1589 if (c == b'b' || c == b'B')
1591 && let Some(next) = self.peek()
1592 {
1593 if next == b'\'' {
1594 self.advance(); return Some(Token {
1596 kind: self.read_single_quoted(),
1597 span: Span::new(start, self.cursor),
1598 });
1599 } else if next == b'"' {
1600 let quote_pos = self.cursor;
1601 self.advance(); return Some(Token {
1603 kind: self.read_double_quoted(b'"', quote_pos),
1604 span: Span::new(start, self.cursor),
1605 });
1606 }
1607 }
1608
1609 self.read_identifier();
1610 let text = &self.input[start..self.cursor];
1611
1612 if self.mode == LexerMode::LookingForProperty {
1613 self.mode = LexerMode::Standard;
1614 TokenKind::Identifier
1615 } else {
1616 let is_all_lowercase = text.iter().all(|c| !c.is_ascii_uppercase());
1617
1618 let mut kind = if is_all_lowercase {
1619 keyword_lookup(text)
1620 } else {
1621 keyword_lookup(&text.to_ascii_lowercase())
1622 };
1623
1624 match kind {
1625 TokenKind::Yield => {
1626 let mut look = self.cursor;
1627 while let Some(b) = self.input.get(look) {
1628 if matches!(b, b' ' | b'\t' | b'\r' | b'\n' | b'\x0b' | b'\x0c') {
1629 look += 1;
1630 } else {
1631 break;
1632 }
1633 }
1634 let from_kw = b"from";
1635 let is_from = self
1636 .input
1637 .get(look..look + from_kw.len())
1638 .map(|s| {
1639 s.iter()
1640 .zip(from_kw.iter())
1641 .all(|(c, k)| c.to_ascii_lowercase() == *k)
1642 })
1643 .unwrap_or(false)
1644 && !self
1645 .input
1646 .get(look + from_kw.len())
1647 .map(|c| c.is_ascii_alphanumeric() || *c == b'_' || *c >= 0x80)
1648 .unwrap_or(false);
1649
1650 if is_from {
1651 self.cursor = look + from_kw.len();
1652 kind = TokenKind::YieldFrom;
1653 }
1654 }
1655 TokenKind::Public => {
1656 if text[0].eq_ignore_ascii_case(&b'p') {
1657 kind = self
1658 .check_set_visibility(TokenKind::Public, TokenKind::PublicSet);
1659 }
1660 }
1661 TokenKind::Protected => {
1662 kind = self.check_set_visibility(
1663 TokenKind::Protected,
1664 TokenKind::ProtectedSet,
1665 );
1666 }
1667 TokenKind::Private => {
1668 kind = self
1669 .check_set_visibility(TokenKind::Private, TokenKind::PrivateSet);
1670 }
1671 TokenKind::HaltCompiler => {
1672 self.state_stack.pop();
1673 self.state_stack.push(LexerState::HaltCompiler);
1674 }
1675 _ => {}
1676 }
1677 kind
1678 }
1679 }
1680 _ => TokenKind::Error,
1681 };
1682
1683 Some(Token {
1684 kind,
1685 span: Span::new(start, self.cursor),
1686 })
1687 }
1688}