1use memchr::{memchr2, memmem};
2use php_ast::Span;
3
4use crate::token::{resolve_keyword, TokenKind};
5
/// Build a 256-entry lookup table marking the bytes PHP treats as
/// whitespace: space, tab, CR, LF, and form feed (0x0C).
const fn make_whitespace_table() -> [bool; 256] {
    let mut table = [false; 256];
    let whitespace: [u8; 5] = [b' ', b'\t', b'\r', b'\n', 0x0C];
    let mut i = 0;
    while i < whitespace.len() {
        table[whitespace[i] as usize] = true;
        i += 1;
    }
    table
}
22
/// Build a lookup table of bytes that may begin an identifier:
/// ASCII letters, underscore, and every byte >= 0x80 (so multi-byte
/// UTF-8 sequences are accepted, matching PHP's identifier rules).
const fn make_ident_start_table() -> [bool; 256] {
    let mut table = [false; 256];
    let mut i = 0usize;
    while i < 256 {
        let b = i as u8;
        table[i] = b.is_ascii_alphabetic() || b == b'_' || b >= 0x80;
        i += 1;
    }
    table
}
33
/// Build a lookup table of bytes that may continue an identifier:
/// ASCII alphanumerics, underscore, and every byte >= 0x80.
const fn make_ident_continue_table() -> [bool; 256] {
    let mut table = [false; 256];
    let mut i = 0usize;
    while i < 256 {
        let b = i as u8;
        table[i] = b.is_ascii_alphanumeric() || b == b'_' || b >= 0x80;
        i += 1;
    }
    table
}
48
// Byte-indexed classification tables, computed once at compile time and
// shared by the hot scanning loops below.
static IS_PHP_WHITESPACE: [bool; 256] = make_whitespace_table();
static IS_IDENT_START: [bool; 256] = make_ident_start_table();
static IS_IDENT_CONTINUE: [bool; 256] = make_ident_continue_table();
52
/// A recoverable lexing problem (e.g. a malformed numeric literal),
/// recorded in `Lexer::errors` while lexing continues.
#[derive(Debug, Clone, PartialEq)]
pub struct LexerError {
    /// Human-readable description of the problem.
    pub message: String,
    /// Source byte range the error applies to.
    pub span: Span,
}
58
/// A single lexed token: its kind plus the byte range it covers in the
/// source. The token text is not stored; use `Lexer::token_text`.
#[derive(Debug, Clone, PartialEq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}
64
65impl Token {
66 pub fn new(kind: TokenKind, span: Span) -> Self {
67 Self { kind, span }
68 }
69
70 pub fn eof(offset: u32) -> Self {
71 Self {
72 kind: TokenKind::Eof,
73 span: Span::new(offset, offset),
74 }
75 }
76}
77
/// Which sub-language the lexer is currently reading. The lexer flips
/// between these when it crosses `<?php`/`<?=` and `?>` tags.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LexerMode {
    /// Raw HTML outside of PHP tags; emitted as `InlineHtml` tokens.
    InlineHtml,
    /// Inside PHP code.
    Php,
}
83
/// Hand-written PHP lexer over a borrowed source string.
///
/// Provides one- and two-token lookahead (`peek`, `peek2`) on top of
/// `next_token`. Recoverable problems are accumulated in `errors`
/// rather than aborting the lex.
pub struct Lexer<'src> {
    source: &'src str,
    mode: LexerMode,
    /// Current byte offset into `source`.
    pos: usize,
    /// One-token lookahead buffer (filled by `peek`).
    peeked: Option<Token>,
    /// Two-token lookahead buffer (filled by `peek2`).
    peeked2: Option<Token>,
    pub errors: Vec<LexerError>,
}
92
93#[inline(always)]
94fn is_ident_start(b: u8) -> bool {
95 IS_IDENT_START[b as usize]
96}
97
98#[inline(always)]
99fn is_ident_continue(b: u8) -> bool {
100 IS_IDENT_CONTINUE[b as usize]
101}
102
103impl<'src> Lexer<'src> {
104 pub fn new(source: &'src str) -> Self {
105 let pos = if source.starts_with("#!") {
107 source.find('\n').map(|p| p + 1).unwrap_or(source.len())
108 } else {
109 0
110 };
111
112 let remaining = &source[pos..];
114 let mode = if remaining.starts_with("<?php") || remaining.starts_with("<?=") {
115 LexerMode::Php
116 } else {
117 LexerMode::InlineHtml
118 };
119
120 Self {
121 source,
122 mode,
123 pos,
124 peeked: None,
125 peeked2: None,
126 errors: Vec::new(),
127 }
128 }
129
130 pub fn new_at(source: &'src str, offset: usize) -> Self {
135 Self {
136 source,
137 mode: LexerMode::Php,
138 pos: offset,
139 peeked: None,
140 peeked2: None,
141 errors: Vec::new(),
142 }
143 }
144
    /// The full source string this lexer was created over.
    pub fn source(&self) -> &'src str {
        self.source
    }
148
149 pub fn peek(&mut self) -> &Token {
150 if self.peeked.is_none() {
151 self.peeked = Some(self.read_next_token());
152 }
153 self.peeked.as_ref().expect("peeked is Some: set above")
154 }
155
156 pub fn peek2(&mut self) -> &Token {
158 if self.peeked.is_none() {
160 self.peeked = Some(self.read_next_token());
161 }
162 if self.peeked2.is_none() {
163 self.peeked2 = Some(self.read_next_token());
164 }
165 self.peeked2.as_ref().expect("peeked2 is Some: set above")
166 }
167
168 pub fn next_token(&mut self) -> Token {
169 if let Some(token) = self.peeked.take() {
170 self.peeked = self.peeked2.take();
171 return token;
172 }
173 self.read_next_token()
174 }
175
    /// The slice of source text covered by `token`'s span.
    pub fn token_text(&self, token: &Token) -> &'src str {
        &self.source[token.span.start as usize..token.span.end as usize]
    }
180
181 fn read_next_token(&mut self) -> Token {
182 if self.pos >= self.source.len() {
183 return Token::eof(self.source.len() as u32);
184 }
185
186 match self.mode {
187 LexerMode::InlineHtml => self.lex_inline_html(),
188 LexerMode::Php => self.lex_php(),
189 }
190 }
191
    /// Lex a run of raw HTML up to (but not including) the next PHP open
    /// tag (`<?php` or `<?=`), switching the lexer into PHP mode when one
    /// is found. With no open tag, the rest of the file is one HTML token.
    fn lex_inline_html(&mut self) -> Token {
        let start = self.pos;
        let bytes = self.source.as_bytes();

        // Scan forward for a '<' that actually begins an open tag; a bare
        // '<' not followed by "?php"/"?=" remains part of the HTML.
        let mut search = self.pos;
        let tag_pos = loop {
            match memchr::memchr(b'<', &bytes[search..]) {
                None => break None,
                Some(offset) => {
                    let p = search + offset;
                    let rest = &bytes[p..];
                    if rest.starts_with(b"<?php") || rest.starts_with(b"<?=") {
                        // Offset of the tag relative to the current cursor.
                        break Some(p - self.pos);
                    }
                    search = p + 1;
                }
            }
        };

        if let Some(tag_pos) = tag_pos {
            if tag_pos == 0 {
                // Open tag sits exactly at the cursor: no HTML to emit,
                // lex the PHP token directly.
                self.mode = LexerMode::Php;
                return self.lex_php();
            }
            let end = self.pos + tag_pos;
            self.pos = end;
            self.mode = LexerMode::Php;
            Token::new(TokenKind::InlineHtml, Span::new(start as u32, end as u32))
        } else {
            // No open tag anywhere: consume to EOF.
            let end = self.source.len();
            self.pos = end;
            Token::new(TokenKind::InlineHtml, Span::new(start as u32, end as u32))
        }
    }
232
233 fn lex_php(&mut self) -> Token {
234 let remaining = &self.source[self.pos..];
235
236 if let Some(token) = self.try_lex_heredoc(remaining) {
238 return token;
239 }
240
241 self.skip_whitespace_and_comments();
243
244 if self.pos >= self.source.len() {
245 return Token::eof(self.source.len() as u32);
246 }
247
248 self.scan_token()
249 }
250
    /// Advance past whitespace, `//` and `#` line comments, and `/* */`
    /// block comments, leaving `self.pos` on the first significant byte
    /// (or at EOF). `#[` is attribute syntax, not a comment, and stops
    /// the skip.
    fn skip_whitespace_and_comments(&mut self) {
        let bytes = self.source.as_bytes();
        loop {
            while self.pos < bytes.len() && IS_PHP_WHITESPACE[bytes[self.pos] as usize] {
                self.pos += 1;
            }

            if self.pos >= bytes.len() {
                break;
            }

            // `//` line comment (may be terminated by `?>` as well as '\n').
            if bytes[self.pos] == b'/' && self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'/'
            {
                self.pos += 2;
                Self::skip_line_comment_body(bytes, &mut self.pos);
                continue;
            }

            // `/* ... */` block comment; an unterminated one runs to EOF.
            if bytes[self.pos] == b'/' && self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'*'
            {
                self.pos += 2;
                match memmem::find(&bytes[self.pos..], b"*/") {
                    Some(end) => self.pos += end + 2,
                    None => self.pos = bytes.len(),
                }
                continue;
            }

            // `#` line comment — but `#[` starts an attribute, which must
            // be surfaced as a token, so it falls through.
            if bytes[self.pos] == b'#'
                && !(self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'[')
            {
                self.pos += 1;
                Self::skip_line_comment_body(bytes, &mut self.pos);
                continue;
            }

            break;
        }
    }
297
    /// Scan a single PHP token starting at `self.pos`.
    ///
    /// Precondition: `self.pos` is in bounds and sits on a significant
    /// byte (whitespace/comments were skipped by `lex_php`).
    fn scan_token(&mut self) -> Token {
        let start = self.pos;
        let bytes = self.source.as_bytes();
        let b = bytes[start];

        match b {
            b'+' => {
                if self.check_at(1, b'+') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PlusPlus, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PlusEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Plus, start)
                }
            }
            b'-' => {
                if self.check_at(1, b'-') {
                    self.pos = start + 2;
                    self.tok(TokenKind::MinusMinus, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::MinusEquals, start)
                } else if self.check_at(1, b'>') {
                    self.pos = start + 2;
                    self.tok(TokenKind::Arrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Minus, start)
                }
            }
            b'*' => {
                if self.check_at(1, b'*') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::StarStarEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::StarStar, start)
                    }
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::StarEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Star, start)
                }
            }
            b'/' => {
                // Comments were already consumed, so '/' here is division.
                if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::SlashEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Slash, start)
                }
            }
            b'%' => {
                if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PercentEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Percent, start)
                }
            }
            b'.' => {
                // Leading-dot float literal: `.5`, `.123e4`.
                if start + 1 < bytes.len() && bytes[start + 1].is_ascii_digit() {
                    self.pos = start + 1;
                    self.scan_digits(u8::is_ascii_digit);
                    if self.pos < bytes.len() && matches!(bytes[self.pos], b'e' | b'E') {
                        self.try_scan_exponent();
                    }
                    // A trailing underscore makes the literal malformed.
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    return self.tok(TokenKind::FloatLiteralLeadingDot, start);
                }
                if self.check_at(1, b'.') && self.check_at(2, b'.') {
                    self.pos = start + 3;
                    self.tok(TokenKind::Ellipsis, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::DotEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Dot, start)
                }
            }
            b'=' => {
                if self.check_at(1, b'=') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::EqualsEqualsEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::EqualsEquals, start)
                    }
                } else if self.check_at(1, b'>') {
                    self.pos = start + 2;
                    self.tok(TokenKind::FatArrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Equals, start)
                }
            }
            b'!' => {
                if self.check_at(1, b'=') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::BangEqualsEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::BangEquals, start)
                    }
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Bang, start)
                }
            }
            // '<' has many continuations (shifts, heredoc, open tags).
            b'<' => self.scan_less_than(start),
            b'>' => {
                if self.check_at(1, b'>') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::ShiftRightEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::ShiftRight, start)
                    }
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::GreaterThanEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::GreaterThan, start)
                }
            }
            b'&' => {
                if self.check_at(1, b'&') {
                    self.pos = start + 2;
                    self.tok(TokenKind::AmpersandAmpersand, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::AmpersandEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Ampersand, start)
                }
            }
            b'|' => {
                if self.check_at(1, b'|') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PipePipe, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PipeEquals, start)
                } else if self.check_at(1, b'>') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PipeArrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Pipe, start)
                }
            }
            b'^' => {
                if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::CaretEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Caret, start)
                }
            }
            b'~' => {
                self.pos = start + 1;
                self.tok(TokenKind::Tilde, start)
            }
            b'?' => {
                if self.check_at(1, b'>') {
                    // `?>` ends PHP mode; subsequent input is inline HTML.
                    self.pos = start + 2;
                    self.mode = LexerMode::InlineHtml;
                    self.tok(TokenKind::CloseTag, start)
                } else if self.check_at(1, b'?') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::CoalesceEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::QuestionQuestion, start)
                    }
                } else if self.check_at(1, b'-') && self.check_at(2, b'>') {
                    self.pos = start + 3;
                    self.tok(TokenKind::NullsafeArrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Question, start)
                }
            }
            b':' => {
                if self.check_at(1, b':') {
                    self.pos = start + 2;
                    self.tok(TokenKind::DoubleColon, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Colon, start)
                }
            }
            b'@' => {
                self.pos = start + 1;
                self.tok(TokenKind::At, start)
            }
            b'\\' => {
                self.pos = start + 1;
                self.tok(TokenKind::Backslash, start)
            }
            b'#' => {
                if self.check_at(1, b'[') {
                    // `#[` attribute opener; plain `#` comments were already
                    // consumed by skip_whitespace_and_comments.
                    self.pos = start + 2;
                    self.tok(TokenKind::HashBracket, start)
                } else {
                    // Defensive: skip a stray `#` and continue lexing.
                    self.pos = start + 1;
                    self.read_next_token()
                }
            }

            b'(' => {
                self.pos = start + 1;
                self.tok(TokenKind::LeftParen, start)
            }
            b')' => {
                self.pos = start + 1;
                self.tok(TokenKind::RightParen, start)
            }
            b'[' => {
                self.pos = start + 1;
                self.tok(TokenKind::LeftBracket, start)
            }
            b']' => {
                self.pos = start + 1;
                self.tok(TokenKind::RightBracket, start)
            }
            b'{' => {
                self.pos = start + 1;
                self.tok(TokenKind::LeftBrace, start)
            }
            b'}' => {
                self.pos = start + 1;
                self.tok(TokenKind::RightBrace, start)
            }
            b';' => {
                self.pos = start + 1;
                self.tok(TokenKind::Semicolon, start)
            }
            b',' => {
                self.pos = start + 1;
                self.tok(TokenKind::Comma, start)
            }

            b'\'' => self.scan_single_quoted_string(),
            b'"' => self.scan_double_quoted_string(),
            b'`' => self.scan_backtick_string(),

            b'$' => {
                // `$name` is a variable; a lone `$` is its own token
                // (e.g. variable variables `$$x`).
                if start + 1 < bytes.len() && is_ident_start(bytes[start + 1]) {
                    self.pos = start + 2;
                    while self.pos < bytes.len() && is_ident_continue(bytes[self.pos]) {
                        self.pos += 1;
                    }
                    self.tok(TokenKind::Variable, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Dollar, start)
                }
            }

            b'0'..=b'9' => self.scan_number(),

            _ if is_ident_start(b) => {
                // `b`/`B` may prefix binary strings and heredocs:
                // b'…', b"…", b<<<LABEL.
                if b == b'b' || b == b'B' {
                    if self.check_at(1, b'\'') {
                        return self.scan_single_quoted_string();
                    }
                    if self.check_at(1, b'"') {
                        return self.scan_double_quoted_string();
                    }
                    if self.check_at(1, b'<') && self.check_at(2, b'<') && self.check_at(3, b'<') {
                        let remaining = &self.source[self.pos..];
                        if let Some(token) = self.try_lex_heredoc(remaining) {
                            return token;
                        }
                    }
                }
                self.scan_identifier()
            }

            _ => {
                // Unrecognized byte: skip it and lex whatever follows.
                self.pos = start + 1;
                self.read_next_token()
            }
        }
    }
619
    /// Disambiguate tokens beginning with `<`: shift operators, heredoc
    /// openers, comparisons, spaceship, and PHP open tags.
    fn scan_less_than(&mut self, start: usize) -> Token {
        if self.check_at(1, b'<') {
            if self.check_at(2, b'<') {
                // `<<<` may open a heredoc/nowdoc; if not, fall through
                // and treat it as a shift.
                let remaining = &self.source[self.pos..];
                if let Some(token) = self.try_lex_heredoc(remaining) {
                    return token;
                }
            }
            if self.check_at(2, b'=') {
                self.pos = start + 3;
                return self.tok(TokenKind::ShiftLeftEquals, start);
            }
            self.pos = start + 2;
            return self.tok(TokenKind::ShiftLeft, start);
        }
        if self.check_at(1, b'=') {
            if self.check_at(2, b'>') {
                self.pos = start + 3;
                return self.tok(TokenKind::Spaceship, start);
            }
            self.pos = start + 2;
            return self.tok(TokenKind::LessThanEquals, start);
        }
        if self.check_at(1, b'?') {
            // Open tags can also appear mid-stream (after a close tag).
            if self.source[self.pos..].starts_with("<?php") {
                self.pos = start + 5;
                return self.tok(TokenKind::OpenTag, start);
            }
            if self.source[self.pos..].starts_with("<?=") {
                self.pos = start + 3;
                return self.tok(TokenKind::OpenTag, start);
            }
        }
        self.pos = start + 1;
        self.tok(TokenKind::LessThan, start)
    }
659
660 fn scan_single_quoted_string(&mut self) -> Token {
663 let start = self.pos;
664 let bytes = self.source.as_bytes();
665 let mut p = self.pos;
666 if bytes[p] == b'b' || bytes[p] == b'B' {
668 p += 1;
669 }
670 p += 1; loop {
672 match memchr2(b'\\', b'\'', &bytes[p..]) {
673 None => {
674 self.pos = start + 1;
676 return self.read_next_token();
677 }
678 Some(offset) => {
679 p += offset;
680 match bytes[p] {
681 b'\\' => {
682 p += 1;
683 if p < bytes.len() {
684 p += 1;
685 }
686 }
687 _ => {
688 p += 1;
690 break;
691 }
692 }
693 }
694 }
695 }
696 self.pos = p;
697 self.tok(TokenKind::SingleQuotedString, start)
698 }
699
700 fn scan_double_quoted_string(&mut self) -> Token {
701 let start = self.pos;
702 let bytes = self.source.as_bytes();
703 let mut p = self.pos;
704 if bytes[p] == b'b' || bytes[p] == b'B' {
706 p += 1;
707 }
708 p += 1; loop {
710 match memchr2(b'\\', b'"', &bytes[p..]) {
711 None => {
712 self.pos = start + 1;
714 return self.read_next_token();
715 }
716 Some(offset) => {
717 p += offset;
718 match bytes[p] {
719 b'\\' => {
720 p += 1;
721 if p < bytes.len() {
722 p += 1;
723 }
724 }
725 _ => {
726 p += 1;
728 break;
729 }
730 }
731 }
732 }
733 }
734 self.pos = p;
735 self.tok(TokenKind::DoubleQuotedString, start)
736 }
737
738 fn scan_backtick_string(&mut self) -> Token {
739 let start = self.pos;
740 let bytes = self.source.as_bytes();
741 let mut p = self.pos;
742 p += 1; loop {
744 match memchr2(b'\\', b'`', &bytes[p..]) {
745 None => {
746 self.pos = start + 1;
748 return self.read_next_token();
749 }
750 Some(offset) => {
751 p += offset;
752 match bytes[p] {
753 b'\\' => {
754 p += 1;
755 if p < bytes.len() {
756 p += 1;
757 }
758 }
759 _ => {
760 p += 1;
762 break;
763 }
764 }
765 }
766 }
767 }
768 self.pos = p;
769 self.tok(TokenKind::BacktickString, start)
770 }
771
    /// Scan a numeric literal: decimal, hex (`0x`), binary (`0b`), octal
    /// (legacy `0…` and modern `0o…`) integers, plus floats with optional
    /// fraction and exponent; `_` digit separators are allowed between
    /// digits. Malformed literals (stray underscores) are consumed whole
    /// and reported as `InvalidNumericLiteral`.
    fn scan_number(&mut self) -> Token {
        let start = self.pos;
        let bytes = self.source.as_bytes();

        // Radix-prefixed forms. On a prefix with no valid digits, rewind
        // and fall through to scan the `0` as a plain decimal literal.
        if bytes[start] == b'0' && start + 1 < bytes.len() {
            match bytes[start + 1] {
                b'x' | b'X' => {
                    self.pos = start + 2;
                    // Underscore may not directly follow the prefix…
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    if self.scan_digits(u8::is_ascii_hexdigit) {
                        // …nor trail the digits.
                        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                            self.consume_invalid_numeric_rest();
                            return self.invalid_numeric(start);
                        }
                        return self.tok(TokenKind::HexIntLiteral, start);
                    }
                    self.pos = start;
                }
                b'b' | b'B' => {
                    self.pos = start + 2;
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    if self.scan_digits(|b| b == &b'0' || b == &b'1') {
                        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                            self.consume_invalid_numeric_rest();
                            return self.invalid_numeric(start);
                        }
                        return self.tok(TokenKind::BinIntLiteral, start);
                    }
                    self.pos = start;
                }
                b'o' | b'O' => {
                    self.pos = start + 2;
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    if self.scan_digits(|b| (b'0'..=b'7').contains(b)) {
                        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                            self.consume_invalid_numeric_rest();
                            return self.invalid_numeric(start);
                        }
                        return self.tok(TokenKind::OctIntLiteralNew, start);
                    }
                    self.pos = start;
                }
                _ => {}
            }
        }

        // Decimal integer part.
        self.pos = start;
        self.scan_digits(u8::is_ascii_digit);
        let integer_end = self.pos;
        let mut kind = TokenKind::IntLiteral;

        // Legacy octal: leading `0` and every digit in 0..=7.
        if bytes[start] == b'0' && integer_end > start + 1 {
            let slice = &bytes[start..integer_end];
            if slice.iter().all(|&b| (b'0'..=b'7').contains(&b)) {
                kind = TokenKind::OctIntLiteral;
            }
        }

        // Fractional part.
        if self.pos < bytes.len() && bytes[self.pos] == b'.' {
            if self.pos + 1 < bytes.len() && bytes[self.pos + 1].is_ascii_digit() {
                self.pos += 1;
                self.scan_digits(u8::is_ascii_digit);
                kind = TokenKind::FloatLiteralSimple;
            } else if self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'_' {
                self.consume_invalid_numeric_rest();
                return self.invalid_numeric(start);
            } else if self.pos + 1 >= bytes.len() || bytes[self.pos + 1] != b'.' {
                // Bare trailing dot (`42.`): the dot is consumed but the
                // token stays IntLiteral. NOTE(review): PHP itself lexes
                // `42.` as a float — confirm the parser compensates.
                self.pos += 1;
                kind = TokenKind::IntLiteral;
            }
        }

        // Exponent part (`e`/`E`, optional sign, digits).
        if self.pos < bytes.len() && matches!(bytes[self.pos], b'e' | b'E') {
            if self.try_scan_exponent() {
                kind = TokenKind::FloatLiteral;
            } else if self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'_' {
                self.consume_invalid_numeric_rest();
                return self.invalid_numeric(start);
            }
        }

        // A trailing underscore invalidates the whole literal.
        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
            self.consume_invalid_numeric_rest();
            return self.invalid_numeric(start);
        }

        self.tok(kind, start)
    }
884
885 fn scan_digits(&mut self, is_valid: fn(&u8) -> bool) -> bool {
888 let bytes = self.source.as_bytes();
889 if self.pos >= bytes.len() || !is_valid(&bytes[self.pos]) {
890 return false;
891 }
892 self.pos += 1;
893 loop {
894 if self.pos >= bytes.len() {
895 break;
896 }
897 if is_valid(&bytes[self.pos]) {
898 self.pos += 1;
899 } else if bytes[self.pos] == b'_'
900 && self.pos + 1 < bytes.len()
901 && is_valid(&bytes[self.pos + 1])
902 {
903 self.pos += 2;
904 } else {
905 break;
906 }
907 }
908 true
909 }
910
911 fn try_scan_exponent(&mut self) -> bool {
914 let bytes = self.source.as_bytes();
915 let saved = self.pos;
916 self.pos += 1; if self.pos < bytes.len() && matches!(bytes[self.pos], b'+' | b'-') {
920 self.pos += 1;
921 }
922
923 if self.scan_digits(u8::is_ascii_digit) {
925 true
926 } else {
927 self.pos = saved;
928 false
929 }
930 }
931
932 fn scan_identifier(&mut self) -> Token {
935 let start = self.pos;
936 let bytes = self.source.as_bytes();
937 self.pos += 1; while self.pos < bytes.len() && is_ident_continue(bytes[self.pos]) {
939 self.pos += 1;
940 }
941 let text = &self.source[start..self.pos];
942 let kind = resolve_keyword(text).unwrap_or(TokenKind::Identifier);
943 self.tok(kind, start)
944 }
945
946 #[inline]
952 fn skip_line_comment_body(bytes: &[u8], pos: &mut usize) {
953 loop {
954 match memchr2(b'\n', b'?', &bytes[*pos..]) {
955 None => {
956 *pos = bytes.len();
957 return;
958 }
959 Some(offset) => {
960 let p = *pos + offset;
961 if bytes[p] == b'\n' {
962 *pos = p; return;
964 }
965 if p + 1 < bytes.len() && bytes[p + 1] == b'>' {
967 *pos = p; return;
969 }
970 *pos = p + 1;
972 }
973 }
974 }
975 }
976
977 #[inline]
978 fn check_at(&self, offset: usize, expected: u8) -> bool {
979 self.source.as_bytes().get(self.pos + offset) == Some(&expected)
980 }
981
    /// Build a token of `kind` spanning from `start` to the current cursor.
    #[inline]
    fn tok(&self, kind: TokenKind, start: usize) -> Token {
        Token::new(kind, Span::new(start as u32, self.pos as u32))
    }
986
987 fn invalid_numeric(&mut self, start: usize) -> Token {
988 let span = Span::new(start as u32, self.pos as u32);
989 self.errors.push(LexerError {
990 message: "Invalid numeric literal".to_string(),
991 span,
992 });
993 Token::new(TokenKind::InvalidNumericLiteral, span)
994 }
995
996 fn consume_invalid_numeric_rest(&mut self) {
998 let bytes = self.source.as_bytes();
999 while self.pos < bytes.len() {
1000 let b = bytes[self.pos];
1001 if b.is_ascii_alphanumeric() || b == b'_' || b == b'.' || b == b'+' || b == b'-' {
1002 if (b == b'+' || b == b'-') && self.pos > 0 {
1004 let prev = bytes[self.pos - 1];
1005 if prev != b'e' && prev != b'E' {
1006 break;
1007 }
1008 }
1009 self.pos += 1;
1010 } else {
1011 break;
1012 }
1013 }
1014 }
1015
1016 fn try_lex_heredoc(&mut self, remaining: &str) -> Option<Token> {
1020 let trimmed = remaining.trim_start_matches(|c: char| {
1022 c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\x0C'
1023 });
1024 let ws_len = remaining.len() - trimmed.len();
1025
1026 let (after_prefix, prefix_len) = if (trimmed.starts_with("b<<<")
1028 || trimmed.starts_with("B<<<"))
1029 && !trimmed[1..].starts_with("<<<>")
1030 {
1031 (&trimmed[1..], 1)
1032 } else {
1033 (trimmed, 0)
1034 };
1035
1036 if !after_prefix.starts_with("<<<") {
1037 return None;
1038 }
1039
1040 let base_pos = self.pos; let start = base_pos + ws_len; let after_arrows = &after_prefix[3..];
1043 let after_arrows_trimmed = after_arrows.trim_start_matches([' ', '\t']);
1044 let arrows_offset =
1045 ws_len + prefix_len + 3 + (after_arrows.len() - after_arrows_trimmed.len());
1046
1047 let (label, is_nowdoc, label_line_end);
1049 if let Some(after_quote) = after_arrows_trimmed.strip_prefix('\'') {
1050 let closing = after_quote.find('\'')?;
1052 label = after_quote[..closing].to_string();
1053 is_nowdoc = true;
1054 let after_label = &after_arrows_trimmed[2 + closing..];
1055 let nl = after_label.find('\n').unwrap_or(after_label.len());
1057 label_line_end = arrows_offset + 2 + closing + nl;
1058 if label_line_end < remaining.len() {
1059 }
1061 } else {
1062 let s = if let Some(after_dquote) = after_arrows_trimmed.strip_prefix('"') {
1064 let closing = after_dquote.find('"')?;
1065 label = after_dquote[..closing].to_string();
1066 &after_dquote[1 + closing..]
1067 } else {
1068 let end = after_arrows_trimmed
1070 .find(|c: char| !c.is_ascii_alphanumeric() && c != '_')
1071 .unwrap_or(after_arrows_trimmed.len());
1072 if end == 0 {
1073 return None;
1074 }
1075 label = after_arrows_trimmed[..end].to_string();
1076 &after_arrows_trimmed[end..]
1077 };
1078 is_nowdoc = false;
1079 let nl = s.find('\n').unwrap_or(s.len());
1080 label_line_end = arrows_offset + (after_arrows_trimmed.len() - s.len()) + nl;
1081 };
1082
1083 if label.is_empty() {
1084 return None;
1085 }
1086
1087 let body_start_in_remaining = if label_line_end < remaining.len() {
1089 label_line_end + 1 } else {
1091 return None; };
1093
1094 let body = &remaining[body_start_in_remaining..];
1095
1096 let mut search_pos = 0;
1098 let end_marker_pos;
1099 loop {
1100 if search_pos >= body.len() {
1101 return None; }
1103 let line_start = search_pos;
1104 let line_end = body[line_start..]
1105 .find('\n')
1106 .map(|p| line_start + p)
1107 .unwrap_or(body.len());
1108 let line = &body[line_start..line_end];
1109 let trimmed_line = line.trim_start_matches([' ', '\t']);
1110
1111 if trimmed_line == label
1113 || trimmed_line.starts_with(&label)
1114 && trimmed_line[label.len()..]
1115 .trim_start_matches(';')
1116 .trim()
1117 .is_empty()
1118 {
1119 end_marker_pos = line_start;
1120 break;
1121 }
1122
1123 search_pos = if line_end < body.len() {
1124 line_end + 1
1125 } else {
1126 body.len()
1127 };
1128 }
1129
1130 let end_marker_line = &body[end_marker_pos..];
1132 let trimmed = end_marker_line.trim_start_matches([' ', '\t']);
1133 let indent_len = end_marker_line.len() - trimmed.len();
1134 let token_end_in_remaining =
1135 body_start_in_remaining + end_marker_pos + indent_len + label.len();
1136 self.pos = base_pos + token_end_in_remaining;
1137
1138 let span = Span::new(start as u32, self.pos as u32);
1139
1140 if is_nowdoc {
1141 Some(Token::new(TokenKind::Nowdoc, span))
1142 } else {
1143 Some(Token::new(TokenKind::Heredoc, span))
1144 }
1145 }
1146}
1147
/// Unit tests: token streams, spans, literal kinds, comment skipping,
/// and mode switching between inline HTML and PHP.
#[cfg(test)]
mod tests {
    use super::*;

    // Lex `source` to completion, including the trailing Eof token.
    fn collect_tokens(source: &str) -> Vec<Token> {
        let mut lexer = Lexer::new(source);
        let mut tokens = Vec::new();
        loop {
            let token = lexer.next_token();
            if token.kind == TokenKind::Eof {
                tokens.push(token);
                break;
            }
            tokens.push(token);
        }
        tokens
    }

    // Like `collect_tokens`, but kinds only.
    fn collect_kinds(source: &str) -> Vec<TokenKind> {
        collect_tokens(source).into_iter().map(|t| t.kind).collect()
    }

    // Lex a PHP snippet (open tag prepended), dropping OpenTag and Eof.
    fn php_kinds(code: &str) -> Vec<TokenKind> {
        let full = format!("<?php {}", code);
        collect_kinds(&full)
            .into_iter()
            .filter(|k| *k != TokenKind::OpenTag && *k != TokenKind::Eof)
            .collect()
    }

    // Lex a PHP snippet, pairing each token kind with its source text.
    fn php_tokens(code: &str) -> Vec<(TokenKind, String)> {
        let full = format!("<?php {}", code);
        let mut lexer = Lexer::new(&full);
        let mut result = Vec::new();
        loop {
            let token = lexer.next_token();
            if token.kind == TokenKind::Eof {
                break;
            }
            if token.kind == TokenKind::OpenTag {
                continue;
            }
            let text = lexer.token_text(&token).to_string();
            result.push((token.kind, text));
        }
        result
    }

    #[test]
    fn test_php_only() {
        let tokens = collect_kinds("<?php $x = 42;");
        assert_eq!(
            tokens,
            vec![
                TokenKind::OpenTag,
                TokenKind::Variable,
                TokenKind::Equals,
                TokenKind::IntLiteral,
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_inline_html_before_php() {
        let tokens = collect_kinds("<html><?php echo 1;");
        assert_eq!(
            tokens,
            vec![
                TokenKind::InlineHtml,
                TokenKind::OpenTag,
                TokenKind::Echo,
                TokenKind::IntLiteral,
                TokenKind::Semicolon,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_inline_html_after_close_tag() {
        let tokens = collect_kinds("<?php echo 1; ?><html>");
        assert_eq!(
            tokens,
            vec![
                TokenKind::OpenTag,
                TokenKind::Echo,
                TokenKind::IntLiteral,
                TokenKind::Semicolon,
                TokenKind::CloseTag,
                TokenKind::InlineHtml,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_keyword_resolution() {
        let tokens = collect_kinds("<?php if else while for foreach function return");
        assert_eq!(
            tokens,
            vec![
                TokenKind::OpenTag,
                TokenKind::If,
                TokenKind::Else,
                TokenKind::While,
                TokenKind::For,
                TokenKind::Foreach,
                TokenKind::Function,
                TokenKind::Return,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_keyword_case_insensitive() {
        let tokens = collect_kinds("<?php IF ELSE TRUE FALSE NULL");
        assert_eq!(
            tokens,
            vec![
                TokenKind::OpenTag,
                TokenKind::If,
                TokenKind::Else,
                TokenKind::True,
                TokenKind::False,
                TokenKind::Null,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_peek_doesnt_consume() {
        let mut lexer = Lexer::new("<?php 42");
        let peeked = lexer.peek().clone();
        assert_eq!(peeked.kind, TokenKind::OpenTag);
        let next = lexer.next_token();
        assert_eq!(next.kind, TokenKind::OpenTag);
        let next = lexer.next_token();
        assert_eq!(next.kind, TokenKind::IntLiteral);
    }

    #[test]
    fn test_token_text() {
        let source = "<?php $myVar = 'hello';";
        let mut lexer = Lexer::new(source);
        lexer.next_token(); // skip the open tag
        let var_tok = lexer.next_token();
        assert_eq!(lexer.token_text(&var_tok), "$myVar");
        lexer.next_token(); // skip '='
        let str_tok = lexer.next_token();
        assert_eq!(lexer.token_text(&str_tok), "'hello'");
    }

    #[test]
    fn test_spans_are_correct() {
        let source = "<?php $x";
        let tokens = collect_tokens(source);
        assert_eq!(tokens[0].span, Span::new(0, 5)); // "<?php"
        assert_eq!(tokens[1].span, Span::new(6, 8)); // "$x"
    }

    #[test]
    fn test_operators() {
        let tokens = collect_kinds("<?php === !== <=> ?? ++ -- **");
        assert_eq!(
            tokens,
            vec![
                TokenKind::OpenTag,
                TokenKind::EqualsEqualsEquals,
                TokenKind::BangEqualsEquals,
                TokenKind::Spaceship,
                TokenKind::QuestionQuestion,
                TokenKind::PlusPlus,
                TokenKind::MinusMinus,
                TokenKind::StarStar,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_string_literals() {
        let tokens = collect_kinds(r#"<?php 'single' "double""#);
        assert_eq!(
            tokens,
            vec![
                TokenKind::OpenTag,
                TokenKind::SingleQuotedString,
                TokenKind::DoubleQuotedString,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_assignment_operators() {
        let tokens = collect_kinds("<?php += -= *= /= %= **= .= ??=");
        assert_eq!(
            tokens,
            vec![
                TokenKind::OpenTag,
                TokenKind::PlusEquals,
                TokenKind::MinusEquals,
                TokenKind::StarEquals,
                TokenKind::SlashEquals,
                TokenKind::PercentEquals,
                TokenKind::StarStarEquals,
                TokenKind::DotEquals,
                TokenKind::CoalesceEquals,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_logical_keywords() {
        let tokens = collect_kinds("<?php and or xor");
        assert_eq!(
            tokens,
            vec![
                TokenKind::OpenTag,
                TokenKind::And,
                TokenKind::Or,
                TokenKind::Xor,
                TokenKind::Eof,
            ]
        );
    }

    #[test]
    fn test_empty_source() {
        let tokens = collect_kinds("");
        assert_eq!(tokens, vec![TokenKind::Eof]);
    }

    #[test]
    fn test_only_inline_html() {
        let tokens = collect_kinds("<html><body>Hello</body></html>");
        assert_eq!(tokens, vec![TokenKind::InlineHtml, TokenKind::Eof]);
    }

    #[test]
    fn test_basic_operators() {
        assert_eq!(
            php_kinds("+ - * / % ** ."),
            vec![
                TokenKind::Plus,
                TokenKind::Minus,
                TokenKind::Star,
                TokenKind::Slash,
                TokenKind::Percent,
                TokenKind::StarStar,
                TokenKind::Dot,
            ]
        );
    }

    #[test]
    fn test_integers() {
        let toks = php_tokens("42 0xFF 0b1010 077");
        assert_eq!(toks[0], (TokenKind::IntLiteral, "42".to_string()));
        assert_eq!(toks[1], (TokenKind::HexIntLiteral, "0xFF".to_string()));
        assert_eq!(toks[2], (TokenKind::BinIntLiteral, "0b1010".to_string()));
        assert_eq!(toks[3], (TokenKind::OctIntLiteral, "077".to_string()));
    }

    #[test]
    fn test_floats() {
        let toks = php_tokens("3.14 1e10 2.5e-3");
        assert_eq!(toks[0], (TokenKind::FloatLiteralSimple, "3.14".to_string()));
        assert_eq!(toks[1], (TokenKind::FloatLiteral, "1e10".to_string()));
        assert_eq!(toks[2], (TokenKind::FloatLiteral, "2.5e-3".to_string()));
    }

    #[test]
    fn test_strings() {
        let kinds = php_kinds(r#"'hello' "world" 'it\'s' "say \"hi\"""#);
        assert_eq!(
            kinds,
            vec![
                TokenKind::SingleQuotedString,
                TokenKind::DoubleQuotedString,
                TokenKind::SingleQuotedString,
                TokenKind::DoubleQuotedString,
            ]
        );
    }

    #[test]
    fn test_variables() {
        let toks = php_tokens("$x $myVar $_foo");
        assert_eq!(toks[0], (TokenKind::Variable, "$x".to_string()));
        assert_eq!(toks[1], (TokenKind::Variable, "$myVar".to_string()));
        assert_eq!(toks[2], (TokenKind::Variable, "$_foo".to_string()));
    }

    #[test]
    fn test_comments_skipped() {
        let toks = php_tokens("42 // line comment\n43 /* block */ 44 # hash comment\n45");
        assert_eq!(toks[0], (TokenKind::IntLiteral, "42".to_string()));
        assert_eq!(toks[1], (TokenKind::IntLiteral, "43".to_string()));
        assert_eq!(toks[2], (TokenKind::IntLiteral, "44".to_string()));
        assert_eq!(toks[3], (TokenKind::IntLiteral, "45".to_string()));
    }

    #[test]
    fn test_float_leading_dot() {
        let toks = php_tokens(".5 .123e4");
        assert_eq!(
            toks[0],
            (TokenKind::FloatLiteralLeadingDot, ".5".to_string())
        );
        assert_eq!(
            toks[1],
            (TokenKind::FloatLiteralLeadingDot, ".123e4".to_string())
        );
    }

    #[test]
    fn test_new_octal_syntax() {
        let toks = php_tokens("0o77 0O755");
        assert_eq!(toks[0], (TokenKind::OctIntLiteralNew, "0o77".to_string()));
        assert_eq!(toks[1], (TokenKind::OctIntLiteralNew, "0O755".to_string()));
    }

    #[test]
    fn test_numeric_underscores() {
        let toks = php_tokens("1_000 0xFF_FF 0b1010_0101");
        assert_eq!(toks[0], (TokenKind::IntLiteral, "1_000".to_string()));
        assert_eq!(toks[1], (TokenKind::HexIntLiteral, "0xFF_FF".to_string()));
        assert_eq!(
            toks[2],
            (TokenKind::BinIntLiteral, "0b1010_0101".to_string())
        );
    }

    #[test]
    fn test_binary_prefix_strings() {
        let kinds = php_kinds(r#"b'hello' B"world""#);
        assert_eq!(
            kinds,
            vec![TokenKind::SingleQuotedString, TokenKind::DoubleQuotedString,]
        );
    }

    #[test]
    fn test_hash_bracket_not_comment() {
        let kinds = php_kinds("#[Attribute]");
        assert_eq!(
            kinds,
            vec![
                TokenKind::HashBracket,
                TokenKind::Identifier,
                TokenKind::RightBracket,
            ]
        );
    }

    #[test]
    fn test_nullsafe_arrow() {
        let kinds = php_kinds("$x?->y");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Variable,
                TokenKind::NullsafeArrow,
                TokenKind::Identifier,
            ]
        );
    }

    #[test]
    fn test_pipe_arrow() {
        let kinds = php_kinds("$x |> foo(...)");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Variable,
                TokenKind::PipeArrow,
                TokenKind::Identifier,
                TokenKind::LeftParen,
                TokenKind::Ellipsis,
                TokenKind::RightParen,
            ]
        );
    }
}