1use memchr::{memchr2, memmem};
2use php_ast::Span;
3
4use crate::token::{resolve_keyword, TokenKind};
5
/// Builds a 256-entry lookup table marking the bytes PHP treats as
/// whitespace: space, tab, CR, LF, and form feed.
const fn make_whitespace_table() -> [bool; 256] {
    let mut table = [false; 256];
    table[b' ' as usize] = true;
    table[b'\t' as usize] = true;
    table[b'\r' as usize] = true;
    table[b'\n' as usize] = true;
    // Form feed has no byte-escape literal in Rust, so use the raw value.
    table[0x0C] = true;
    table
}
22
/// Builds a lookup table for bytes that may begin an identifier:
/// ASCII letters, underscore, and any non-ASCII byte (>= 0x80).
const fn make_ident_start_table() -> [bool; 256] {
    let mut table = [false; 256];
    let mut i = 0usize;
    while i < 256 {
        table[i] = matches!(i as u8, b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF);
        i += 1;
    }
    table
}
33
/// Builds a lookup table for bytes that may continue an identifier:
/// ASCII letters and digits, underscore, and any non-ASCII byte (>= 0x80).
const fn make_ident_continue_table() -> [bool; 256] {
    let mut table = [false; 256];
    let mut i = 0usize;
    while i < 256 {
        table[i] = matches!(
            i as u8,
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | 0x80..=0xFF
        );
        i += 1;
    }
    table
}
48
// Byte-indexed classification tables, built once at compile time.
static IS_PHP_WHITESPACE: [bool; 256] = make_whitespace_table();
static IS_IDENT_START: [bool; 256] = make_ident_start_table();
static IS_IDENT_CONTINUE: [bool; 256] = make_ident_continue_table();
52
/// A problem found while lexing; lexing continues after recording one.
#[derive(Debug, Clone, PartialEq)]
pub struct LexerError {
    /// Human-readable description of the problem.
    pub message: String,
    /// Byte range in the source the error applies to.
    pub span: Span,
}
58
/// A single lexed token: its kind plus the byte range it occupies.
/// Token text is not stored; retrieve it via `Lexer::token_text`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}
64
65impl Token {
66 pub fn new(kind: TokenKind, span: Span) -> Self {
67 Self { kind, span }
68 }
69
70 pub fn eof(offset: u32) -> Self {
71 Self {
72 kind: TokenKind::Eof,
73 span: Span::new(offset, offset),
74 }
75 }
76}
77
/// Which sub-lexer is active: raw HTML passthrough outside PHP tags,
/// or PHP token scanning inside them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LexerMode {
    InlineHtml,
    Php,
}
83
/// A PHP lexer over a borrowed source string.
pub struct Lexer<'src> {
    source: &'src str,
    // Current sub-lexer (inline HTML vs. PHP code).
    mode: LexerMode,
    // Byte offset of the next unread character.
    pos: usize,
    // One- and two-token lookahead buffers, filled by `peek`/`peek2`.
    peeked: Option<Token>,
    peeked2: Option<Token>,
    /// Errors accumulated while lexing; lexing continues after errors.
    pub errors: Vec<LexerError>,
}
92
/// True if `b` may start an identifier (letter, `_`, or any byte >= 0x80).
#[inline(always)]
fn is_ident_start(b: u8) -> bool {
    IS_IDENT_START[b as usize]
}

/// True if `b` may continue an identifier (letter, digit, `_`, or >= 0x80).
#[inline(always)]
fn is_ident_continue(b: u8) -> bool {
    IS_IDENT_CONTINUE[b as usize]
}
102
103impl<'src> Lexer<'src> {
104 pub fn new(source: &'src str) -> Self {
105 let pos = if source.starts_with("#!") {
107 source.find('\n').map(|p| p + 1).unwrap_or(source.len())
108 } else {
109 0
110 };
111
112 let remaining = &source[pos..];
114 let mode = if remaining.starts_with("<?php") || remaining.starts_with("<?=") {
115 LexerMode::Php
116 } else {
117 LexerMode::InlineHtml
118 };
119
120 Self {
121 source,
122 mode,
123 pos,
124 peeked: None,
125 peeked2: None,
126 errors: Vec::new(),
127 }
128 }
129
    /// Creates a lexer that starts at byte `offset`, already in PHP mode.
    ///
    /// NOTE(review): the caller is presumably responsible for ensuring
    /// `offset` points inside PHP code (not HTML) and on a char boundary —
    /// confirm against callers.
    pub fn new_at(source: &'src str, offset: usize) -> Self {
        Self {
            source,
            mode: LexerMode::Php,
            pos: offset,
            peeked: None,
            peeked2: None,
            errors: Vec::new(),
        }
    }
144
145 pub fn source(&self) -> &'src str {
146 self.source
147 }
148
149 pub fn peek(&mut self) -> &Token {
150 if self.peeked.is_none() {
151 self.peeked = Some(self.read_next_token());
152 }
153 self.peeked.as_ref().expect("peeked is Some: set above")
154 }
155
156 pub fn peek2(&mut self) -> &Token {
158 if self.peeked.is_none() {
160 self.peeked = Some(self.read_next_token());
161 }
162 if self.peeked2.is_none() {
163 self.peeked2 = Some(self.read_next_token());
164 }
165 self.peeked2.as_ref().expect("peeked2 is Some: set above")
166 }
167
168 pub fn next_token(&mut self) -> Token {
169 if let Some(token) = self.peeked.take() {
170 self.peeked = self.peeked2.take();
171 return token;
172 }
173 self.read_next_token()
174 }
175
176 pub fn token_text(&self, token: &Token) -> &'src str {
178 &self.source[token.span.start as usize..token.span.end as usize]
179 }
180
181 fn read_next_token(&mut self) -> Token {
182 if self.pos >= self.source.len() {
183 return Token::eof(self.source.len() as u32);
184 }
185
186 match self.mode {
187 LexerMode::InlineHtml => self.lex_inline_html(),
188 LexerMode::Php => self.lex_php(),
189 }
190 }
191
192 fn lex_inline_html(&mut self) -> Token {
193 let start = self.pos;
194 let bytes = self.source.as_bytes();
195
196 let mut search = self.pos;
200 let tag_pos = loop {
201 match memchr::memchr(b'<', &bytes[search..]) {
202 None => break None,
203 Some(offset) => {
204 let p = search + offset;
205 let rest = &bytes[p..];
206 if rest.starts_with(b"<?php") || rest.starts_with(b"<?=") {
207 break Some(p - self.pos);
208 }
209 search = p + 1;
210 }
211 }
212 };
213
214 if let Some(tag_pos) = tag_pos {
215 if tag_pos == 0 {
216 self.mode = LexerMode::Php;
218 return self.lex_php();
219 }
220 let end = self.pos + tag_pos;
222 self.pos = end;
223 self.mode = LexerMode::Php;
224 Token::new(TokenKind::InlineHtml, Span::new(start as u32, end as u32))
225 } else {
226 let end = self.source.len();
228 self.pos = end;
229 Token::new(TokenKind::InlineHtml, Span::new(start as u32, end as u32))
230 }
231 }
232
233 fn lex_php(&mut self) -> Token {
234 let remaining = &self.source[self.pos..];
235
236 if let Some(token) = self.try_lex_heredoc(remaining) {
238 return token;
239 }
240
241 self.skip_whitespace_and_comments();
243
244 if self.pos >= self.source.len() {
245 return Token::eof(self.source.len() as u32);
246 }
247
248 self.scan_token()
249 }
250
251 fn skip_whitespace_and_comments(&mut self) {
253 let bytes = self.source.as_bytes();
254 loop {
255 while self.pos < bytes.len() && IS_PHP_WHITESPACE[bytes[self.pos] as usize] {
257 self.pos += 1;
258 }
259
260 if self.pos >= bytes.len() {
261 break;
262 }
263
264 if bytes[self.pos] == b'/' && self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'/'
267 {
268 self.pos += 2;
269 Self::skip_line_comment_body(bytes, &mut self.pos);
270 continue;
271 }
272
273 if bytes[self.pos] == b'/' && self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'*'
275 {
276 self.pos += 2;
277 match memmem::find(&bytes[self.pos..], b"*/") {
278 Some(end) => self.pos += end + 2,
279 None => self.pos = bytes.len(), }
281 continue;
282 }
283
284 if bytes[self.pos] == b'#'
287 && !(self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'[')
288 {
289 self.pos += 1;
290 Self::skip_line_comment_body(bytes, &mut self.pos);
291 continue;
292 }
293
294 break;
295 }
296 }
297
298 fn scan_token(&mut self) -> Token {
300 let start = self.pos;
301 let bytes = self.source.as_bytes();
302 let b = bytes[start];
303
304 match b {
305 b'+' => {
307 if self.check_at(1, b'+') {
308 self.pos = start + 2;
309 self.tok(TokenKind::PlusPlus, start)
310 } else if self.check_at(1, b'=') {
311 self.pos = start + 2;
312 self.tok(TokenKind::PlusEquals, start)
313 } else {
314 self.pos = start + 1;
315 self.tok(TokenKind::Plus, start)
316 }
317 }
318 b'-' => {
319 if self.check_at(1, b'-') {
320 self.pos = start + 2;
321 self.tok(TokenKind::MinusMinus, start)
322 } else if self.check_at(1, b'=') {
323 self.pos = start + 2;
324 self.tok(TokenKind::MinusEquals, start)
325 } else if self.check_at(1, b'>') {
326 self.pos = start + 2;
327 self.tok(TokenKind::Arrow, start)
328 } else {
329 self.pos = start + 1;
330 self.tok(TokenKind::Minus, start)
331 }
332 }
333 b'*' => {
334 if self.check_at(1, b'*') {
335 if self.check_at(2, b'=') {
336 self.pos = start + 3;
337 self.tok(TokenKind::StarStarEquals, start)
338 } else {
339 self.pos = start + 2;
340 self.tok(TokenKind::StarStar, start)
341 }
342 } else if self.check_at(1, b'=') {
343 self.pos = start + 2;
344 self.tok(TokenKind::StarEquals, start)
345 } else {
346 self.pos = start + 1;
347 self.tok(TokenKind::Star, start)
348 }
349 }
350 b'/' => {
351 if self.check_at(1, b'=') {
353 self.pos = start + 2;
354 self.tok(TokenKind::SlashEquals, start)
355 } else {
356 self.pos = start + 1;
357 self.tok(TokenKind::Slash, start)
358 }
359 }
360 b'%' => {
361 if self.check_at(1, b'=') {
362 self.pos = start + 2;
363 self.tok(TokenKind::PercentEquals, start)
364 } else {
365 self.pos = start + 1;
366 self.tok(TokenKind::Percent, start)
367 }
368 }
369 b'.' => {
370 if start + 1 < bytes.len() && bytes[start + 1].is_ascii_digit() {
372 self.pos = start + 1;
373 self.scan_digits(u8::is_ascii_digit);
374 if self.pos < bytes.len() && matches!(bytes[self.pos], b'e' | b'E') {
376 self.try_scan_exponent();
377 }
378 if self.pos < bytes.len() && bytes[self.pos] == b'_' {
380 self.consume_invalid_numeric_rest();
381 return self.invalid_numeric(start);
382 }
383 return self.tok(TokenKind::FloatLiteralLeadingDot, start);
384 }
385 if self.check_at(1, b'.') && self.check_at(2, b'.') {
386 self.pos = start + 3;
387 self.tok(TokenKind::Ellipsis, start)
388 } else if self.check_at(1, b'=') {
389 self.pos = start + 2;
390 self.tok(TokenKind::DotEquals, start)
391 } else {
392 self.pos = start + 1;
393 self.tok(TokenKind::Dot, start)
394 }
395 }
396 b'=' => {
397 if self.check_at(1, b'=') {
398 if self.check_at(2, b'=') {
399 self.pos = start + 3;
400 self.tok(TokenKind::EqualsEqualsEquals, start)
401 } else {
402 self.pos = start + 2;
403 self.tok(TokenKind::EqualsEquals, start)
404 }
405 } else if self.check_at(1, b'>') {
406 self.pos = start + 2;
407 self.tok(TokenKind::FatArrow, start)
408 } else {
409 self.pos = start + 1;
410 self.tok(TokenKind::Equals, start)
411 }
412 }
413 b'!' => {
414 if self.check_at(1, b'=') {
415 if self.check_at(2, b'=') {
416 self.pos = start + 3;
417 self.tok(TokenKind::BangEqualsEquals, start)
418 } else {
419 self.pos = start + 2;
420 self.tok(TokenKind::BangEquals, start)
421 }
422 } else {
423 self.pos = start + 1;
424 self.tok(TokenKind::Bang, start)
425 }
426 }
427 b'<' => self.scan_less_than(start),
428 b'>' => {
429 if self.check_at(1, b'>') {
430 if self.check_at(2, b'=') {
431 self.pos = start + 3;
432 self.tok(TokenKind::ShiftRightEquals, start)
433 } else {
434 self.pos = start + 2;
435 self.tok(TokenKind::ShiftRight, start)
436 }
437 } else if self.check_at(1, b'=') {
438 self.pos = start + 2;
439 self.tok(TokenKind::GreaterThanEquals, start)
440 } else {
441 self.pos = start + 1;
442 self.tok(TokenKind::GreaterThan, start)
443 }
444 }
445 b'&' => {
446 if self.check_at(1, b'&') {
447 self.pos = start + 2;
448 self.tok(TokenKind::AmpersandAmpersand, start)
449 } else if self.check_at(1, b'=') {
450 self.pos = start + 2;
451 self.tok(TokenKind::AmpersandEquals, start)
452 } else {
453 self.pos = start + 1;
454 self.tok(TokenKind::Ampersand, start)
455 }
456 }
457 b'|' => {
458 if self.check_at(1, b'|') {
459 self.pos = start + 2;
460 self.tok(TokenKind::PipePipe, start)
461 } else if self.check_at(1, b'=') {
462 self.pos = start + 2;
463 self.tok(TokenKind::PipeEquals, start)
464 } else if self.check_at(1, b'>') {
465 self.pos = start + 2;
466 self.tok(TokenKind::PipeArrow, start)
467 } else {
468 self.pos = start + 1;
469 self.tok(TokenKind::Pipe, start)
470 }
471 }
472 b'^' => {
473 if self.check_at(1, b'=') {
474 self.pos = start + 2;
475 self.tok(TokenKind::CaretEquals, start)
476 } else {
477 self.pos = start + 1;
478 self.tok(TokenKind::Caret, start)
479 }
480 }
481 b'~' => {
482 self.pos = start + 1;
483 self.tok(TokenKind::Tilde, start)
484 }
485 b'?' => {
486 if self.check_at(1, b'>') {
487 self.pos = start + 2;
488 self.mode = LexerMode::InlineHtml;
489 self.tok(TokenKind::CloseTag, start)
490 } else if self.check_at(1, b'?') {
491 if self.check_at(2, b'=') {
492 self.pos = start + 3;
493 self.tok(TokenKind::CoalesceEquals, start)
494 } else {
495 self.pos = start + 2;
496 self.tok(TokenKind::QuestionQuestion, start)
497 }
498 } else if self.check_at(1, b'-') && self.check_at(2, b'>') {
499 self.pos = start + 3;
500 self.tok(TokenKind::NullsafeArrow, start)
501 } else {
502 self.pos = start + 1;
503 self.tok(TokenKind::Question, start)
504 }
505 }
506 b':' => {
507 if self.check_at(1, b':') {
508 self.pos = start + 2;
509 self.tok(TokenKind::DoubleColon, start)
510 } else {
511 self.pos = start + 1;
512 self.tok(TokenKind::Colon, start)
513 }
514 }
515 b'@' => {
516 self.pos = start + 1;
517 self.tok(TokenKind::At, start)
518 }
519 b'\\' => {
520 self.pos = start + 1;
521 self.tok(TokenKind::Backslash, start)
522 }
523 b'#' => {
524 if self.check_at(1, b'[') {
527 self.pos = start + 2;
528 self.tok(TokenKind::HashBracket, start)
529 } else {
530 self.pos = start + 1;
532 self.read_next_token()
533 }
534 }
535
536 b'(' => {
538 self.pos = start + 1;
539 self.tok(TokenKind::LeftParen, start)
540 }
541 b')' => {
542 self.pos = start + 1;
543 self.tok(TokenKind::RightParen, start)
544 }
545 b'[' => {
546 self.pos = start + 1;
547 self.tok(TokenKind::LeftBracket, start)
548 }
549 b']' => {
550 self.pos = start + 1;
551 self.tok(TokenKind::RightBracket, start)
552 }
553 b'{' => {
554 self.pos = start + 1;
555 self.tok(TokenKind::LeftBrace, start)
556 }
557 b'}' => {
558 self.pos = start + 1;
559 self.tok(TokenKind::RightBrace, start)
560 }
561 b';' => {
562 self.pos = start + 1;
563 self.tok(TokenKind::Semicolon, start)
564 }
565 b',' => {
566 self.pos = start + 1;
567 self.tok(TokenKind::Comma, start)
568 }
569
570 b'\'' => self.scan_single_quoted_string(),
572 b'"' => self.scan_double_quoted_string(),
573 b'`' => self.scan_backtick_string(),
574
575 b'$' => {
577 if start + 1 < bytes.len() && is_ident_start(bytes[start + 1]) {
578 self.pos = start + 2;
579 while self.pos < bytes.len() && is_ident_continue(bytes[self.pos]) {
580 self.pos += 1;
581 }
582 self.tok(TokenKind::Variable, start)
583 } else {
584 self.pos = start + 1;
585 self.tok(TokenKind::Dollar, start)
586 }
587 }
588
589 b'0'..=b'9' => self.scan_number(),
591
592 _ if is_ident_start(b) => {
594 if b == b'b' || b == b'B' {
596 if self.check_at(1, b'\'') {
597 return self.scan_single_quoted_string();
598 }
599 if self.check_at(1, b'"') {
600 return self.scan_double_quoted_string();
601 }
602 if self.check_at(1, b'<') && self.check_at(2, b'<') && self.check_at(3, b'<') {
603 let remaining = &self.source[self.pos..];
604 if let Some(token) = self.try_lex_heredoc(remaining) {
605 return token;
606 }
607 }
608 }
609 self.scan_identifier()
610 }
611
612 _ => {
614 self.pos = start + 1;
615 self.read_next_token()
616 }
617 }
618 }
619
620 fn scan_less_than(&mut self, start: usize) -> Token {
622 if self.check_at(1, b'<') {
623 if self.check_at(2, b'<') {
624 let remaining = &self.source[self.pos..];
626 if let Some(token) = self.try_lex_heredoc(remaining) {
627 return token;
628 }
629 }
631 if self.check_at(2, b'=') {
632 self.pos = start + 3;
633 return self.tok(TokenKind::ShiftLeftEquals, start);
634 }
635 self.pos = start + 2;
636 return self.tok(TokenKind::ShiftLeft, start);
637 }
638 if self.check_at(1, b'=') {
639 if self.check_at(2, b'>') {
640 self.pos = start + 3;
641 return self.tok(TokenKind::Spaceship, start);
642 }
643 self.pos = start + 2;
644 return self.tok(TokenKind::LessThanEquals, start);
645 }
646 if self.check_at(1, b'?') {
647 if self.source[self.pos..].starts_with("<?php") {
648 self.pos = start + 5;
649 return self.tok(TokenKind::OpenTag, start);
650 }
651 if self.source[self.pos..].starts_with("<?=") {
652 self.pos = start + 3;
653 return self.tok(TokenKind::OpenTag, start);
654 }
655 }
656 self.pos = start + 1;
657 self.tok(TokenKind::LessThan, start)
658 }
659
660 fn scan_single_quoted_string(&mut self) -> Token {
663 let start = self.pos;
664 let bytes = self.source.as_bytes();
665 let mut p = self.pos;
666 if bytes[p] == b'b' || bytes[p] == b'B' {
668 p += 1;
669 }
670 p += 1; loop {
672 match memchr2(b'\\', b'\'', &bytes[p..]) {
673 None => {
674 self.pos = start + 1;
676 return self.read_next_token();
677 }
678 Some(offset) => {
679 p += offset;
680 match bytes[p] {
681 b'\\' => {
682 p += 1;
683 if p < bytes.len() {
684 p += 1;
685 }
686 }
687 _ => {
688 p += 1;
690 break;
691 }
692 }
693 }
694 }
695 }
696 self.pos = p;
697 self.tok(TokenKind::SingleQuotedString, start)
698 }
699
700 fn scan_double_quoted_string(&mut self) -> Token {
701 let start = self.pos;
702 let bytes = self.source.as_bytes();
703 let mut p = self.pos;
704 if bytes[p] == b'b' || bytes[p] == b'B' {
706 p += 1;
707 }
708 p += 1; loop {
710 match memchr2(b'\\', b'"', &bytes[p..]) {
711 None => {
712 self.pos = start + 1;
714 return self.read_next_token();
715 }
716 Some(offset) => {
717 p += offset;
718 match bytes[p] {
719 b'\\' => {
720 p += 1;
721 if p < bytes.len() {
722 p += 1;
723 }
724 }
725 _ => {
726 p += 1;
728 break;
729 }
730 }
731 }
732 }
733 }
734 self.pos = p;
735 self.tok(TokenKind::DoubleQuotedString, start)
736 }
737
738 fn scan_backtick_string(&mut self) -> Token {
739 let start = self.pos;
740 let bytes = self.source.as_bytes();
741 let mut p = self.pos;
742 p += 1; loop {
744 match memchr2(b'\\', b'`', &bytes[p..]) {
745 None => {
746 self.pos = start + 1;
748 return self.read_next_token();
749 }
750 Some(offset) => {
751 p += offset;
752 match bytes[p] {
753 b'\\' => {
754 p += 1;
755 if p < bytes.len() {
756 p += 1;
757 }
758 }
759 _ => {
760 p += 1;
762 break;
763 }
764 }
765 }
766 }
767 }
768 self.pos = p;
769 self.tok(TokenKind::BacktickString, start)
770 }
771
772 fn scan_number(&mut self) -> Token {
775 let start = self.pos;
776 let bytes = self.source.as_bytes();
777
778 if bytes[start] == b'0' && start + 1 < bytes.len() {
780 match bytes[start + 1] {
781 b'x' | b'X' => {
782 self.pos = start + 2;
783 if self.pos < bytes.len() && bytes[self.pos] == b'_' {
784 self.consume_invalid_numeric_rest();
785 return self.invalid_numeric(start);
786 }
787 if self.scan_digits(u8::is_ascii_hexdigit) {
788 if self.pos < bytes.len() && bytes[self.pos] == b'_' {
789 self.consume_invalid_numeric_rest();
790 return self.invalid_numeric(start);
791 }
792 return self.tok(TokenKind::HexIntLiteral, start);
793 }
794 self.pos = start;
796 }
797 b'b' | b'B' => {
798 self.pos = start + 2;
799 if self.pos < bytes.len() && bytes[self.pos] == b'_' {
800 self.consume_invalid_numeric_rest();
801 return self.invalid_numeric(start);
802 }
803 if self.scan_digits(|b| b == &b'0' || b == &b'1') {
804 if self.pos < bytes.len() && bytes[self.pos] == b'_' {
805 self.consume_invalid_numeric_rest();
806 return self.invalid_numeric(start);
807 }
808 return self.tok(TokenKind::BinIntLiteral, start);
809 }
810 self.pos = start;
812 }
813 b'o' | b'O' => {
814 self.pos = start + 2;
815 if self.pos < bytes.len() && bytes[self.pos] == b'_' {
816 self.consume_invalid_numeric_rest();
817 return self.invalid_numeric(start);
818 }
819 if self.scan_digits(|b| (b'0'..=b'7').contains(b)) {
820 if self.pos < bytes.len() && bytes[self.pos] == b'_' {
821 self.consume_invalid_numeric_rest();
822 return self.invalid_numeric(start);
823 }
824 return self.tok(TokenKind::OctIntLiteralNew, start);
825 }
826 self.pos = start;
828 }
829 _ => {}
830 }
831 }
832
833 self.pos = start;
835 self.scan_digits(u8::is_ascii_digit);
836 let integer_end = self.pos;
837 let mut kind = TokenKind::IntLiteral;
838
839 if bytes[start] == b'0' && integer_end > start + 1 {
841 let slice = &bytes[start..integer_end];
842 if slice.iter().all(|&b| (b'0'..=b'7').contains(&b)) {
843 kind = TokenKind::OctIntLiteral;
844 }
845 }
846
847 if self.pos < bytes.len() && bytes[self.pos] == b'.' {
849 if self.pos + 1 < bytes.len() && bytes[self.pos + 1].is_ascii_digit() {
850 self.pos += 1; self.scan_digits(u8::is_ascii_digit);
853 kind = TokenKind::FloatLiteralSimple;
854 } else if self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'_' {
855 self.consume_invalid_numeric_rest();
857 return self.invalid_numeric(start);
858 } else if self.pos + 1 >= bytes.len() || bytes[self.pos + 1] != b'.' {
859 self.pos += 1; kind = TokenKind::IntLiteral; }
863 }
864
865 if self.pos < bytes.len() && matches!(bytes[self.pos], b'e' | b'E') {
867 if self.try_scan_exponent() {
868 kind = TokenKind::FloatLiteral;
869 } else if self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'_' {
870 self.consume_invalid_numeric_rest();
872 return self.invalid_numeric(start);
873 }
874 }
875
876 if self.pos < bytes.len() && bytes[self.pos] == b'_' {
878 self.consume_invalid_numeric_rest();
879 return self.invalid_numeric(start);
880 }
881
882 self.tok(kind, start)
883 }
884
885 fn scan_digits(&mut self, is_valid: fn(&u8) -> bool) -> bool {
888 let bytes = self.source.as_bytes();
889 if self.pos >= bytes.len() || !is_valid(&bytes[self.pos]) {
890 return false;
891 }
892 self.pos += 1;
893 loop {
894 if self.pos >= bytes.len() {
895 break;
896 }
897 if is_valid(&bytes[self.pos]) {
898 self.pos += 1;
899 } else if bytes[self.pos] == b'_'
900 && self.pos + 1 < bytes.len()
901 && is_valid(&bytes[self.pos + 1])
902 {
903 self.pos += 2;
904 } else {
905 break;
906 }
907 }
908 true
909 }
910
911 fn try_scan_exponent(&mut self) -> bool {
914 let bytes = self.source.as_bytes();
915 let saved = self.pos;
916 self.pos += 1; if self.pos < bytes.len() && matches!(bytes[self.pos], b'+' | b'-') {
920 self.pos += 1;
921 }
922
923 if self.scan_digits(u8::is_ascii_digit) {
925 true
926 } else {
927 self.pos = saved;
928 false
929 }
930 }
931
932 fn scan_identifier(&mut self) -> Token {
935 let start = self.pos;
936 let bytes = self.source.as_bytes();
937 self.pos += 1; while self.pos < bytes.len() && is_ident_continue(bytes[self.pos]) {
939 self.pos += 1;
940 }
941 let text = &self.source[start..self.pos];
942 let kind = resolve_keyword(text).unwrap_or(TokenKind::Identifier);
943 self.tok(kind, start)
944 }
945
946 #[inline]
952 fn skip_line_comment_body(bytes: &[u8], pos: &mut usize) {
953 loop {
954 match memchr2(b'\n', b'?', &bytes[*pos..]) {
955 None => {
956 *pos = bytes.len();
957 return;
958 }
959 Some(offset) => {
960 let p = *pos + offset;
961 if bytes[p] == b'\n' {
962 *pos = p; return;
964 }
965 if p + 1 < bytes.len() && bytes[p + 1] == b'>' {
967 *pos = p; return;
969 }
970 *pos = p + 1;
972 }
973 }
974 }
975 }
976
977 #[inline]
978 fn check_at(&self, offset: usize, expected: u8) -> bool {
979 self.source.as_bytes().get(self.pos + offset) == Some(&expected)
980 }
981
982 #[inline]
983 fn tok(&self, kind: TokenKind, start: usize) -> Token {
984 Token::new(kind, Span::new(start as u32, self.pos as u32))
985 }
986
987 fn invalid_numeric(&mut self, start: usize) -> Token {
988 let span = Span::new(start as u32, self.pos as u32);
989 self.errors.push(LexerError {
990 message: "Invalid numeric literal".to_string(),
991 span,
992 });
993 Token::new(TokenKind::InvalidNumericLiteral, span)
994 }
995
996 fn consume_invalid_numeric_rest(&mut self) {
998 let bytes = self.source.as_bytes();
999 while self.pos < bytes.len() {
1000 let b = bytes[self.pos];
1001 if b.is_ascii_alphanumeric() || b == b'_' || b == b'.' || b == b'+' || b == b'-' {
1002 if (b == b'+' || b == b'-') && self.pos > 0 {
1004 let prev = bytes[self.pos - 1];
1005 if prev != b'e' && prev != b'E' {
1006 break;
1007 }
1008 }
1009 self.pos += 1;
1010 } else {
1011 break;
1012 }
1013 }
1014 }
1015
1016 fn try_lex_heredoc(&mut self, remaining: &str) -> Option<Token> {
1020 let trimmed = remaining.trim_start_matches(|c: char| {
1022 c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\x0C'
1023 });
1024 let ws_len = remaining.len() - trimmed.len();
1025
1026 let (after_prefix, prefix_len) = if (trimmed.starts_with("b<<<")
1028 || trimmed.starts_with("B<<<"))
1029 && !trimmed[1..].starts_with("<<<>")
1030 {
1031 (&trimmed[1..], 1)
1032 } else {
1033 (trimmed, 0)
1034 };
1035
1036 if !after_prefix.starts_with("<<<") {
1037 return None;
1038 }
1039
1040 let base_pos = self.pos; let start = base_pos + ws_len; let after_arrows = &after_prefix[3..];
1043 let after_arrows_trimmed = after_arrows.trim_start_matches([' ', '\t']);
1044 let arrows_offset =
1045 ws_len + prefix_len + 3 + (after_arrows.len() - after_arrows_trimmed.len());
1046
1047 let (label, is_nowdoc, label_line_end);
1049 if let Some(after_quote) = after_arrows_trimmed.strip_prefix('\'') {
1050 let closing = after_quote.find('\'')?;
1052 label = after_quote[..closing].to_string();
1053 is_nowdoc = true;
1054 let after_label = &after_arrows_trimmed[2 + closing..];
1055 let nl = after_label.find('\n').unwrap_or(after_label.len());
1057 label_line_end = arrows_offset + 2 + closing + nl;
1058 if label_line_end < remaining.len() {
1059 }
1061 } else {
1062 let s = if let Some(after_dquote) = after_arrows_trimmed.strip_prefix('"') {
1064 let closing = after_dquote.find('"')?;
1065 label = after_dquote[..closing].to_string();
1066 &after_dquote[1 + closing..]
1067 } else {
1068 let end = after_arrows_trimmed
1070 .find(|c: char| !c.is_ascii_alphanumeric() && c != '_')
1071 .unwrap_or(after_arrows_trimmed.len());
1072 if end == 0 {
1073 return None;
1074 }
1075 label = after_arrows_trimmed[..end].to_string();
1076 &after_arrows_trimmed[end..]
1077 };
1078 is_nowdoc = false;
1079 let nl = s.find('\n').unwrap_or(s.len());
1080 label_line_end = arrows_offset + (after_arrows_trimmed.len() - s.len()) + nl;
1081 };
1082
1083 if label.is_empty() {
1084 return None;
1085 }
1086
1087 let body_start_in_remaining = if label_line_end < remaining.len() {
1089 label_line_end + 1 } else {
1091 return None; };
1093
1094 let body = &remaining[body_start_in_remaining..];
1095
1096 let mut search_pos = 0;
1098 let end_marker_pos;
1099 loop {
1100 if search_pos >= body.len() {
1101 return None; }
1103 let line_start = search_pos;
1104 let line_end = body[line_start..]
1105 .find('\n')
1106 .map(|p| line_start + p)
1107 .unwrap_or(body.len());
1108 let line = &body[line_start..line_end];
1109 let trimmed_line = line.trim_start_matches([' ', '\t']);
1110
1111 if trimmed_line == label
1113 || trimmed_line.starts_with(&label)
1114 && trimmed_line[label.len()..]
1115 .trim_start_matches(';')
1116 .trim()
1117 .is_empty()
1118 {
1119 end_marker_pos = line_start;
1120 break;
1121 }
1122
1123 search_pos = if line_end < body.len() {
1124 line_end + 1
1125 } else {
1126 body.len()
1127 };
1128 }
1129
1130 let end_marker_line = &body[end_marker_pos..];
1132 let trimmed = end_marker_line.trim_start_matches([' ', '\t']);
1133 let indent_len = end_marker_line.len() - trimmed.len();
1134 let token_end_in_remaining =
1135 body_start_in_remaining + end_marker_pos + indent_len + label.len();
1136 self.pos = base_pos + token_end_in_remaining;
1137
1138 let span = Span::new(start as u32, self.pos as u32);
1139
1140 if is_nowdoc {
1141 Some(Token::new(TokenKind::Nowdoc, span))
1142 } else {
1143 Some(Token::new(TokenKind::Heredoc, span))
1144 }
1145 }
1146}
1147
1148pub fn lex_all(source: &str) -> (Vec<Token>, Vec<LexerError>) {
1156 let mut lexer = Lexer::new(source);
1157 let mut tokens = Vec::new();
1158
1159 loop {
1160 let tok = lexer.next_token();
1161 let is_eof = tok.kind == TokenKind::Eof;
1162 tokens.push(tok);
1163 if is_eof {
1164 break;
1165 }
1166 }
1167
1168 let eof_span = tokens.last().unwrap().span;
1171 tokens.push(Token::new(TokenKind::Eof, eof_span));
1172
1173 let errors = lexer.errors;
1174 (tokens, errors)
1175}
1176
1177#[cfg(test)]
1178mod tests {
1179 use super::*;
1180
1181 fn collect_tokens(source: &str) -> Vec<Token> {
1182 let mut lexer = Lexer::new(source);
1183 let mut tokens = Vec::new();
1184 loop {
1185 let token = lexer.next_token();
1186 if token.kind == TokenKind::Eof {
1187 tokens.push(token);
1188 break;
1189 }
1190 tokens.push(token);
1191 }
1192 tokens
1193 }
1194
1195 fn collect_kinds(source: &str) -> Vec<TokenKind> {
1196 collect_tokens(source).into_iter().map(|t| t.kind).collect()
1197 }
1198
1199 fn php_kinds(code: &str) -> Vec<TokenKind> {
1201 let full = format!("<?php {}", code);
1202 collect_kinds(&full)
1203 .into_iter()
1204 .filter(|k| *k != TokenKind::OpenTag && *k != TokenKind::Eof)
1205 .collect()
1206 }
1207
1208 fn php_tokens(code: &str) -> Vec<(TokenKind, String)> {
1210 let full = format!("<?php {}", code);
1211 let mut lexer = Lexer::new(&full);
1212 let mut result = Vec::new();
1213 loop {
1214 let token = lexer.next_token();
1215 if token.kind == TokenKind::Eof {
1216 break;
1217 }
1218 if token.kind == TokenKind::OpenTag {
1219 continue;
1220 }
1221 let text = lexer.token_text(&token).to_string();
1222 result.push((token.kind, text));
1223 }
1224 result
1225 }
1226
1227 #[test]
1228 fn test_php_only() {
1229 let tokens = collect_kinds("<?php $x = 42;");
1230 assert_eq!(
1231 tokens,
1232 vec![
1233 TokenKind::OpenTag,
1234 TokenKind::Variable,
1235 TokenKind::Equals,
1236 TokenKind::IntLiteral,
1237 TokenKind::Semicolon,
1238 TokenKind::Eof,
1239 ]
1240 );
1241 }
1242
1243 #[test]
1244 fn test_inline_html_before_php() {
1245 let tokens = collect_kinds("<html><?php echo 1;");
1246 assert_eq!(
1247 tokens,
1248 vec![
1249 TokenKind::InlineHtml,
1250 TokenKind::OpenTag,
1251 TokenKind::Echo,
1252 TokenKind::IntLiteral,
1253 TokenKind::Semicolon,
1254 TokenKind::Eof,
1255 ]
1256 );
1257 }
1258
1259 #[test]
1260 fn test_inline_html_after_close_tag() {
1261 let tokens = collect_kinds("<?php echo 1; ?><html>");
1262 assert_eq!(
1263 tokens,
1264 vec![
1265 TokenKind::OpenTag,
1266 TokenKind::Echo,
1267 TokenKind::IntLiteral,
1268 TokenKind::Semicolon,
1269 TokenKind::CloseTag,
1270 TokenKind::InlineHtml,
1271 TokenKind::Eof,
1272 ]
1273 );
1274 }
1275
1276 #[test]
1277 fn test_keyword_resolution() {
1278 let tokens = collect_kinds("<?php if else while for foreach function return");
1279 assert_eq!(
1280 tokens,
1281 vec![
1282 TokenKind::OpenTag,
1283 TokenKind::If,
1284 TokenKind::Else,
1285 TokenKind::While,
1286 TokenKind::For,
1287 TokenKind::Foreach,
1288 TokenKind::Function,
1289 TokenKind::Return,
1290 TokenKind::Eof,
1291 ]
1292 );
1293 }
1294
1295 #[test]
1296 fn test_keyword_case_insensitive() {
1297 let tokens = collect_kinds("<?php IF ELSE TRUE FALSE NULL");
1298 assert_eq!(
1299 tokens,
1300 vec![
1301 TokenKind::OpenTag,
1302 TokenKind::If,
1303 TokenKind::Else,
1304 TokenKind::True,
1305 TokenKind::False,
1306 TokenKind::Null,
1307 TokenKind::Eof,
1308 ]
1309 );
1310 }
1311
1312 #[test]
1313 fn test_peek_doesnt_consume() {
1314 let mut lexer = Lexer::new("<?php 42");
1315 let peeked = lexer.peek().clone();
1316 assert_eq!(peeked.kind, TokenKind::OpenTag);
1317 let next = lexer.next_token();
1318 assert_eq!(next.kind, TokenKind::OpenTag);
1319 let next = lexer.next_token();
1320 assert_eq!(next.kind, TokenKind::IntLiteral);
1321 }
1322
1323 #[test]
1324 fn test_token_text() {
1325 let source = "<?php $myVar = 'hello';";
1326 let mut lexer = Lexer::new(source);
1327 lexer.next_token(); let var_tok = lexer.next_token();
1329 assert_eq!(lexer.token_text(&var_tok), "$myVar");
1330 lexer.next_token(); let str_tok = lexer.next_token();
1332 assert_eq!(lexer.token_text(&str_tok), "'hello'");
1333 }
1334
1335 #[test]
1336 fn test_spans_are_correct() {
1337 let source = "<?php $x";
1338 let tokens = collect_tokens(source);
1339 assert_eq!(tokens[0].span, Span::new(0, 5)); assert_eq!(tokens[1].span, Span::new(6, 8)); }
1342
1343 #[test]
1344 fn test_operators() {
1345 let tokens = collect_kinds("<?php === !== <=> ?? ++ -- **");
1346 assert_eq!(
1347 tokens,
1348 vec![
1349 TokenKind::OpenTag,
1350 TokenKind::EqualsEqualsEquals,
1351 TokenKind::BangEqualsEquals,
1352 TokenKind::Spaceship,
1353 TokenKind::QuestionQuestion,
1354 TokenKind::PlusPlus,
1355 TokenKind::MinusMinus,
1356 TokenKind::StarStar,
1357 TokenKind::Eof,
1358 ]
1359 );
1360 }
1361
1362 #[test]
1363 fn test_string_literals() {
1364 let tokens = collect_kinds(r#"<?php 'single' "double""#);
1365 assert_eq!(
1366 tokens,
1367 vec![
1368 TokenKind::OpenTag,
1369 TokenKind::SingleQuotedString,
1370 TokenKind::DoubleQuotedString,
1371 TokenKind::Eof,
1372 ]
1373 );
1374 }
1375
1376 #[test]
1377 fn test_assignment_operators() {
1378 let tokens = collect_kinds("<?php += -= *= /= %= **= .= ??=");
1379 assert_eq!(
1380 tokens,
1381 vec![
1382 TokenKind::OpenTag,
1383 TokenKind::PlusEquals,
1384 TokenKind::MinusEquals,
1385 TokenKind::StarEquals,
1386 TokenKind::SlashEquals,
1387 TokenKind::PercentEquals,
1388 TokenKind::StarStarEquals,
1389 TokenKind::DotEquals,
1390 TokenKind::CoalesceEquals,
1391 TokenKind::Eof,
1392 ]
1393 );
1394 }
1395
#[test]
fn test_logical_keywords() {
    // Word-form logical operators lex as keyword tokens, not identifiers.
    let expected = [
        TokenKind::OpenTag,
        TokenKind::And,
        TokenKind::Or,
        TokenKind::Xor,
        TokenKind::Eof,
    ];
    assert_eq!(collect_kinds("<?php and or xor"), expected);
}

#[test]
fn test_empty_source() {
    // An empty input produces nothing but EOF.
    assert_eq!(collect_kinds(""), [TokenKind::Eof]);
}

#[test]
fn test_only_inline_html() {
    // With no `<?php` open tag, the whole input is one inline-HTML token.
    assert_eq!(
        collect_kinds("<html><body>Hello</body></html>"),
        [TokenKind::InlineHtml, TokenKind::Eof]
    );
}

#[test]
fn test_basic_operators() {
    // Single-character arithmetic operators plus `**` and `.`.
    let kinds = php_kinds("+ - * / % ** .");
    let expected = [
        TokenKind::Plus,
        TokenKind::Minus,
        TokenKind::Star,
        TokenKind::Slash,
        TokenKind::Percent,
        TokenKind::StarStar,
        TokenKind::Dot,
    ];
    assert_eq!(kinds, expected);
}

#[test]
fn test_integers() {
    // Decimal, hex, binary, and legacy octal integer literals.
    let observed = php_tokens("42 0xFF 0b1010 077");
    let expected = [
        (TokenKind::IntLiteral, "42"),
        (TokenKind::HexIntLiteral, "0xFF"),
        (TokenKind::BinIntLiteral, "0b1010"),
        (TokenKind::OctIntLiteral, "077"),
    ];
    for (i, (kind, text)) in expected.into_iter().enumerate() {
        assert_eq!(observed[i], (kind, text.to_string()));
    }
}

#[test]
fn test_floats() {
    // A plain decimal float and two exponent forms.
    let observed = php_tokens("3.14 1e10 2.5e-3");
    let expected = [
        (TokenKind::FloatLiteralSimple, "3.14"),
        (TokenKind::FloatLiteral, "1e10"),
        (TokenKind::FloatLiteral, "2.5e-3"),
    ];
    for (i, (kind, text)) in expected.into_iter().enumerate() {
        assert_eq!(observed[i], (kind, text.to_string()));
    }
}

#[test]
fn test_strings() {
    // Escaped quotes inside a string must not terminate it early.
    let observed = php_kinds(r#"'hello' "world" 'it\'s' "say \"hi\"""#);
    let expected = [
        TokenKind::SingleQuotedString,
        TokenKind::DoubleQuotedString,
        TokenKind::SingleQuotedString,
        TokenKind::DoubleQuotedString,
    ];
    assert_eq!(observed, expected);
}

#[test]
fn test_variables() {
    // `$` followed by an identifier (including `_`-prefixed) is a variable.
    let observed = php_tokens("$x $myVar $_foo");
    for (i, text) in ["$x", "$myVar", "$_foo"].into_iter().enumerate() {
        assert_eq!(observed[i], (TokenKind::Variable, text.to_string()));
    }
}

#[test]
fn test_comments_skipped() {
    // `//`, `/* */`, and `#` comments are all dropped; only the ints remain.
    let observed = php_tokens("42 // line comment\n43 /* block */ 44 # hash comment\n45");
    for (i, text) in ["42", "43", "44", "45"].into_iter().enumerate() {
        assert_eq!(observed[i], (TokenKind::IntLiteral, text.to_string()));
    }
}

#[test]
fn test_float_leading_dot() {
    // Floats may begin with `.` and still carry an exponent.
    let observed = php_tokens(".5 .123e4");
    for (i, text) in [".5", ".123e4"].into_iter().enumerate() {
        assert_eq!(observed[i], (TokenKind::FloatLiteralLeadingDot, text.to_string()));
    }
}

#[test]
fn test_new_octal_syntax() {
    // The explicit `0o`/`0O` octal prefix gets its own token kind.
    let observed = php_tokens("0o77 0O755");
    for (i, text) in ["0o77", "0O755"].into_iter().enumerate() {
        assert_eq!(observed[i], (TokenKind::OctIntLiteralNew, text.to_string()));
    }
}

#[test]
fn test_numeric_underscores() {
    // Underscore digit separators stay part of the literal's text.
    let observed = php_tokens("1_000 0xFF_FF 0b1010_0101");
    let expected = [
        (TokenKind::IntLiteral, "1_000"),
        (TokenKind::HexIntLiteral, "0xFF_FF"),
        (TokenKind::BinIntLiteral, "0b1010_0101"),
    ];
    for (i, (kind, text)) in expected.into_iter().enumerate() {
        assert_eq!(observed[i], (kind, text.to_string()));
    }
}

#[test]
fn test_binary_prefix_strings() {
    // A `b`/`B` prefix does not change the string's token kind.
    assert_eq!(
        php_kinds(r#"b'hello' B"world""#),
        [TokenKind::SingleQuotedString, TokenKind::DoubleQuotedString]
    );
}

#[test]
fn test_hash_bracket_not_comment() {
    // `#[` opens an attribute, so it must not be treated as a `#` comment.
    let expected = [
        TokenKind::HashBracket,
        TokenKind::Identifier,
        TokenKind::RightBracket,
    ];
    assert_eq!(php_kinds("#[Attribute]"), expected);
}

#[test]
fn test_nullsafe_arrow() {
    // `?->` lexes as a single nullsafe-arrow token.
    let expected = [
        TokenKind::Variable,
        TokenKind::NullsafeArrow,
        TokenKind::Identifier,
    ];
    assert_eq!(php_kinds("$x?->y"), expected);
}

#[test]
fn test_pipe_arrow() {
    // `|>` lexes as one token, and `(...)` yields paren + ellipsis + paren.
    let expected = [
        TokenKind::Variable,
        TokenKind::PipeArrow,
        TokenKind::Identifier,
        TokenKind::LeftParen,
        TokenKind::Ellipsis,
        TokenKind::RightParen,
    ];
    assert_eq!(php_kinds("$x |> foo(...)"), expected);
}
}