1use memchr::{memchr2, memmem};
2use php_ast::Span;
3
4use crate::token::{resolve_keyword, TokenKind};
5
/// Builds the byte lookup table of PHP whitespace: space, horizontal tab,
/// carriage return, line feed, and form feed (0x0C).
const fn make_whitespace_table() -> [bool; 256] {
    let mut table = [false; 256];
    table[b' ' as usize] = true;
    table[b'\t' as usize] = true;
    table[b'\r' as usize] = true;
    table[b'\n' as usize] = true;
    table[0x0C] = true;
    table
}
22
/// Builds the byte lookup table for identifier-start bytes: ASCII letters,
/// underscore, and every byte >= 0x80 (UTF-8 multi-byte sequences).
const fn make_ident_start_table() -> [bool; 256] {
    let mut table = [false; 256];
    let mut i = 0usize;
    while i < 256 {
        let b = i as u8;
        table[i] = b.is_ascii_alphabetic() || b == b'_' || !b.is_ascii();
        i += 1;
    }
    table
}
33
/// Builds the byte lookup table for identifier-continue bytes: the
/// identifier-start set plus ASCII digits.
const fn make_ident_continue_table() -> [bool; 256] {
    let mut table = [false; 256];
    let mut i = 0usize;
    while i < 256 {
        let b = i as u8;
        table[i] = b.is_ascii_alphanumeric() || b == b'_' || !b.is_ascii();
        i += 1;
    }
    table
}
48
/// PHP whitespace bytes (space, tab, CR, LF, form feed).
static IS_PHP_WHITESPACE: [bool; 256] = make_whitespace_table();
/// Bytes that may begin an identifier (ASCII letters, `_`, bytes >= 0x80).
static IS_IDENT_START: [bool; 256] = make_ident_start_table();
/// Bytes that may continue an identifier (ident-start set plus ASCII digits).
static IS_IDENT_CONTINUE: [bool; 256] = make_ident_continue_table();
52
/// Coarse category of an error encountered while lexing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexerErrorKind {
    /// A string literal with no closing quote before end of input.
    UnterminatedString,
    /// The source exceeds the `u32::MAX`-byte limit imposed by spans.
    FileTooLarge,
    /// Any other lexing problem; see the error message for details.
    Other,
}
63
/// A recoverable lexing error: a category, a human-readable message, and the
/// source span it refers to. Lexing continues after recording one.
#[derive(Debug, Clone, PartialEq)]
pub struct LexerError {
    /// Coarse category of the error.
    pub kind: LexerErrorKind,
    /// Human-readable description.
    pub message: String,
    /// Region of the source the error applies to.
    pub span: Span,
}
70
/// A lexed token: its kind plus the byte span it covers in the source.
/// Token text is not stored; recover it via `Lexer::token_text`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}
76
77impl Token {
78 pub fn new(kind: TokenKind, span: Span) -> Self {
79 Self { kind, span }
80 }
81
82 pub fn eof(offset: u32) -> Self {
83 Self {
84 kind: TokenKind::Eof,
85 span: Span::new(offset, offset),
86 }
87 }
88}
89
/// Which sub-language the lexer is currently reading.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LexerMode {
    /// Outside PHP tags: text up to the next open tag is one `InlineHtml` token.
    InlineHtml,
    /// Inside PHP code (after `<?php` or `<?=`, until `?>`).
    Php,
}
95
/// A byte-oriented lexer over PHP source text.
///
/// Tokens carry byte spans into `source`; errors are accumulated in `errors`
/// instead of aborting, so lexing always runs to end of input.
pub struct Lexer<'src> {
    /// Full source text; all spans index into it.
    source: &'src str,
    /// Current sub-language (inline HTML vs. PHP code).
    mode: LexerMode,
    /// Current byte offset into `source`.
    pos: usize,
    /// One-token lookahead buffer, filled by `peek`.
    peeked: Option<Token>,
    /// Second lookahead slot, filled by `peek2`; only Some while `peeked` is.
    peeked2: Option<Token>,
    /// Errors recorded so far.
    pub errors: Vec<LexerError>,
}
104
105#[inline(always)]
106fn is_ident_start(b: u8) -> bool {
107 IS_IDENT_START[b as usize]
108}
109
110#[inline(always)]
111fn is_ident_continue(b: u8) -> bool {
112 IS_IDENT_CONTINUE[b as usize]
113}
114
115impl<'src> Lexer<'src> {
116 pub fn new(source: &'src str) -> Self {
117 debug_assert!(
118 source.len() <= u32::MAX as usize,
119 "source is {} bytes, which exceeds the u32::MAX span limit",
120 source.len()
121 );
122
123 let pos = if source.starts_with("#!") {
125 source.find('\n').map(|p| p + 1).unwrap_or(source.len())
126 } else {
127 0
128 };
129
130 let remaining = &source[pos..];
132 let rem_bytes = remaining.as_bytes();
133 let mode = if (rem_bytes.len() >= 5
134 && rem_bytes[0] == b'<'
135 && rem_bytes[1] == b'?'
136 && rem_bytes[2..5].eq_ignore_ascii_case(b"php"))
137 || remaining.starts_with("<?=")
138 {
139 LexerMode::Php
140 } else {
141 LexerMode::InlineHtml
142 };
143
144 Self {
145 source,
146 mode,
147 pos,
148 peeked: None,
149 peeked2: None,
150 errors: Vec::new(),
151 }
152 }
153
154 pub fn new_at(source: &'src str, offset: usize) -> Self {
159 debug_assert!(
160 source.len() <= u32::MAX as usize,
161 "source is {} bytes, which exceeds the u32::MAX span limit",
162 source.len()
163 );
164
165 Self {
166 source,
167 mode: LexerMode::Php,
168 pos: offset,
169 peeked: None,
170 peeked2: None,
171 errors: Vec::new(),
172 }
173 }
174
    /// Returns the full source text this lexer was created with.
    pub fn source(&self) -> &'src str {
        self.source
    }
178
179 pub fn peek(&mut self) -> &Token {
180 if self.peeked.is_none() {
181 self.peeked = Some(self.read_next_token());
182 }
183 self.peeked.as_ref().expect("peeked is Some: set above")
184 }
185
186 pub fn peek2(&mut self) -> &Token {
188 if self.peeked.is_none() {
190 self.peeked = Some(self.read_next_token());
191 }
192 if self.peeked2.is_none() {
193 self.peeked2 = Some(self.read_next_token());
194 }
195 self.peeked2.as_ref().expect("peeked2 is Some: set above")
196 }
197
198 pub fn next_token(&mut self) -> Token {
199 if let Some(token) = self.peeked.take() {
200 self.peeked = self.peeked2.take();
201 return token;
202 }
203 self.read_next_token()
204 }
205
206 pub fn token_text(&self, token: &Token) -> &'src str {
208 &self.source[token.span.start as usize..token.span.end as usize]
209 }
210
211 fn read_next_token(&mut self) -> Token {
212 if self.pos >= self.source.len() {
213 return Token::eof(self.source.len() as u32);
214 }
215
216 match self.mode {
217 LexerMode::InlineHtml => self.lex_inline_html(),
218 LexerMode::Php => self.lex_php(),
219 }
220 }
221
222 fn lex_inline_html(&mut self) -> Token {
223 let start = self.pos;
224 let bytes = self.source.as_bytes();
225
226 let mut search = self.pos;
230 let tag_pos = loop {
231 match memchr::memchr(b'<', &bytes[search..]) {
232 None => break None,
233 Some(offset) => {
234 let p = search + offset;
235 let rest = &bytes[p..];
236 if (rest.len() >= 5
237 && rest[0] == b'<'
238 && rest[1] == b'?'
239 && rest[2..5].eq_ignore_ascii_case(b"php"))
240 || rest.starts_with(b"<?=")
241 {
242 break Some(p - self.pos);
243 }
244 search = p + 1;
245 }
246 }
247 };
248
249 if let Some(tag_pos) = tag_pos {
250 if tag_pos == 0 {
251 self.mode = LexerMode::Php;
253 return self.lex_php();
254 }
255 let end = self.pos + tag_pos;
257 self.pos = end;
258 self.mode = LexerMode::Php;
259 Token::new(TokenKind::InlineHtml, Span::new(start as u32, end as u32))
260 } else {
261 let end = self.source.len();
263 self.pos = end;
264 Token::new(TokenKind::InlineHtml, Span::new(start as u32, end as u32))
265 }
266 }
267
    /// Lexes the next token in PHP mode: heredocs, then (after skipping
    /// whitespace) the three comment styles, then everything else via
    /// `scan_token`.
    fn lex_php(&mut self) -> Token {
        let remaining = &self.source[self.pos..];

        // Heredoc detection runs BEFORE skip_whitespace because
        // `try_lex_heredoc` measures and skips leading whitespace itself.
        if let Some(token) = self.try_lex_heredoc(remaining) {
            return token;
        }

        self.skip_whitespace();

        if self.pos >= self.source.len() {
            return Token::eof(self.source.len() as u32);
        }

        let bytes = self.source.as_bytes();
        let start = self.pos;

        // `//` line comment.
        if bytes[self.pos] == b'/' && self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'/' {
            self.pos += 2;
            Self::skip_line_comment_body(bytes, &mut self.pos);
            return self.tok(TokenKind::LineComment, start);
        }

        // `/* ... */` block comment; `/**` (but not the empty `/**/`) is a
        // doc comment.
        if bytes[self.pos] == b'/' && self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'*' {
            self.pos += 2;
            let kind = if self.pos < bytes.len()
                && bytes[self.pos] == b'*'
                && !(self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'/')
            {
                TokenKind::DocComment
            } else {
                TokenKind::BlockComment
            };
            match memmem::find(&bytes[self.pos..], b"*/") {
                Some(end) => self.pos += end + 2,
                None => {
                    // Unterminated comment: record the error and consume to EOF.
                    let span = Span::new(start as u32, self.source.len() as u32);
                    self.errors.push(LexerError {
                        kind: LexerErrorKind::Other,
                        message: "unterminated block comment".to_string(),
                        span,
                    });
                    self.pos = bytes.len();
                }
            }
            return self.tok(kind, start);
        }

        // `#` line comment — but `#[` is an attribute opener, left for
        // `scan_token`.
        if bytes[self.pos] == b'#' && !(self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'[') {
            self.pos += 1;
            Self::skip_line_comment_body(bytes, &mut self.pos);
            return self.tok(TokenKind::HashComment, start);
        }

        self.scan_token()
    }
332
333 fn skip_whitespace(&mut self) {
335 let bytes = self.source.as_bytes();
336 while self.pos < bytes.len() && IS_PHP_WHITESPACE[bytes[self.pos] as usize] {
337 self.pos += 1;
338 }
339 }
340
    /// Scans one operator, punctuation, literal, variable, identifier, or
    /// keyword token. Whitespace and comments were consumed by `lex_php`,
    /// so the current byte begins a token.
    fn scan_token(&mut self) -> Token {
        let start = self.pos;
        let bytes = self.source.as_bytes();
        let b = bytes[start];

        match b {
            b'+' => {
                if self.check_at(1, b'+') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PlusPlus, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PlusEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Plus, start)
                }
            }
            b'-' => {
                if self.check_at(1, b'-') {
                    self.pos = start + 2;
                    self.tok(TokenKind::MinusMinus, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::MinusEquals, start)
                } else if self.check_at(1, b'>') {
                    self.pos = start + 2;
                    self.tok(TokenKind::Arrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Minus, start)
                }
            }
            b'*' => {
                if self.check_at(1, b'*') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::StarStarEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::StarStar, start)
                    }
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::StarEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Star, start)
                }
            }
            // `//` and `/* ... */` were already consumed in `lex_php`.
            b'/' => {
                if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::SlashEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Slash, start)
                }
            }
            b'%' => {
                if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PercentEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Percent, start)
                }
            }
            b'.' => {
                // `.5` style leading-dot float literal.
                if start + 1 < bytes.len() && bytes[start + 1].is_ascii_digit() {
                    self.pos = start + 1;
                    self.scan_digits(u8::is_ascii_digit);
                    if self.pos < bytes.len() && matches!(bytes[self.pos], b'e' | b'E') {
                        self.try_scan_exponent();
                    }
                    // A trailing underscore makes the literal invalid.
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    return self.tok(TokenKind::FloatLiteralLeadingDot, start);
                }
                if self.check_at(1, b'.') && self.check_at(2, b'.') {
                    self.pos = start + 3;
                    self.tok(TokenKind::Ellipsis, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::DotEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Dot, start)
                }
            }
            b'=' => {
                if self.check_at(1, b'=') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::EqualsEqualsEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::EqualsEquals, start)
                    }
                } else if self.check_at(1, b'>') {
                    self.pos = start + 2;
                    self.tok(TokenKind::FatArrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Equals, start)
                }
            }
            b'!' => {
                if self.check_at(1, b'=') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::BangEqualsEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::BangEquals, start)
                    }
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Bang, start)
                }
            }
            // `<` is complex (shifts, comparisons, heredocs, open tags).
            b'<' => self.scan_less_than(start),
            b'>' => {
                if self.check_at(1, b'>') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::ShiftRightEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::ShiftRight, start)
                    }
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::GreaterThanEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::GreaterThan, start)
                }
            }
            b'&' => {
                if self.check_at(1, b'&') {
                    self.pos = start + 2;
                    self.tok(TokenKind::AmpersandAmpersand, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::AmpersandEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Ampersand, start)
                }
            }
            b'|' => {
                if self.check_at(1, b'|') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PipePipe, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PipeEquals, start)
                } else if self.check_at(1, b'>') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PipeArrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Pipe, start)
                }
            }
            b'^' => {
                if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::CaretEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Caret, start)
                }
            }
            b'~' => {
                self.pos = start + 1;
                self.tok(TokenKind::Tilde, start)
            }
            b'?' => {
                if self.check_at(1, b'>') {
                    // `?>` switches back to inline-HTML mode.
                    self.pos = start + 2;
                    self.mode = LexerMode::InlineHtml;
                    self.tok(TokenKind::CloseTag, start)
                } else if self.check_at(1, b'?') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::CoalesceEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::QuestionQuestion, start)
                    }
                } else if self.check_at(1, b'-') && self.check_at(2, b'>') {
                    self.pos = start + 3;
                    self.tok(TokenKind::NullsafeArrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Question, start)
                }
            }
            b':' => {
                if self.check_at(1, b':') {
                    self.pos = start + 2;
                    self.tok(TokenKind::DoubleColon, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Colon, start)
                }
            }
            b'@' => {
                self.pos = start + 1;
                self.tok(TokenKind::At, start)
            }
            b'\\' => {
                self.pos = start + 1;
                self.tok(TokenKind::Backslash, start)
            }
            b'#' => {
                if self.check_at(1, b'[') {
                    // Attribute opener `#[`; `#` comments were consumed in
                    // `lex_php` before dispatching here.
                    self.pos = start + 2;
                    self.tok(TokenKind::HashBracket, start)
                } else {
                    // Defensive: skip a stray `#` and lex the next token.
                    self.pos = start + 1;
                    self.read_next_token()
                }
            }

            b'(' => {
                self.pos = start + 1;
                self.tok(TokenKind::LeftParen, start)
            }
            b')' => {
                self.pos = start + 1;
                self.tok(TokenKind::RightParen, start)
            }
            b'[' => {
                self.pos = start + 1;
                self.tok(TokenKind::LeftBracket, start)
            }
            b']' => {
                self.pos = start + 1;
                self.tok(TokenKind::RightBracket, start)
            }
            b'{' => {
                self.pos = start + 1;
                self.tok(TokenKind::LeftBrace, start)
            }
            b'}' => {
                self.pos = start + 1;
                self.tok(TokenKind::RightBrace, start)
            }
            b';' => {
                self.pos = start + 1;
                self.tok(TokenKind::Semicolon, start)
            }
            b',' => {
                self.pos = start + 1;
                self.tok(TokenKind::Comma, start)
            }

            b'\'' => self.scan_single_quoted_string(),
            b'"' => self.scan_double_quoted_string(),
            b'`' => self.scan_backtick_string(),

            b'$' => {
                // `$name` variable; a bare `$` is its own token.
                if start + 1 < bytes.len() && is_ident_start(bytes[start + 1]) {
                    self.pos = start + 2;
                    while self.pos < bytes.len() && is_ident_continue(bytes[self.pos]) {
                        self.pos += 1;
                    }
                    self.tok(TokenKind::Variable, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Dollar, start)
                }
            }

            b'0'..=b'9' => self.scan_number(),

            _ if is_ident_start(b) => {
                // `b`/`B` may prefix binary strings (`b'..'`, `b".."`) and
                // heredocs (`b<<<`); otherwise it starts an identifier.
                if b == b'b' || b == b'B' {
                    if self.check_at(1, b'\'') {
                        return self.scan_single_quoted_string();
                    }
                    if self.check_at(1, b'"') {
                        return self.scan_double_quoted_string();
                    }
                    if self.check_at(1, b'<') && self.check_at(2, b'<') && self.check_at(3, b'<') {
                        let remaining = &self.source[self.pos..];
                        if let Some(token) = self.try_lex_heredoc(remaining) {
                            return token;
                        }
                    }
                }
                self.scan_identifier()
            }

            _ => {
                // Unknown byte: skip it and lex the next token. Bytes >= 0x80
                // are ident-start, so only ASCII reaches here and skipping a
                // single byte cannot split a UTF-8 sequence.
                self.pos = start + 1;
                self.read_next_token()
            }
        }
    }
662
    /// Scans tokens beginning with `<`: heredoc openers (`<<<`), shift and
    /// comparison operators, and open tags (`<?php`, `<?=`).
    fn scan_less_than(&mut self, start: usize) -> Token {
        if self.check_at(1, b'<') {
            if self.check_at(2, b'<') {
                // `<<<` may begin a heredoc/nowdoc; fall through to the shift
                // operators if it does not parse as one.
                let remaining = &self.source[self.pos..];
                if let Some(token) = self.try_lex_heredoc(remaining) {
                    return token;
                }
            }
            if self.check_at(2, b'=') {
                self.pos = start + 3;
                return self.tok(TokenKind::ShiftLeftEquals, start);
            }
            self.pos = start + 2;
            return self.tok(TokenKind::ShiftLeft, start);
        }
        if self.check_at(1, b'=') {
            if self.check_at(2, b'>') {
                self.pos = start + 3;
                return self.tok(TokenKind::Spaceship, start);
            }
            self.pos = start + 2;
            return self.tok(TokenKind::LessThanEquals, start);
        }
        if self.check_at(1, b'?') {
            // `<?php` (case-insensitive) or `<?=`; a bare `<?` short tag is
            // not recognized and lexes as `<` followed by `?`.
            let bytes = self.source.as_bytes();
            if bytes.len() >= self.pos + 5
                && bytes[self.pos + 2..self.pos + 5].eq_ignore_ascii_case(b"php")
            {
                self.pos = start + 5;
                return self.tok(TokenKind::OpenTag, start);
            }
            if self.source[self.pos..].starts_with("<?=") {
                self.pos = start + 3;
                return self.tok(TokenKind::OpenTag, start);
            }
        }
        self.pos = start + 1;
        self.tok(TokenKind::LessThan, start)
    }
705
706 fn scan_single_quoted_string(&mut self) -> Token {
709 let start = self.pos;
710 let bytes = self.source.as_bytes();
711 let mut p = self.pos;
712 if bytes[p] == b'b' || bytes[p] == b'B' {
714 p += 1;
715 }
716 p += 1; loop {
718 match memchr2(b'\\', b'\'', &bytes[p..]) {
719 None => {
720 self.errors.push(LexerError {
721 kind: LexerErrorKind::UnterminatedString,
722 message: "unterminated string literal".to_string(),
723 span: Span::new(start as u32, self.source.len() as u32),
724 });
725 self.pos = self.source.len();
726 return self.tok(TokenKind::SingleQuotedString, start);
727 }
728 Some(offset) => {
729 p += offset;
730 match bytes[p] {
731 b'\\' => {
732 p += 1;
733 if p < bytes.len() {
734 p += 1;
735 }
736 }
737 _ => {
738 p += 1;
740 break;
741 }
742 }
743 }
744 }
745 }
746 self.pos = p;
747 self.tok(TokenKind::SingleQuotedString, start)
748 }
749
750 fn scan_double_quoted_string(&mut self) -> Token {
751 let start = self.pos;
752 let bytes = self.source.as_bytes();
753 let mut p = self.pos;
754 if bytes[p] == b'b' || bytes[p] == b'B' {
756 p += 1;
757 }
758 p += 1; loop {
760 match memchr2(b'\\', b'"', &bytes[p..]) {
761 None => {
762 self.errors.push(LexerError {
763 kind: LexerErrorKind::UnterminatedString,
764 message: "unterminated string literal".to_string(),
765 span: Span::new(start as u32, self.source.len() as u32),
766 });
767 self.pos = self.source.len();
768 return self.tok(TokenKind::DoubleQuotedString, start);
769 }
770 Some(offset) => {
771 p += offset;
772 match bytes[p] {
773 b'\\' => {
774 p += 1;
775 if p < bytes.len() {
776 p += 1;
777 }
778 }
779 _ => {
780 p += 1;
782 break;
783 }
784 }
785 }
786 }
787 }
788 self.pos = p;
789 self.tok(TokenKind::DoubleQuotedString, start)
790 }
791
792 fn scan_backtick_string(&mut self) -> Token {
793 let start = self.pos;
794 let bytes = self.source.as_bytes();
795 let mut p = self.pos;
796 p += 1; loop {
798 match memchr2(b'\\', b'`', &bytes[p..]) {
799 None => {
800 self.errors.push(LexerError {
801 kind: LexerErrorKind::UnterminatedString,
802 message: "unterminated string literal".to_string(),
803 span: Span::new(start as u32, self.source.len() as u32),
804 });
805 self.pos = self.source.len();
806 return self.tok(TokenKind::BacktickString, start);
807 }
808 Some(offset) => {
809 p += offset;
810 match bytes[p] {
811 b'\\' => {
812 p += 1;
813 if p < bytes.len() {
814 p += 1;
815 }
816 }
817 _ => {
818 p += 1;
820 break;
821 }
822 }
823 }
824 }
825 }
826 self.pos = p;
827 self.tok(TokenKind::BacktickString, start)
828 }
829
    /// Scans a numeric literal: `0x`/`0b`/`0o` prefixed integers, legacy
    /// leading-zero octal, decimal integers, floats, and exponents.
    /// Misplaced `_` separators yield an `InvalidNumericLiteral` token.
    fn scan_number(&mut self) -> Token {
        let start = self.pos;
        let bytes = self.source.as_bytes();

        // Radix-prefixed forms. If no valid digit follows the prefix, the
        // position is reset and the `0` re-lexes as a decimal literal.
        if bytes[start] == b'0' && start + 1 < bytes.len() {
            match bytes[start + 1] {
                b'x' | b'X' => {
                    self.pos = start + 2;
                    // `_` may not directly follow the prefix...
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    if self.scan_digits(u8::is_ascii_hexdigit) {
                        // ...nor trail the digit run.
                        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                            self.consume_invalid_numeric_rest();
                            return self.invalid_numeric(start);
                        }
                        return self.tok(TokenKind::HexIntLiteral, start);
                    }
                    self.pos = start;
                }
                b'b' | b'B' => {
                    self.pos = start + 2;
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    if self.scan_digits(|b| b == &b'0' || b == &b'1') {
                        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                            self.consume_invalid_numeric_rest();
                            return self.invalid_numeric(start);
                        }
                        return self.tok(TokenKind::BinIntLiteral, start);
                    }
                    self.pos = start;
                }
                b'o' | b'O' => {
                    self.pos = start + 2;
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    if self.scan_digits(|b| (b'0'..=b'7').contains(b)) {
                        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                            self.consume_invalid_numeric_rest();
                            return self.invalid_numeric(start);
                        }
                        return self.tok(TokenKind::OctIntLiteralNew, start);
                    }
                    self.pos = start;
                }
                _ => {}
            }
        }

        // Decimal integer part.
        self.pos = start;
        self.scan_digits(u8::is_ascii_digit);
        let integer_end = self.pos;
        let mut kind = TokenKind::IntLiteral;

        // A leading zero followed by more digits is a legacy octal literal.
        if bytes[start] == b'0' && integer_end > start + 1 {
            kind = TokenKind::OctIntLiteral;
        }

        // Fractional part.
        if self.pos < bytes.len() && bytes[self.pos] == b'.' {
            if self.pos + 1 < bytes.len() && bytes[self.pos + 1].is_ascii_digit() {
                self.pos += 1;
                self.scan_digits(u8::is_ascii_digit);
                kind = TokenKind::FloatLiteralSimple;
            } else if self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'_' {
                // e.g. `1._2` is invalid.
                self.consume_invalid_numeric_rest();
                return self.invalid_numeric(start);
            } else if self.pos + 1 >= bytes.len() || bytes[self.pos + 1] != b'.' {
                // A bare trailing dot (`1.`) is a float; `1..` leaves both
                // dots for the next token.
                self.pos += 1;
                kind = TokenKind::FloatLiteralSimple;
            }
        }

        // Exponent part; on failure the position is restored, and `1e_2`
        // style separators are invalid.
        if self.pos < bytes.len() && matches!(bytes[self.pos], b'e' | b'E') {
            if self.try_scan_exponent() {
                kind = TokenKind::FloatLiteral;
            } else if self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'_' {
                self.consume_invalid_numeric_rest();
                return self.invalid_numeric(start);
            }
        }

        // A trailing underscore after the digit run is invalid.
        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
            self.consume_invalid_numeric_rest();
            return self.invalid_numeric(start);
        }

        self.tok(kind, start)
    }
940
941 fn scan_digits(&mut self, is_valid: fn(&u8) -> bool) -> bool {
944 let bytes = self.source.as_bytes();
945 if self.pos >= bytes.len() || !is_valid(&bytes[self.pos]) {
946 return false;
947 }
948 self.pos += 1;
949 loop {
950 if self.pos >= bytes.len() {
951 break;
952 }
953 if is_valid(&bytes[self.pos]) {
954 self.pos += 1;
955 } else if bytes[self.pos] == b'_'
956 && self.pos + 1 < bytes.len()
957 && is_valid(&bytes[self.pos + 1])
958 {
959 self.pos += 2;
960 } else {
961 break;
962 }
963 }
964 true
965 }
966
967 fn try_scan_exponent(&mut self) -> bool {
970 let bytes = self.source.as_bytes();
971 let saved = self.pos;
972 self.pos += 1; if self.pos < bytes.len() && matches!(bytes[self.pos], b'+' | b'-') {
976 self.pos += 1;
977 }
978
979 if self.scan_digits(u8::is_ascii_digit) {
981 true
982 } else {
983 self.pos = saved;
984 false
985 }
986 }
987
988 fn scan_identifier(&mut self) -> Token {
991 let start = self.pos;
992 let bytes = self.source.as_bytes();
993 self.pos += 1; while self.pos < bytes.len() && is_ident_continue(bytes[self.pos]) {
995 self.pos += 1;
996 }
997 let text = &self.source[start..self.pos];
998 let kind = resolve_keyword(text).unwrap_or(TokenKind::Identifier);
999 self.tok(kind, start)
1000 }
1001
    /// Advances `pos` to the end of a line-comment body: stops AT (without
    /// consuming) the terminating newline or a `?>` close tag, so either is
    /// tokenized normally on the next pass.
    #[inline]
    fn skip_line_comment_body(bytes: &[u8], pos: &mut usize) {
        loop {
            match memchr2(b'\n', b'?', &bytes[*pos..]) {
                None => {
                    // Comment runs to end of input.
                    *pos = bytes.len();
                    return;
                }
                Some(offset) => {
                    let p = *pos + offset;
                    if bytes[p] == b'\n' {
                        *pos = p;
                        return;
                    }
                    // A lone `?` is part of the comment; only `?>` ends it.
                    if p + 1 < bytes.len() && bytes[p + 1] == b'>' {
                        *pos = p;
                        return;
                    }
                    *pos = p + 1;
                }
            }
        }
    }
1032
1033 #[inline]
1034 fn check_at(&self, offset: usize, expected: u8) -> bool {
1035 self.source.as_bytes().get(self.pos + offset) == Some(&expected)
1036 }
1037
    /// Builds a token of `kind` spanning from `start` to the current cursor.
    #[inline]
    fn tok(&self, kind: TokenKind, start: usize) -> Token {
        Token::new(kind, Span::new(start as u32, self.pos as u32))
    }
1042
    /// Records an "Invalid numeric literal" error for `start`..current
    /// position and returns the corresponding error token.
    fn invalid_numeric(&mut self, start: usize) -> Token {
        let span = Span::new(start as u32, self.pos as u32);
        self.errors.push(LexerError {
            kind: LexerErrorKind::Other,
            message: "Invalid numeric literal".to_string(),
            span,
        });
        Token::new(TokenKind::InvalidNumericLiteral, span)
    }
1052
    /// After a malformed numeric literal is detected, consumes the remainder
    /// of it: alphanumerics, `_`, `.`, and a sign — but a sign only directly
    /// after `e`/`E`, so an exponent stays part of the invalid token while a
    /// following binary `+`/`-` operator does not.
    fn consume_invalid_numeric_rest(&mut self) {
        let bytes = self.source.as_bytes();
        while self.pos < bytes.len() {
            let b = bytes[self.pos];
            if b.is_ascii_alphanumeric() || b == b'_' || b == b'.' || b == b'+' || b == b'-' {
                if (b == b'+' || b == b'-') && self.pos > 0 {
                    let prev = bytes[self.pos - 1];
                    if prev != b'e' && prev != b'E' {
                        break;
                    }
                }
                self.pos += 1;
            } else {
                break;
            }
        }
    }
1072
    /// Attempts to lex a heredoc (`<<<LABEL` / `<<<"LABEL"`) or nowdoc
    /// (`<<<'LABEL'`) from `remaining`, the source at the current position.
    ///
    /// Returns `None` when `remaining` does not begin (after optional
    /// whitespace and an optional `b`/`B` prefix) with a well-formed opener,
    /// or when no closing-label line is found; the caller then lexes the
    /// text some other way. On success, `self.pos` is left at the end of the
    /// closing label and the returned span covers the entire document.
    fn try_lex_heredoc(&mut self, remaining: &str) -> Option<Token> {
        // Skip leading whitespace; the token span starts after it.
        let trimmed = remaining.trim_start_matches(|c: char| {
            c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\x0C'
        });
        let ws_len = remaining.len() - trimmed.len();

        // Optional binary-string prefix, e.g. `b<<<EOT`.
        let (after_prefix, prefix_len) = if (trimmed.starts_with("b<<<")
            || trimmed.starts_with("B<<<"))
            && !trimmed[1..].starts_with("<<<>")
        {
            (&trimmed[1..], 1)
        } else {
            (trimmed, 0)
        };

        if !after_prefix.starts_with("<<<") {
            return None;
        }

        let base_pos = self.pos;
        let start = base_pos + ws_len;
        let after_arrows = &after_prefix[3..];
        // Horizontal whitespace may separate `<<<` from the label.
        let after_arrows_trimmed = after_arrows.trim_start_matches([' ', '\t']);
        let arrows_offset =
            ws_len + prefix_len + 3 + (after_arrows.len() - after_arrows_trimmed.len());

        // `label_line_end` is the offset (into `remaining`) of the end of the
        // opener line: its `\n`, or end of input.
        let (label, is_nowdoc, label_line_end);
        if let Some(after_quote) = after_arrows_trimmed.strip_prefix('\'') {
            // Nowdoc: `<<<'LABEL'`.
            let closing = after_quote.find('\'')?;
            label = &after_quote[..closing];
            is_nowdoc = true;
            let after_label = &after_arrows_trimmed[2 + closing..];
            let nl = after_label.find('\n').unwrap_or(after_label.len());
            label_line_end = arrows_offset + 2 + closing + nl;
            if label_line_end < remaining.len() {
                // NOTE(review): intentionally left as-is — this branch is
                // empty and has no effect; looks like leftover code.
            }
        } else {
            // Heredoc: `<<<LABEL` or `<<<"LABEL"`. `s` is the text after the
            // (possibly quoted) label.
            let s = if let Some(after_dquote) = after_arrows_trimmed.strip_prefix('"') {
                let closing = after_dquote.find('"')?;
                label = &after_dquote[..closing];
                &after_dquote[1 + closing..]
            } else {
                let end = after_arrows_trimmed
                    .find(|c: char| !c.is_ascii_alphanumeric() && c != '_')
                    .unwrap_or(after_arrows_trimmed.len());
                if end == 0 {
                    return None;
                }
                label = &after_arrows_trimmed[..end];
                &after_arrows_trimmed[end..]
            };
            is_nowdoc = false;
            let nl = s.find('\n').unwrap_or(s.len());
            label_line_end = arrows_offset + (after_arrows_trimmed.len() - s.len()) + nl;
        };

        if label.is_empty() {
            return None;
        }

        // The body starts on the line after the opener; an opener with no
        // following newline cannot be a heredoc.
        let body_start_in_remaining = if label_line_end < remaining.len() {
            label_line_end + 1
        } else {
            return None;
        };

        let body = &remaining[body_start_in_remaining..];

        // Find the first line that is the closing label: optional indentation,
        // the label, then only `;`/`,`/`)` punctuation and whitespace.
        let mut search_pos = 0;
        let end_marker_pos;
        loop {
            if search_pos >= body.len() {
                return None;
            }
            let line_start = search_pos;
            let line_end = body[line_start..]
                .find('\n')
                .map(|p| line_start + p)
                .unwrap_or(body.len());
            let line = &body[line_start..line_end];
            let trimmed_line = line.trim_start_matches([' ', '\t']);

            if trimmed_line == label
                || trimmed_line.starts_with(label)
                    && trimmed_line[label.len()..]
                        .trim_start_matches([';', ',', ')'])
                        .trim()
                        .is_empty()
            {
                end_marker_pos = line_start;
                break;
            }

            search_pos = if line_end < body.len() {
                line_end + 1
            } else {
                body.len()
            };
        }

        // The token ends right after the closing label itself: indentation
        // before it is included, trailing punctuation is not.
        let end_marker_line = &body[end_marker_pos..];
        let trimmed = end_marker_line.trim_start_matches([' ', '\t']);
        let indent_len = end_marker_line.len() - trimmed.len();
        let token_end_in_remaining =
            body_start_in_remaining + end_marker_pos + indent_len + label.len();
        self.pos = base_pos + token_end_in_remaining;

        let span = Span::new(start as u32, self.pos as u32);

        if is_nowdoc {
            Some(Token::new(TokenKind::Nowdoc, span))
        } else {
            Some(Token::new(TokenKind::Heredoc, span))
        }
    }
1205}
1206
1207pub fn lex_all(source: &str) -> (Vec<Token>, Vec<LexerError>) {
1215 if source.len() > u32::MAX as usize {
1216 let error = LexerError {
1217 kind: LexerErrorKind::FileTooLarge,
1218 message: format!(
1219 "source is {} bytes, which exceeds the maximum supported size of {} bytes",
1220 source.len(),
1221 u32::MAX
1222 ),
1223 span: Span::new(0, 0),
1224 };
1225 let eof = Token::eof(0);
1226 return (vec![eof, eof], vec![error]);
1227 }
1228
1229 let mut lexer = Lexer::new(source);
1230 let mut tokens = Vec::new();
1231
1232 loop {
1233 let tok = lexer.next_token();
1234 let is_eof = tok.kind == TokenKind::Eof;
1235 tokens.push(tok);
1236 if is_eof {
1237 break;
1238 }
1239 }
1240
1241 let eof_span = tokens.last().unwrap().span;
1244 tokens.push(Token::new(TokenKind::Eof, eof_span));
1245
1246 let errors = lexer.errors;
1247 (tokens, errors)
1248}
1249
1250#[cfg(test)]
1251mod tests {
1252 use super::*;
1253
1254 fn collect_tokens(source: &str) -> Vec<Token> {
1255 let mut lexer = Lexer::new(source);
1256 let mut tokens = Vec::new();
1257 loop {
1258 let token = lexer.next_token();
1259 if token.kind == TokenKind::Eof {
1260 tokens.push(token);
1261 break;
1262 }
1263 tokens.push(token);
1264 }
1265 tokens
1266 }
1267
1268 fn collect_kinds(source: &str) -> Vec<TokenKind> {
1269 collect_tokens(source).into_iter().map(|t| t.kind).collect()
1270 }
1271
1272 fn php_kinds(code: &str) -> Vec<TokenKind> {
1274 let full = format!("<?php {}", code);
1275 collect_kinds(&full)
1276 .into_iter()
1277 .filter(|k| *k != TokenKind::OpenTag && *k != TokenKind::Eof)
1278 .collect()
1279 }
1280
1281 fn php_tokens(code: &str) -> Vec<(TokenKind, String)> {
1283 let full = format!("<?php {}", code);
1284 let mut lexer = Lexer::new(&full);
1285 let mut result = Vec::new();
1286 loop {
1287 let token = lexer.next_token();
1288 if token.kind == TokenKind::Eof {
1289 break;
1290 }
1291 if token.kind == TokenKind::OpenTag {
1292 continue;
1293 }
1294 let text = lexer.token_text(&token).to_string();
1295 result.push((token.kind, text));
1296 }
1297 result
1298 }
1299
    // Tests for open-tag detection, mode switching, and inline HTML handling.
    mod open_tag_and_html {
        use super::*;

        #[test]
        fn test_php_only() {
            let tokens = collect_kinds("<?php $x = 42;");
            assert_eq!(
                tokens,
                vec![
                    TokenKind::OpenTag,
                    TokenKind::Variable,
                    TokenKind::Equals,
                    TokenKind::IntLiteral,
                    TokenKind::Semicolon,
                    TokenKind::Eof,
                ]
            );
        }

        #[test]
        fn test_inline_html_before_php() {
            let tokens = collect_kinds("<html><?php echo 1;");
            assert_eq!(
                tokens,
                vec![
                    TokenKind::InlineHtml,
                    TokenKind::OpenTag,
                    TokenKind::Echo,
                    TokenKind::IntLiteral,
                    TokenKind::Semicolon,
                    TokenKind::Eof,
                ]
            );
        }

        #[test]
        fn test_inline_html_after_close_tag() {
            let tokens = collect_kinds("<?php echo 1; ?><html>");
            assert_eq!(
                tokens,
                vec![
                    TokenKind::OpenTag,
                    TokenKind::Echo,
                    TokenKind::IntLiteral,
                    TokenKind::Semicolon,
                    TokenKind::CloseTag,
                    TokenKind::InlineHtml,
                    TokenKind::Eof,
                ]
            );
        }

        #[test]
        fn test_empty_source() {
            let tokens = collect_kinds("");
            assert_eq!(tokens, vec![TokenKind::Eof]);
        }

        #[test]
        fn test_only_inline_html() {
            let tokens = collect_kinds("<html><body>Hello</body></html>");
            assert_eq!(tokens, vec![TokenKind::InlineHtml, TokenKind::Eof]);
        }

        #[test]
        fn test_open_tag_uppercase() {
            // `<?php` must match case-insensitively at the start of a file.
            for tag in &["<?PHP", "<?Php", "<?PhP", "<?pHP", "<?phP"] {
                let src = format!("{} $x = 1;", tag);
                let tokens = collect_kinds(&src);
                assert_eq!(
                    tokens[0],
                    TokenKind::OpenTag,
                    "expected OpenTag for opening tag '{tag}'"
                );
            }
        }

        #[test]
        fn test_open_tag_uppercase_mid_file() {
            // ...and also when the tag follows inline HTML.
            let tokens = collect_kinds("<html><?PHP echo 1;");
            assert_eq!(
                tokens,
                vec![
                    TokenKind::InlineHtml,
                    TokenKind::OpenTag,
                    TokenKind::Echo,
                    TokenKind::IntLiteral,
                    TokenKind::Semicolon,
                    TokenKind::Eof,
                ]
            );
        }
    }
1395
1396 mod keywords {
1397 use super::*;
1398
1399 #[test]
1400 fn test_keyword_resolution() {
1401 let tokens = collect_kinds("<?php if else while for foreach function return");
1402 assert_eq!(
1403 tokens,
1404 vec![
1405 TokenKind::OpenTag,
1406 TokenKind::If,
1407 TokenKind::Else,
1408 TokenKind::While,
1409 TokenKind::For,
1410 TokenKind::Foreach,
1411 TokenKind::Function,
1412 TokenKind::Return,
1413 TokenKind::Eof,
1414 ]
1415 );
1416 }
1417
1418 #[test]
1419 fn test_keyword_case_insensitive() {
1420 let tokens = collect_kinds("<?php IF ELSE TRUE FALSE NULL");
1421 assert_eq!(
1422 tokens,
1423 vec![
1424 TokenKind::OpenTag,
1425 TokenKind::If,
1426 TokenKind::Else,
1427 TokenKind::True,
1428 TokenKind::False,
1429 TokenKind::Null,
1430 TokenKind::Eof,
1431 ]
1432 );
1433 }
1434
1435 #[test]
1436 fn test_logical_keywords() {
1437 let tokens = collect_kinds("<?php and or xor");
1438 assert_eq!(
1439 tokens,
1440 vec![
1441 TokenKind::OpenTag,
1442 TokenKind::And,
1443 TokenKind::Or,
1444 TokenKind::Xor,
1445 TokenKind::Eof,
1446 ]
1447 );
1448 }
1449 }
1450
1451 mod lexer_api {
1452 use super::*;
1453
1454 #[test]
1455 fn test_peek_doesnt_consume() {
1456 let mut lexer = Lexer::new("<?php 42");
1457 let peeked = *lexer.peek();
1458 assert_eq!(peeked.kind, TokenKind::OpenTag);
1459 let next = lexer.next_token();
1460 assert_eq!(next.kind, TokenKind::OpenTag);
1461 let next = lexer.next_token();
1462 assert_eq!(next.kind, TokenKind::IntLiteral);
1463 }
1464
1465 #[test]
1466 fn test_token_text() {
1467 let source = "<?php $myVar = 'hello';";
1468 let mut lexer = Lexer::new(source);
1469 lexer.next_token(); let var_tok = lexer.next_token();
1471 assert_eq!(lexer.token_text(&var_tok), "$myVar");
1472 lexer.next_token(); let str_tok = lexer.next_token();
1474 assert_eq!(lexer.token_text(&str_tok), "'hello'");
1475 }
1476
1477 #[test]
1478 fn test_spans_are_correct() {
1479 let source = "<?php $x";
1480 let tokens = collect_tokens(source);
1481 assert_eq!(tokens[0].span, Span::new(0, 5)); assert_eq!(tokens[1].span, Span::new(6, 8)); }
1484 }
1485
1486 mod operators {
1487 use super::*;
1488
1489 #[test]
1490 fn test_basic_operators() {
1491 assert_eq!(
1492 php_kinds("+ - * / % ** ."),
1493 vec![
1494 TokenKind::Plus,
1495 TokenKind::Minus,
1496 TokenKind::Star,
1497 TokenKind::Slash,
1498 TokenKind::Percent,
1499 TokenKind::StarStar,
1500 TokenKind::Dot,
1501 ]
1502 );
1503 }
1504
1505 #[test]
1506 fn test_operators() {
1507 let tokens = collect_kinds("<?php === !== <=> ?? ++ -- **");
1508 assert_eq!(
1509 tokens,
1510 vec![
1511 TokenKind::OpenTag,
1512 TokenKind::EqualsEqualsEquals,
1513 TokenKind::BangEqualsEquals,
1514 TokenKind::Spaceship,
1515 TokenKind::QuestionQuestion,
1516 TokenKind::PlusPlus,
1517 TokenKind::MinusMinus,
1518 TokenKind::StarStar,
1519 TokenKind::Eof,
1520 ]
1521 );
1522 }
1523
1524 #[test]
1525 fn test_assignment_operators() {
1526 let tokens = collect_kinds("<?php += -= *= /= %= **= .= ??=");
1527 assert_eq!(
1528 tokens,
1529 vec![
1530 TokenKind::OpenTag,
1531 TokenKind::PlusEquals,
1532 TokenKind::MinusEquals,
1533 TokenKind::StarEquals,
1534 TokenKind::SlashEquals,
1535 TokenKind::PercentEquals,
1536 TokenKind::StarStarEquals,
1537 TokenKind::DotEquals,
1538 TokenKind::CoalesceEquals,
1539 TokenKind::Eof,
1540 ]
1541 );
1542 }
1543
1544 #[test]
1545 fn test_hash_bracket_not_comment() {
1546 let kinds = php_kinds("#[Attribute]");
1547 assert_eq!(
1548 kinds,
1549 vec![
1550 TokenKind::HashBracket,
1551 TokenKind::Identifier,
1552 TokenKind::RightBracket,
1553 ]
1554 );
1555 }
1556
1557 #[test]
1558 fn test_nullsafe_arrow() {
1559 let kinds = php_kinds("$x?->y");
1560 assert_eq!(
1561 kinds,
1562 vec![
1563 TokenKind::Variable,
1564 TokenKind::NullsafeArrow,
1565 TokenKind::Identifier,
1566 ]
1567 );
1568 }
1569
1570 #[test]
1571 fn test_pipe_arrow() {
1572 let kinds = php_kinds("$x |> foo(...)");
1573 assert_eq!(
1574 kinds,
1575 vec![
1576 TokenKind::Variable,
1577 TokenKind::PipeArrow,
1578 TokenKind::Identifier,
1579 TokenKind::LeftParen,
1580 TokenKind::Ellipsis,
1581 TokenKind::RightParen,
1582 ]
1583 );
1584 }
1585 }
1586
1587 mod numeric_literals {
1588 use super::*;
1589
1590 #[test]
1591 fn test_integers() {
1592 let toks = php_tokens("42 0xFF 0b1010 077");
1593 assert_eq!(toks[0], (TokenKind::IntLiteral, "42".to_string()));
1594 assert_eq!(toks[1], (TokenKind::HexIntLiteral, "0xFF".to_string()));
1595 assert_eq!(toks[2], (TokenKind::BinIntLiteral, "0b1010".to_string()));
1596 assert_eq!(toks[3], (TokenKind::OctIntLiteral, "077".to_string()));
1597 }
1598
1599 #[test]
1600 fn test_floats() {
1601 let toks = php_tokens("3.14 1e10 2.5e-3");
1602 assert_eq!(toks[0], (TokenKind::FloatLiteralSimple, "3.14".to_string()));
1603 assert_eq!(toks[1], (TokenKind::FloatLiteral, "1e10".to_string()));
1604 assert_eq!(toks[2], (TokenKind::FloatLiteral, "2.5e-3".to_string()));
1605 }
1606
1607 #[test]
1608 fn test_float_leading_dot() {
1609 let toks = php_tokens(".5 .123e4");
1610 assert_eq!(
1611 toks[0],
1612 (TokenKind::FloatLiteralLeadingDot, ".5".to_string())
1613 );
1614 assert_eq!(
1615 toks[1],
1616 (TokenKind::FloatLiteralLeadingDot, ".123e4".to_string())
1617 );
1618 }
1619
1620 #[test]
1621 fn test_trailing_dot_float() {
1622 let toks = php_tokens("0. 1. 42.");
1624 assert_eq!(toks[0], (TokenKind::FloatLiteralSimple, "0.".to_string()));
1625 assert_eq!(toks[1], (TokenKind::FloatLiteralSimple, "1.".to_string()));
1626 assert_eq!(toks[2], (TokenKind::FloatLiteralSimple, "42.".to_string()));
1627 }
1628
1629 #[test]
1630 fn test_trailing_dot_not_confused_with_dotdot() {
1631 let toks = php_tokens("1..");
1634 assert_eq!(toks[0], (TokenKind::IntLiteral, "1".to_string()));
1635 assert_eq!(toks[1], (TokenKind::Dot, ".".to_string()));
1636 assert_eq!(toks[2], (TokenKind::Dot, ".".to_string()));
1637 }
1638
1639 #[test]
1640 fn test_new_octal_syntax() {
1641 let toks = php_tokens("0o77 0O755");
1642 assert_eq!(toks[0], (TokenKind::OctIntLiteralNew, "0o77".to_string()));
1643 assert_eq!(toks[1], (TokenKind::OctIntLiteralNew, "0O755".to_string()));
1644 }
1645
1646 #[test]
1647 fn test_legacy_octal_with_invalid_digits() {
1648 let toks = php_tokens("0778 019 09");
1651 assert_eq!(toks[0], (TokenKind::OctIntLiteral, "0778".to_string()));
1652 assert_eq!(toks[1], (TokenKind::OctIntLiteral, "019".to_string()));
1653 assert_eq!(toks[2], (TokenKind::OctIntLiteral, "09".to_string()));
1654 }
1655
1656 #[test]
1657 fn test_numeric_underscores() {
1658 let toks = php_tokens("1_000 0xFF_FF 0b1010_0101");
1659 assert_eq!(toks[0], (TokenKind::IntLiteral, "1_000".to_string()));
1660 assert_eq!(toks[1], (TokenKind::HexIntLiteral, "0xFF_FF".to_string()));
1661 assert_eq!(
1662 toks[2],
1663 (TokenKind::BinIntLiteral, "0b1010_0101".to_string())
1664 );
1665 }
1666 }
1667
1668 mod strings_and_variables {
1669 use super::*;
1670
1671 #[test]
1672 fn test_string_literals() {
1673 let tokens = collect_kinds(r#"<?php 'single' "double""#);
1674 assert_eq!(
1675 tokens,
1676 vec![
1677 TokenKind::OpenTag,
1678 TokenKind::SingleQuotedString,
1679 TokenKind::DoubleQuotedString,
1680 TokenKind::Eof,
1681 ]
1682 );
1683 }
1684
1685 #[test]
1686 fn test_strings() {
1687 let kinds = php_kinds(r#"'hello' "world" 'it\'s' "say \"hi\"""#);
1688 assert_eq!(
1689 kinds,
1690 vec![
1691 TokenKind::SingleQuotedString,
1692 TokenKind::DoubleQuotedString,
1693 TokenKind::SingleQuotedString,
1694 TokenKind::DoubleQuotedString,
1695 ]
1696 );
1697 }
1698
1699 #[test]
1700 fn test_binary_prefix_strings() {
1701 let kinds = php_kinds(r#"b'hello' B"world""#);
1702 assert_eq!(
1703 kinds,
1704 vec![TokenKind::SingleQuotedString, TokenKind::DoubleQuotedString,]
1705 );
1706 }
1707
1708 #[test]
1709 fn test_variables() {
1710 let toks = php_tokens("$x $myVar $_foo");
1711 assert_eq!(toks[0], (TokenKind::Variable, "$x".to_string()));
1712 assert_eq!(toks[1], (TokenKind::Variable, "$myVar".to_string()));
1713 assert_eq!(toks[2], (TokenKind::Variable, "$_foo".to_string()));
1714 }
1715
1716 #[test]
1717 fn test_comments_yielded() {
1718 let toks = php_tokens("42 // line comment\n43 /* block */ 44 # hash comment\n45");
1720 assert_eq!(toks[0], (TokenKind::IntLiteral, "42".to_string()));
1721 assert_eq!(
1722 toks[1],
1723 (TokenKind::LineComment, "// line comment".to_string())
1724 );
1725 assert_eq!(toks[2], (TokenKind::IntLiteral, "43".to_string()));
1726 assert_eq!(
1727 toks[3],
1728 (TokenKind::BlockComment, "/* block */".to_string())
1729 );
1730 assert_eq!(toks[4], (TokenKind::IntLiteral, "44".to_string()));
1731 assert_eq!(
1732 toks[5],
1733 (TokenKind::HashComment, "# hash comment".to_string())
1734 );
1735 assert_eq!(toks[6], (TokenKind::IntLiteral, "45".to_string()));
1736 }
1737 }
1738}