1use memchr::{memchr2, memmem};
2use php_ast::Span;
3
4use crate::token::{resolve_keyword, TokenKind};
5
/// Builds the 256-entry lookup table of bytes PHP treats as whitespace:
/// space, tab, CR, LF, and form feed (0x0C).
const fn make_whitespace_table() -> [bool; 256] {
    let mut table = [false; 256];
    let ws: [u8; 5] = [b' ', b'\t', b'\r', b'\n', 0x0C];
    let mut i = 0;
    while i < ws.len() {
        table[ws[i] as usize] = true;
        i += 1;
    }
    table
}
22
/// Builds the table of bytes that may begin a PHP identifier:
/// ASCII letters, underscore, and every byte >= 0x80 (UTF-8 continuation
/// and lead bytes, matching PHP's permissive identifier grammar).
const fn make_ident_start_table() -> [bool; 256] {
    let mut table = [false; 256];
    let mut i = 0usize;
    while i < 256 {
        let b = i as u8;
        table[i] = b.is_ascii_alphabetic() || b == b'_' || b >= 0x80;
        i += 1;
    }
    table
}
33
/// Builds the table of bytes that may continue a PHP identifier:
/// the start set plus ASCII digits.
const fn make_ident_continue_table() -> [bool; 256] {
    let mut table = [false; 256];
    let mut i = 0usize;
    while i < 256 {
        let b = i as u8;
        table[i] = b.is_ascii_alphanumeric() || b == b'_' || b >= 0x80;
        i += 1;
    }
    table
}
48
// Compile-time byte-classification tables, indexed directly by byte value.
static IS_PHP_WHITESPACE: [bool; 256] = make_whitespace_table();
static IS_IDENT_START: [bool; 256] = make_ident_start_table();
static IS_IDENT_CONTINUE: [bool; 256] = make_ident_continue_table();
52
/// Coarse classification of an error produced while lexing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexerErrorKind {
    /// A quoted string ran to end of input without a closing delimiter.
    UnterminatedString,
    /// The source exceeds the maximum size representable by u32 spans.
    FileTooLarge,
    /// Any other lexical problem (bad numeric literal, unterminated comment, ...).
    Other,
}
59
/// A lexical error: its classification, a human-readable message, and the
/// source span it covers. Lexing never aborts; errors are accumulated.
#[derive(Debug, Clone, PartialEq)]
pub struct LexerError {
    pub kind: LexerErrorKind,
    pub message: String,
    pub span: Span,
}
66
/// A single lexed token: its kind plus the byte span it occupies in the
/// source. Token text is recovered on demand via `Lexer::token_text`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}
72
73impl Token {
74 pub fn new(kind: TokenKind, span: Span) -> Self {
75 Self { kind, span }
76 }
77
78 pub fn eof(offset: u32) -> Self {
79 Self {
80 kind: TokenKind::Eof,
81 span: Span::new(offset, offset),
82 }
83 }
84}
85
/// Which sub-language the lexer is currently reading.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LexerMode {
    /// Outside PHP tags: bytes stream out as InlineHtml until an open tag.
    InlineHtml,
    /// Inside PHP tags: regular PHP tokenization.
    Php,
}
91
/// A streaming PHP lexer over `source` with two tokens of lookahead.
pub struct Lexer<'src> {
    source: &'src str,
    // Current sub-language (inline HTML vs PHP code).
    mode: LexerMode,
    // Byte offset of the next unread byte.
    pos: usize,
    // First lookahead slot, filled by `peek`.
    peeked: Option<Token>,
    // Second lookahead slot, filled by `peek2`.
    peeked2: Option<Token>,
    // Errors accumulated while lexing; lexing itself never fails.
    pub errors: Vec<LexerError>,
}
100
101#[inline(always)]
102fn is_ident_start(b: u8) -> bool {
103 IS_IDENT_START[b as usize]
104}
105
106#[inline(always)]
107fn is_ident_continue(b: u8) -> bool {
108 IS_IDENT_CONTINUE[b as usize]
109}
110
111impl<'src> Lexer<'src> {
112 pub fn new(source: &'src str) -> Self {
113 debug_assert!(
114 source.len() <= u32::MAX as usize,
115 "source is {} bytes, which exceeds the u32::MAX span limit",
116 source.len()
117 );
118
119 let pos = if source.starts_with("#!") {
121 source.find('\n').map(|p| p + 1).unwrap_or(source.len())
122 } else {
123 0
124 };
125
126 let remaining = &source[pos..];
128 let rem_bytes = remaining.as_bytes();
129 let mode = if (rem_bytes.len() >= 5
130 && rem_bytes[0] == b'<'
131 && rem_bytes[1] == b'?'
132 && rem_bytes[2..5].eq_ignore_ascii_case(b"php"))
133 || remaining.starts_with("<?=")
134 {
135 LexerMode::Php
136 } else {
137 LexerMode::InlineHtml
138 };
139
140 Self {
141 source,
142 mode,
143 pos,
144 peeked: None,
145 peeked2: None,
146 errors: Vec::new(),
147 }
148 }
149
    /// Creates a lexer that starts mid-source at byte `offset`, forced into
    /// PHP mode (no open-tag detection is performed).
    ///
    /// NOTE(review): callers are expected to pass an `offset` that lands on a
    /// UTF-8 boundary inside PHP code — confirm at call sites.
    pub fn new_at(source: &'src str, offset: usize) -> Self {
        debug_assert!(
            source.len() <= u32::MAX as usize,
            "source is {} bytes, which exceeds the u32::MAX span limit",
            source.len()
        );

        Self {
            source,
            mode: LexerMode::Php,
            pos: offset,
            peeked: None,
            peeked2: None,
            errors: Vec::new(),
        }
    }
170
    /// Returns the full source text this lexer was created over.
    pub fn source(&self) -> &'src str {
        self.source
    }
174
175 pub fn peek(&mut self) -> &Token {
176 if self.peeked.is_none() {
177 self.peeked = Some(self.read_next_token());
178 }
179 self.peeked.as_ref().expect("peeked is Some: set above")
180 }
181
182 pub fn peek2(&mut self) -> &Token {
184 if self.peeked.is_none() {
186 self.peeked = Some(self.read_next_token());
187 }
188 if self.peeked2.is_none() {
189 self.peeked2 = Some(self.read_next_token());
190 }
191 self.peeked2.as_ref().expect("peeked2 is Some: set above")
192 }
193
194 pub fn next_token(&mut self) -> Token {
195 if let Some(token) = self.peeked.take() {
196 self.peeked = self.peeked2.take();
197 return token;
198 }
199 self.read_next_token()
200 }
201
202 pub fn token_text(&self, token: &Token) -> &'src str {
204 &self.source[token.span.start as usize..token.span.end as usize]
205 }
206
207 fn read_next_token(&mut self) -> Token {
208 if self.pos >= self.source.len() {
209 return Token::eof(self.source.len() as u32);
210 }
211
212 match self.mode {
213 LexerMode::InlineHtml => self.lex_inline_html(),
214 LexerMode::Php => self.lex_php(),
215 }
216 }
217
218 fn lex_inline_html(&mut self) -> Token {
219 let start = self.pos;
220 let bytes = self.source.as_bytes();
221
222 let mut search = self.pos;
226 let tag_pos = loop {
227 match memchr::memchr(b'<', &bytes[search..]) {
228 None => break None,
229 Some(offset) => {
230 let p = search + offset;
231 let rest = &bytes[p..];
232 if (rest.len() >= 5
233 && rest[0] == b'<'
234 && rest[1] == b'?'
235 && rest[2..5].eq_ignore_ascii_case(b"php"))
236 || rest.starts_with(b"<?=")
237 {
238 break Some(p - self.pos);
239 }
240 search = p + 1;
241 }
242 }
243 };
244
245 if let Some(tag_pos) = tag_pos {
246 if tag_pos == 0 {
247 self.mode = LexerMode::Php;
249 return self.lex_php();
250 }
251 let end = self.pos + tag_pos;
253 self.pos = end;
254 self.mode = LexerMode::Php;
255 Token::new(TokenKind::InlineHtml, Span::new(start as u32, end as u32))
256 } else {
257 let end = self.source.len();
259 self.pos = end;
260 Token::new(TokenKind::InlineHtml, Span::new(start as u32, end as u32))
261 }
262 }
263
    /// Lexes one token in PHP mode: heredoc openers first (they skip their
    /// own leading whitespace), then whitespace, then comments, and finally
    /// the general token scanner.
    fn lex_php(&mut self) -> Token {
        let remaining = &self.source[self.pos..];

        // Heredoc/nowdoc must be checked before whitespace is skipped so the
        // token span can include the opener exactly where it appears.
        if let Some(token) = self.try_lex_heredoc(remaining) {
            return token;
        }

        self.skip_whitespace();

        if self.pos >= self.source.len() {
            return Token::eof(self.source.len() as u32);
        }

        let bytes = self.source.as_bytes();
        let start = self.pos;

        // "//" line comment (terminated by newline or "?>").
        if bytes[self.pos] == b'/' && self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'/' {
            self.pos += 2;
            Self::skip_line_comment_body(bytes, &mut self.pos);
            return self.tok(TokenKind::LineComment, start);
        }

        // "/* ... */" block comment; "/**" (but not the empty "/**/") is a
        // doc comment.
        if bytes[self.pos] == b'/' && self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'*' {
            self.pos += 2;
            let kind = if self.pos < bytes.len()
                && bytes[self.pos] == b'*'
                && !(self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'/')
            {
                TokenKind::DocComment
            } else {
                TokenKind::BlockComment
            };
            match memmem::find(&bytes[self.pos..], b"*/") {
                Some(end) => self.pos += end + 2,
                None => {
                    // Unterminated comment: report it and consume to EOF.
                    let span = Span::new(start as u32, self.source.len() as u32);
                    self.errors.push(LexerError {
                        kind: LexerErrorKind::Other,
                        message: "unterminated block comment".to_string(),
                        span,
                    });
                    self.pos = bytes.len();
                }
            }
            return self.tok(kind, start);
        }

        // "#" line comment — but "#[" starts an attribute and falls through
        // to scan_token.
        if bytes[self.pos] == b'#' && !(self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'[') {
            self.pos += 1;
            Self::skip_line_comment_body(bytes, &mut self.pos);
            return self.tok(TokenKind::HashComment, start);
        }

        self.scan_token()
    }
328
329 fn skip_whitespace(&mut self) {
331 let bytes = self.source.as_bytes();
332 while self.pos < bytes.len() && IS_PHP_WHITESPACE[bytes[self.pos] as usize] {
333 self.pos += 1;
334 }
335 }
336
    /// Scans one PHP token at `self.pos`, which is guaranteed in-bounds and
    /// not whitespace or the start of a comment (`lex_php` handles those).
    ///
    /// Dispatches on the first byte. Multi-byte operators are matched
    /// longest-first (e.g. `**=` before `**` before `*`), so the order of
    /// the `check_at` tests within each arm is load-bearing.
    fn scan_token(&mut self) -> Token {
        let start = self.pos;
        let bytes = self.source.as_bytes();
        let b = bytes[start];

        match b {
            b'+' => {
                if self.check_at(1, b'+') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PlusPlus, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PlusEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Plus, start)
                }
            }
            b'-' => {
                if self.check_at(1, b'-') {
                    self.pos = start + 2;
                    self.tok(TokenKind::MinusMinus, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::MinusEquals, start)
                } else if self.check_at(1, b'>') {
                    self.pos = start + 2;
                    self.tok(TokenKind::Arrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Minus, start)
                }
            }
            b'*' => {
                if self.check_at(1, b'*') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::StarStarEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::StarStar, start)
                    }
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::StarEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Star, start)
                }
            }
            b'/' => {
                // "//" and "/*" were already consumed as comments in lex_php.
                if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::SlashEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Slash, start)
                }
            }
            b'%' => {
                if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PercentEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Percent, start)
                }
            }
            b'.' => {
                // ".5"-style float literal with a leading dot.
                if start + 1 < bytes.len() && bytes[start + 1].is_ascii_digit() {
                    self.pos = start + 1;
                    self.scan_digits(u8::is_ascii_digit);
                    if self.pos < bytes.len() && matches!(bytes[self.pos], b'e' | b'E') {
                        self.try_scan_exponent();
                    }
                    // A trailing underscore invalidates the literal.
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    return self.tok(TokenKind::FloatLiteralLeadingDot, start);
                }
                if self.check_at(1, b'.') && self.check_at(2, b'.') {
                    self.pos = start + 3;
                    self.tok(TokenKind::Ellipsis, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::DotEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Dot, start)
                }
            }
            b'=' => {
                if self.check_at(1, b'=') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::EqualsEqualsEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::EqualsEquals, start)
                    }
                } else if self.check_at(1, b'>') {
                    self.pos = start + 2;
                    self.tok(TokenKind::FatArrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Equals, start)
                }
            }
            b'!' => {
                if self.check_at(1, b'=') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::BangEqualsEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::BangEquals, start)
                    }
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Bang, start)
                }
            }
            // "<" has heredoc and open-tag cases; handled separately.
            b'<' => self.scan_less_than(start),
            b'>' => {
                if self.check_at(1, b'>') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::ShiftRightEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::ShiftRight, start)
                    }
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::GreaterThanEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::GreaterThan, start)
                }
            }
            b'&' => {
                if self.check_at(1, b'&') {
                    self.pos = start + 2;
                    self.tok(TokenKind::AmpersandAmpersand, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::AmpersandEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Ampersand, start)
                }
            }
            b'|' => {
                if self.check_at(1, b'|') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PipePipe, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PipeEquals, start)
                } else if self.check_at(1, b'>') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PipeArrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Pipe, start)
                }
            }
            b'^' => {
                if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::CaretEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Caret, start)
                }
            }
            b'~' => {
                self.pos = start + 1;
                self.tok(TokenKind::Tilde, start)
            }
            b'?' => {
                if self.check_at(1, b'>') {
                    // "?>" closes PHP mode; what follows is inline HTML.
                    self.pos = start + 2;
                    self.mode = LexerMode::InlineHtml;
                    self.tok(TokenKind::CloseTag, start)
                } else if self.check_at(1, b'?') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::CoalesceEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::QuestionQuestion, start)
                    }
                } else if self.check_at(1, b'-') && self.check_at(2, b'>') {
                    self.pos = start + 3;
                    self.tok(TokenKind::NullsafeArrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Question, start)
                }
            }
            b':' => {
                if self.check_at(1, b':') {
                    self.pos = start + 2;
                    self.tok(TokenKind::DoubleColon, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Colon, start)
                }
            }
            b'@' => {
                self.pos = start + 1;
                self.tok(TokenKind::At, start)
            }
            b'\\' => {
                self.pos = start + 1;
                self.tok(TokenKind::Backslash, start)
            }
            b'#' => {
                if self.check_at(1, b'[') {
                    // "#[" opens an attribute; plain "#" comments were
                    // consumed by lex_php and normally never reach here.
                    self.pos = start + 2;
                    self.tok(TokenKind::HashBracket, start)
                } else {
                    // Defensive: skip the stray "#" and lex what follows.
                    self.pos = start + 1;
                    self.read_next_token()
                }
            }

            b'(' => {
                self.pos = start + 1;
                self.tok(TokenKind::LeftParen, start)
            }
            b')' => {
                self.pos = start + 1;
                self.tok(TokenKind::RightParen, start)
            }
            b'[' => {
                self.pos = start + 1;
                self.tok(TokenKind::LeftBracket, start)
            }
            b']' => {
                self.pos = start + 1;
                self.tok(TokenKind::RightBracket, start)
            }
            b'{' => {
                self.pos = start + 1;
                self.tok(TokenKind::LeftBrace, start)
            }
            b'}' => {
                self.pos = start + 1;
                self.tok(TokenKind::RightBrace, start)
            }
            b';' => {
                self.pos = start + 1;
                self.tok(TokenKind::Semicolon, start)
            }
            b',' => {
                self.pos = start + 1;
                self.tok(TokenKind::Comma, start)
            }

            b'\'' => self.scan_single_quoted_string(),
            b'"' => self.scan_double_quoted_string(),
            b'`' => self.scan_backtick_string(),

            b'$' => {
                // "$name" is a variable; a bare "$" lexes as Dollar.
                if start + 1 < bytes.len() && is_ident_start(bytes[start + 1]) {
                    self.pos = start + 2;
                    while self.pos < bytes.len() && is_ident_continue(bytes[self.pos]) {
                        self.pos += 1;
                    }
                    self.tok(TokenKind::Variable, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Dollar, start)
                }
            }

            b'0'..=b'9' => self.scan_number(),

            _ if is_ident_start(b) => {
                // b'...' / b"..." / b<<< are binary-string prefixes, not
                // identifiers.
                if b == b'b' || b == b'B' {
                    if self.check_at(1, b'\'') {
                        return self.scan_single_quoted_string();
                    }
                    if self.check_at(1, b'"') {
                        return self.scan_double_quoted_string();
                    }
                    if self.check_at(1, b'<') && self.check_at(2, b'<') && self.check_at(3, b'<') {
                        let remaining = &self.source[self.pos..];
                        if let Some(token) = self.try_lex_heredoc(remaining) {
                            return token;
                        }
                    }
                }
                self.scan_identifier()
            }

            _ => {
                // Unrecognized byte: skip it and lex whatever comes next.
                self.pos = start + 1;
                self.read_next_token()
            }
        }
    }
658
    /// Scans tokens that begin with `<`: heredoc openers (`<<<`), `<<=`,
    /// `<<`, `<=>`, `<=`, the `<?php` / `<?=` open tags, and plain `<`.
    fn scan_less_than(&mut self, start: usize) -> Token {
        if self.check_at(1, b'<') {
            if self.check_at(2, b'<') {
                // "<<<" may open a heredoc/nowdoc; if the opener is invalid
                // it falls through and lexes as "<<" followed by "<".
                let remaining = &self.source[self.pos..];
                if let Some(token) = self.try_lex_heredoc(remaining) {
                    return token;
                }
            }
            if self.check_at(2, b'=') {
                self.pos = start + 3;
                return self.tok(TokenKind::ShiftLeftEquals, start);
            }
            self.pos = start + 2;
            return self.tok(TokenKind::ShiftLeft, start);
        }
        if self.check_at(1, b'=') {
            if self.check_at(2, b'>') {
                self.pos = start + 3;
                return self.tok(TokenKind::Spaceship, start);
            }
            self.pos = start + 2;
            return self.tok(TokenKind::LessThanEquals, start);
        }
        if self.check_at(1, b'?') {
            // "<?php" (case-insensitive) or "<?=" both lex as OpenTag.
            let bytes = self.source.as_bytes();
            if bytes.len() >= self.pos + 5
                && bytes[self.pos + 2..self.pos + 5].eq_ignore_ascii_case(b"php")
            {
                self.pos = start + 5;
                return self.tok(TokenKind::OpenTag, start);
            }
            if self.source[self.pos..].starts_with("<?=") {
                self.pos = start + 3;
                return self.tok(TokenKind::OpenTag, start);
            }
        }
        self.pos = start + 1;
        self.tok(TokenKind::LessThan, start)
    }
701
702 fn scan_single_quoted_string(&mut self) -> Token {
705 let start = self.pos;
706 let bytes = self.source.as_bytes();
707 let mut p = self.pos;
708 if bytes[p] == b'b' || bytes[p] == b'B' {
710 p += 1;
711 }
712 p += 1; loop {
714 match memchr2(b'\\', b'\'', &bytes[p..]) {
715 None => {
716 self.errors.push(LexerError {
717 kind: LexerErrorKind::UnterminatedString,
718 message: "unterminated string literal".to_string(),
719 span: Span::new(start as u32, self.source.len() as u32),
720 });
721 self.pos = self.source.len();
722 return self.tok(TokenKind::SingleQuotedString, start);
723 }
724 Some(offset) => {
725 p += offset;
726 match bytes[p] {
727 b'\\' => {
728 p += 1;
729 if p < bytes.len() {
730 p += 1;
731 }
732 }
733 _ => {
734 p += 1;
736 break;
737 }
738 }
739 }
740 }
741 }
742 self.pos = p;
743 self.tok(TokenKind::SingleQuotedString, start)
744 }
745
746 fn scan_double_quoted_string(&mut self) -> Token {
747 let start = self.pos;
748 let bytes = self.source.as_bytes();
749 let mut p = self.pos;
750 if bytes[p] == b'b' || bytes[p] == b'B' {
752 p += 1;
753 }
754 p += 1; loop {
756 match memchr2(b'\\', b'"', &bytes[p..]) {
757 None => {
758 self.errors.push(LexerError {
759 kind: LexerErrorKind::UnterminatedString,
760 message: "unterminated string literal".to_string(),
761 span: Span::new(start as u32, self.source.len() as u32),
762 });
763 self.pos = self.source.len();
764 return self.tok(TokenKind::DoubleQuotedString, start);
765 }
766 Some(offset) => {
767 p += offset;
768 match bytes[p] {
769 b'\\' => {
770 p += 1;
771 if p < bytes.len() {
772 p += 1;
773 }
774 }
775 _ => {
776 p += 1;
778 break;
779 }
780 }
781 }
782 }
783 }
784 self.pos = p;
785 self.tok(TokenKind::DoubleQuotedString, start)
786 }
787
788 fn scan_backtick_string(&mut self) -> Token {
789 let start = self.pos;
790 let bytes = self.source.as_bytes();
791 let mut p = self.pos;
792 p += 1; loop {
794 match memchr2(b'\\', b'`', &bytes[p..]) {
795 None => {
796 self.errors.push(LexerError {
797 kind: LexerErrorKind::UnterminatedString,
798 message: "unterminated string literal".to_string(),
799 span: Span::new(start as u32, self.source.len() as u32),
800 });
801 self.pos = self.source.len();
802 return self.tok(TokenKind::BacktickString, start);
803 }
804 Some(offset) => {
805 p += offset;
806 match bytes[p] {
807 b'\\' => {
808 p += 1;
809 if p < bytes.len() {
810 p += 1;
811 }
812 }
813 _ => {
814 p += 1;
816 break;
817 }
818 }
819 }
820 }
821 }
822 self.pos = p;
823 self.tok(TokenKind::BacktickString, start)
824 }
825
    /// Scans a numeric literal: hex/binary/octal-prefixed integers, decimal
    /// integers (including legacy leading-zero octal), and floats with
    /// fractional and/or exponent parts. PHP `_` digit separators are valid
    /// only between two digits; any misplaced underscore consumes the rest
    /// of the blob and yields InvalidNumericLiteral.
    fn scan_number(&mut self) -> Token {
        let start = self.pos;
        let bytes = self.source.as_bytes();

        // Radix prefixes: 0x/0X, 0b/0B, 0o/0O.
        if bytes[start] == b'0' && start + 1 < bytes.len() {
            match bytes[start + 1] {
                b'x' | b'X' => {
                    self.pos = start + 2;
                    // "0x_" — separator directly after the prefix is invalid.
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    if self.scan_digits(u8::is_ascii_hexdigit) {
                        // Trailing separator after the digits is invalid too.
                        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                            self.consume_invalid_numeric_rest();
                            return self.invalid_numeric(start);
                        }
                        return self.tok(TokenKind::HexIntLiteral, start);
                    }
                    // No digits after the prefix: rewind and lex plain "0".
                    self.pos = start;
                }
                b'b' | b'B' => {
                    self.pos = start + 2;
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    if self.scan_digits(|b| b == &b'0' || b == &b'1') {
                        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                            self.consume_invalid_numeric_rest();
                            return self.invalid_numeric(start);
                        }
                        return self.tok(TokenKind::BinIntLiteral, start);
                    }
                    self.pos = start;
                }
                b'o' | b'O' => {
                    self.pos = start + 2;
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    if self.scan_digits(|b| (b'0'..=b'7').contains(b)) {
                        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                            self.consume_invalid_numeric_rest();
                            return self.invalid_numeric(start);
                        }
                        return self.tok(TokenKind::OctIntLiteralNew, start);
                    }
                    self.pos = start;
                }
                _ => {}
            }
        }

        // Decimal integer part.
        self.pos = start;
        self.scan_digits(u8::is_ascii_digit);
        let integer_end = self.pos;
        let mut kind = TokenKind::IntLiteral;

        // Legacy octal: a leading 0 followed by more digits (e.g. 0755).
        if bytes[start] == b'0' && integer_end > start + 1 {
            kind = TokenKind::OctIntLiteral;
        }

        // Fractional part.
        if self.pos < bytes.len() && bytes[self.pos] == b'.' {
            if self.pos + 1 < bytes.len() && bytes[self.pos + 1].is_ascii_digit() {
                self.pos += 1;
                self.scan_digits(u8::is_ascii_digit);
                kind = TokenKind::FloatLiteralSimple;
            } else if self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'_' {
                self.consume_invalid_numeric_rest();
                return self.invalid_numeric(start);
            } else if self.pos + 1 >= bytes.len() || bytes[self.pos + 1] != b'.' {
                // A bare trailing dot ("1.") is a float with empty fraction;
                // "1.." leaves the dots for the operator scanner.
                self.pos += 1;
                kind = TokenKind::FloatLiteralSimple;
            }
        }

        // Exponent part ("e"/"E" with optional sign).
        if self.pos < bytes.len() && matches!(bytes[self.pos], b'e' | b'E') {
            if self.try_scan_exponent() {
                kind = TokenKind::FloatLiteral;
            } else if self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'_' {
                self.consume_invalid_numeric_rest();
                return self.invalid_numeric(start);
            }
        }

        // A stray trailing underscore invalidates the whole literal.
        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
            self.consume_invalid_numeric_rest();
            return self.invalid_numeric(start);
        }

        self.tok(kind, start)
    }
936
937 fn scan_digits(&mut self, is_valid: fn(&u8) -> bool) -> bool {
940 let bytes = self.source.as_bytes();
941 if self.pos >= bytes.len() || !is_valid(&bytes[self.pos]) {
942 return false;
943 }
944 self.pos += 1;
945 loop {
946 if self.pos >= bytes.len() {
947 break;
948 }
949 if is_valid(&bytes[self.pos]) {
950 self.pos += 1;
951 } else if bytes[self.pos] == b'_'
952 && self.pos + 1 < bytes.len()
953 && is_valid(&bytes[self.pos + 1])
954 {
955 self.pos += 2;
956 } else {
957 break;
958 }
959 }
960 true
961 }
962
963 fn try_scan_exponent(&mut self) -> bool {
966 let bytes = self.source.as_bytes();
967 let saved = self.pos;
968 self.pos += 1; if self.pos < bytes.len() && matches!(bytes[self.pos], b'+' | b'-') {
972 self.pos += 1;
973 }
974
975 if self.scan_digits(u8::is_ascii_digit) {
977 true
978 } else {
979 self.pos = saved;
980 false
981 }
982 }
983
984 fn scan_identifier(&mut self) -> Token {
987 let start = self.pos;
988 let bytes = self.source.as_bytes();
989 self.pos += 1; while self.pos < bytes.len() && is_ident_continue(bytes[self.pos]) {
991 self.pos += 1;
992 }
993 let text = &self.source[start..self.pos];
994 let kind = resolve_keyword(text).unwrap_or(TokenKind::Identifier);
995 self.tok(kind, start)
996 }
997
998 #[inline]
1004 fn skip_line_comment_body(bytes: &[u8], pos: &mut usize) {
1005 loop {
1006 match memchr2(b'\n', b'?', &bytes[*pos..]) {
1007 None => {
1008 *pos = bytes.len();
1009 return;
1010 }
1011 Some(offset) => {
1012 let p = *pos + offset;
1013 if bytes[p] == b'\n' {
1014 *pos = p; return;
1016 }
1017 if p + 1 < bytes.len() && bytes[p + 1] == b'>' {
1019 *pos = p; return;
1021 }
1022 *pos = p + 1;
1024 }
1025 }
1026 }
1027 }
1028
1029 #[inline]
1030 fn check_at(&self, offset: usize, expected: u8) -> bool {
1031 self.source.as_bytes().get(self.pos + offset) == Some(&expected)
1032 }
1033
1034 #[inline]
1035 fn tok(&self, kind: TokenKind, start: usize) -> Token {
1036 Token::new(kind, Span::new(start as u32, self.pos as u32))
1037 }
1038
1039 fn invalid_numeric(&mut self, start: usize) -> Token {
1040 let span = Span::new(start as u32, self.pos as u32);
1041 self.errors.push(LexerError {
1042 kind: LexerErrorKind::Other,
1043 message: "Invalid numeric literal".to_string(),
1044 span,
1045 });
1046 Token::new(TokenKind::InvalidNumericLiteral, span)
1047 }
1048
1049 fn consume_invalid_numeric_rest(&mut self) {
1051 let bytes = self.source.as_bytes();
1052 while self.pos < bytes.len() {
1053 let b = bytes[self.pos];
1054 if b.is_ascii_alphanumeric() || b == b'_' || b == b'.' || b == b'+' || b == b'-' {
1055 if (b == b'+' || b == b'-') && self.pos > 0 {
1057 let prev = bytes[self.pos - 1];
1058 if prev != b'e' && prev != b'E' {
1059 break;
1060 }
1061 }
1062 self.pos += 1;
1063 } else {
1064 break;
1065 }
1066 }
1067 }
1068
1069 fn try_lex_heredoc(&mut self, remaining: &str) -> Option<Token> {
1073 let trimmed = remaining.trim_start_matches(|c: char| {
1075 c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\x0C'
1076 });
1077 let ws_len = remaining.len() - trimmed.len();
1078
1079 let (after_prefix, prefix_len) = if (trimmed.starts_with("b<<<")
1081 || trimmed.starts_with("B<<<"))
1082 && !trimmed[1..].starts_with("<<<>")
1083 {
1084 (&trimmed[1..], 1)
1085 } else {
1086 (trimmed, 0)
1087 };
1088
1089 if !after_prefix.starts_with("<<<") {
1090 return None;
1091 }
1092
1093 let base_pos = self.pos; let start = base_pos + ws_len; let after_arrows = &after_prefix[3..];
1096 let after_arrows_trimmed = after_arrows.trim_start_matches([' ', '\t']);
1097 let arrows_offset =
1098 ws_len + prefix_len + 3 + (after_arrows.len() - after_arrows_trimmed.len());
1099
1100 let (label, is_nowdoc, label_line_end);
1102 if let Some(after_quote) = after_arrows_trimmed.strip_prefix('\'') {
1103 let closing = after_quote.find('\'')?;
1105 label = &after_quote[..closing];
1106 is_nowdoc = true;
1107 let after_label = &after_arrows_trimmed[2 + closing..];
1108 let nl = after_label.find('\n').unwrap_or(after_label.len());
1110 label_line_end = arrows_offset + 2 + closing + nl;
1111 if label_line_end < remaining.len() {
1112 }
1114 } else {
1115 let s = if let Some(after_dquote) = after_arrows_trimmed.strip_prefix('"') {
1117 let closing = after_dquote.find('"')?;
1118 label = &after_dquote[..closing];
1119 &after_dquote[1 + closing..]
1120 } else {
1121 let end = after_arrows_trimmed
1123 .find(|c: char| !c.is_ascii_alphanumeric() && c != '_')
1124 .unwrap_or(after_arrows_trimmed.len());
1125 if end == 0 {
1126 return None;
1127 }
1128 label = &after_arrows_trimmed[..end];
1129 &after_arrows_trimmed[end..]
1130 };
1131 is_nowdoc = false;
1132 let nl = s.find('\n').unwrap_or(s.len());
1133 label_line_end = arrows_offset + (after_arrows_trimmed.len() - s.len()) + nl;
1134 };
1135
1136 if label.is_empty() {
1137 return None;
1138 }
1139
1140 let body_start_in_remaining = if label_line_end < remaining.len() {
1142 label_line_end + 1 } else {
1144 return None; };
1146
1147 let body = &remaining[body_start_in_remaining..];
1148
1149 let mut search_pos = 0;
1151 let end_marker_pos;
1152 loop {
1153 if search_pos >= body.len() {
1154 return None; }
1156 let line_start = search_pos;
1157 let line_end = body[line_start..]
1158 .find('\n')
1159 .map(|p| line_start + p)
1160 .unwrap_or(body.len());
1161 let line = &body[line_start..line_end];
1162 let trimmed_line = line.trim_start_matches([' ', '\t']);
1163
1164 if trimmed_line == label
1168 || trimmed_line.starts_with(label)
1169 && trimmed_line[label.len()..]
1170 .trim_start_matches([';', ',', ')'])
1171 .trim()
1172 .is_empty()
1173 {
1174 end_marker_pos = line_start;
1175 break;
1176 }
1177
1178 search_pos = if line_end < body.len() {
1179 line_end + 1
1180 } else {
1181 body.len()
1182 };
1183 }
1184
1185 let end_marker_line = &body[end_marker_pos..];
1187 let trimmed = end_marker_line.trim_start_matches([' ', '\t']);
1188 let indent_len = end_marker_line.len() - trimmed.len();
1189 let token_end_in_remaining =
1190 body_start_in_remaining + end_marker_pos + indent_len + label.len();
1191 self.pos = base_pos + token_end_in_remaining;
1192
1193 let span = Span::new(start as u32, self.pos as u32);
1194
1195 if is_nowdoc {
1196 Some(Token::new(TokenKind::Nowdoc, span))
1197 } else {
1198 Some(Token::new(TokenKind::Heredoc, span))
1199 }
1200 }
1201}
1202
1203pub fn lex_all(source: &str) -> (Vec<Token>, Vec<LexerError>) {
1211 if source.len() > u32::MAX as usize {
1212 let error = LexerError {
1213 kind: LexerErrorKind::FileTooLarge,
1214 message: format!(
1215 "source is {} bytes, which exceeds the maximum supported size of {} bytes",
1216 source.len(),
1217 u32::MAX
1218 ),
1219 span: Span::new(0, 0),
1220 };
1221 let eof = Token::eof(0);
1222 return (vec![eof, eof], vec![error]);
1223 }
1224
1225 let mut lexer = Lexer::new(source);
1226 let mut tokens = Vec::new();
1227
1228 loop {
1229 let tok = lexer.next_token();
1230 let is_eof = tok.kind == TokenKind::Eof;
1231 tokens.push(tok);
1232 if is_eof {
1233 break;
1234 }
1235 }
1236
1237 let eof_span = tokens.last().unwrap().span;
1240 tokens.push(Token::new(TokenKind::Eof, eof_span));
1241
1242 let errors = lexer.errors;
1243 (tokens, errors)
1244}
1245
1246#[cfg(test)]
1247mod tests {
1248 use super::*;
1249
1250 fn collect_tokens(source: &str) -> Vec<Token> {
1251 let mut lexer = Lexer::new(source);
1252 let mut tokens = Vec::new();
1253 loop {
1254 let token = lexer.next_token();
1255 if token.kind == TokenKind::Eof {
1256 tokens.push(token);
1257 break;
1258 }
1259 tokens.push(token);
1260 }
1261 tokens
1262 }
1263
1264 fn collect_kinds(source: &str) -> Vec<TokenKind> {
1265 collect_tokens(source).into_iter().map(|t| t.kind).collect()
1266 }
1267
1268 fn php_kinds(code: &str) -> Vec<TokenKind> {
1270 let full = format!("<?php {}", code);
1271 collect_kinds(&full)
1272 .into_iter()
1273 .filter(|k| *k != TokenKind::OpenTag && *k != TokenKind::Eof)
1274 .collect()
1275 }
1276
1277 fn php_tokens(code: &str) -> Vec<(TokenKind, String)> {
1279 let full = format!("<?php {}", code);
1280 let mut lexer = Lexer::new(&full);
1281 let mut result = Vec::new();
1282 loop {
1283 let token = lexer.next_token();
1284 if token.kind == TokenKind::Eof {
1285 break;
1286 }
1287 if token.kind == TokenKind::OpenTag {
1288 continue;
1289 }
1290 let text = lexer.token_text(&token).to_string();
1291 result.push((token.kind, text));
1292 }
1293 result
1294 }
1295
    /// Open-tag detection (`<?php` any case, `<?=`) and inline-HTML tokens.
    mod open_tag_and_html {
        use super::*;

        #[test]
        fn test_php_only() {
            let tokens = collect_kinds("<?php $x = 42;");
            assert_eq!(
                tokens,
                vec![
                    TokenKind::OpenTag,
                    TokenKind::Variable,
                    TokenKind::Equals,
                    TokenKind::IntLiteral,
                    TokenKind::Semicolon,
                    TokenKind::Eof,
                ]
            );
        }

        #[test]
        fn test_inline_html_before_php() {
            let tokens = collect_kinds("<html><?php echo 1;");
            assert_eq!(
                tokens,
                vec![
                    TokenKind::InlineHtml,
                    TokenKind::OpenTag,
                    TokenKind::Echo,
                    TokenKind::IntLiteral,
                    TokenKind::Semicolon,
                    TokenKind::Eof,
                ]
            );
        }

        #[test]
        fn test_inline_html_after_close_tag() {
            let tokens = collect_kinds("<?php echo 1; ?><html>");
            assert_eq!(
                tokens,
                vec![
                    TokenKind::OpenTag,
                    TokenKind::Echo,
                    TokenKind::IntLiteral,
                    TokenKind::Semicolon,
                    TokenKind::CloseTag,
                    TokenKind::InlineHtml,
                    TokenKind::Eof,
                ]
            );
        }

        #[test]
        fn test_empty_source() {
            let tokens = collect_kinds("");
            assert_eq!(tokens, vec![TokenKind::Eof]);
        }

        #[test]
        fn test_only_inline_html() {
            let tokens = collect_kinds("<html><body>Hello</body></html>");
            assert_eq!(tokens, vec![TokenKind::InlineHtml, TokenKind::Eof]);
        }

        #[test]
        fn test_open_tag_uppercase() {
            // The "php" of the long open tag matches case-insensitively.
            for tag in &["<?PHP", "<?Php", "<?PhP", "<?pHP", "<?phP"] {
                let src = format!("{} $x = 1;", tag);
                let tokens = collect_kinds(&src);
                assert_eq!(
                    tokens[0],
                    TokenKind::OpenTag,
                    "expected OpenTag for opening tag '{tag}'"
                );
            }
        }

        #[test]
        fn test_open_tag_uppercase_mid_file() {
            let tokens = collect_kinds("<html><?PHP echo 1;");
            assert_eq!(
                tokens,
                vec![
                    TokenKind::InlineHtml,
                    TokenKind::OpenTag,
                    TokenKind::Echo,
                    TokenKind::IntLiteral,
                    TokenKind::Semicolon,
                    TokenKind::Eof,
                ]
            );
        }
    }
1391
1392 mod keywords {
1393 use super::*;
1394
1395 #[test]
1396 fn test_keyword_resolution() {
1397 let tokens = collect_kinds("<?php if else while for foreach function return");
1398 assert_eq!(
1399 tokens,
1400 vec![
1401 TokenKind::OpenTag,
1402 TokenKind::If,
1403 TokenKind::Else,
1404 TokenKind::While,
1405 TokenKind::For,
1406 TokenKind::Foreach,
1407 TokenKind::Function,
1408 TokenKind::Return,
1409 TokenKind::Eof,
1410 ]
1411 );
1412 }
1413
1414 #[test]
1415 fn test_keyword_case_insensitive() {
1416 let tokens = collect_kinds("<?php IF ELSE TRUE FALSE NULL");
1417 assert_eq!(
1418 tokens,
1419 vec![
1420 TokenKind::OpenTag,
1421 TokenKind::If,
1422 TokenKind::Else,
1423 TokenKind::True,
1424 TokenKind::False,
1425 TokenKind::Null,
1426 TokenKind::Eof,
1427 ]
1428 );
1429 }
1430
1431 #[test]
1432 fn test_logical_keywords() {
1433 let tokens = collect_kinds("<?php and or xor");
1434 assert_eq!(
1435 tokens,
1436 vec![
1437 TokenKind::OpenTag,
1438 TokenKind::And,
1439 TokenKind::Or,
1440 TokenKind::Xor,
1441 TokenKind::Eof,
1442 ]
1443 );
1444 }
1445 }
1446
1447 mod lexer_api {
1448 use super::*;
1449
1450 #[test]
1451 fn test_peek_doesnt_consume() {
1452 let mut lexer = Lexer::new("<?php 42");
1453 let peeked = *lexer.peek();
1454 assert_eq!(peeked.kind, TokenKind::OpenTag);
1455 let next = lexer.next_token();
1456 assert_eq!(next.kind, TokenKind::OpenTag);
1457 let next = lexer.next_token();
1458 assert_eq!(next.kind, TokenKind::IntLiteral);
1459 }
1460
1461 #[test]
1462 fn test_token_text() {
1463 let source = "<?php $myVar = 'hello';";
1464 let mut lexer = Lexer::new(source);
1465 lexer.next_token(); let var_tok = lexer.next_token();
1467 assert_eq!(lexer.token_text(&var_tok), "$myVar");
1468 lexer.next_token(); let str_tok = lexer.next_token();
1470 assert_eq!(lexer.token_text(&str_tok), "'hello'");
1471 }
1472
1473 #[test]
1474 fn test_spans_are_correct() {
1475 let source = "<?php $x";
1476 let tokens = collect_tokens(source);
1477 assert_eq!(tokens[0].span, Span::new(0, 5)); assert_eq!(tokens[1].span, Span::new(6, 8)); }
1480 }
1481
1482 mod operators {
1483 use super::*;
1484
1485 #[test]
1486 fn test_basic_operators() {
1487 assert_eq!(
1488 php_kinds("+ - * / % ** ."),
1489 vec![
1490 TokenKind::Plus,
1491 TokenKind::Minus,
1492 TokenKind::Star,
1493 TokenKind::Slash,
1494 TokenKind::Percent,
1495 TokenKind::StarStar,
1496 TokenKind::Dot,
1497 ]
1498 );
1499 }
1500
1501 #[test]
1502 fn test_operators() {
1503 let tokens = collect_kinds("<?php === !== <=> ?? ++ -- **");
1504 assert_eq!(
1505 tokens,
1506 vec![
1507 TokenKind::OpenTag,
1508 TokenKind::EqualsEqualsEquals,
1509 TokenKind::BangEqualsEquals,
1510 TokenKind::Spaceship,
1511 TokenKind::QuestionQuestion,
1512 TokenKind::PlusPlus,
1513 TokenKind::MinusMinus,
1514 TokenKind::StarStar,
1515 TokenKind::Eof,
1516 ]
1517 );
1518 }
1519
1520 #[test]
1521 fn test_assignment_operators() {
1522 let tokens = collect_kinds("<?php += -= *= /= %= **= .= ??=");
1523 assert_eq!(
1524 tokens,
1525 vec![
1526 TokenKind::OpenTag,
1527 TokenKind::PlusEquals,
1528 TokenKind::MinusEquals,
1529 TokenKind::StarEquals,
1530 TokenKind::SlashEquals,
1531 TokenKind::PercentEquals,
1532 TokenKind::StarStarEquals,
1533 TokenKind::DotEquals,
1534 TokenKind::CoalesceEquals,
1535 TokenKind::Eof,
1536 ]
1537 );
1538 }
1539
1540 #[test]
1541 fn test_hash_bracket_not_comment() {
1542 let kinds = php_kinds("#[Attribute]");
1543 assert_eq!(
1544 kinds,
1545 vec![
1546 TokenKind::HashBracket,
1547 TokenKind::Identifier,
1548 TokenKind::RightBracket,
1549 ]
1550 );
1551 }
1552
1553 #[test]
1554 fn test_nullsafe_arrow() {
1555 let kinds = php_kinds("$x?->y");
1556 assert_eq!(
1557 kinds,
1558 vec![
1559 TokenKind::Variable,
1560 TokenKind::NullsafeArrow,
1561 TokenKind::Identifier,
1562 ]
1563 );
1564 }
1565
1566 #[test]
1567 fn test_pipe_arrow() {
1568 let kinds = php_kinds("$x |> foo(...)");
1569 assert_eq!(
1570 kinds,
1571 vec![
1572 TokenKind::Variable,
1573 TokenKind::PipeArrow,
1574 TokenKind::Identifier,
1575 TokenKind::LeftParen,
1576 TokenKind::Ellipsis,
1577 TokenKind::RightParen,
1578 ]
1579 );
1580 }
1581 }
1582
1583 mod numeric_literals {
1584 use super::*;
1585
1586 #[test]
1587 fn test_integers() {
1588 let toks = php_tokens("42 0xFF 0b1010 077");
1589 assert_eq!(toks[0], (TokenKind::IntLiteral, "42".to_string()));
1590 assert_eq!(toks[1], (TokenKind::HexIntLiteral, "0xFF".to_string()));
1591 assert_eq!(toks[2], (TokenKind::BinIntLiteral, "0b1010".to_string()));
1592 assert_eq!(toks[3], (TokenKind::OctIntLiteral, "077".to_string()));
1593 }
1594
1595 #[test]
1596 fn test_floats() {
1597 let toks = php_tokens("3.14 1e10 2.5e-3");
1598 assert_eq!(toks[0], (TokenKind::FloatLiteralSimple, "3.14".to_string()));
1599 assert_eq!(toks[1], (TokenKind::FloatLiteral, "1e10".to_string()));
1600 assert_eq!(toks[2], (TokenKind::FloatLiteral, "2.5e-3".to_string()));
1601 }
1602
1603 #[test]
1604 fn test_float_leading_dot() {
1605 let toks = php_tokens(".5 .123e4");
1606 assert_eq!(
1607 toks[0],
1608 (TokenKind::FloatLiteralLeadingDot, ".5".to_string())
1609 );
1610 assert_eq!(
1611 toks[1],
1612 (TokenKind::FloatLiteralLeadingDot, ".123e4".to_string())
1613 );
1614 }
1615
1616 #[test]
1617 fn test_trailing_dot_float() {
1618 let toks = php_tokens("0. 1. 42.");
1620 assert_eq!(toks[0], (TokenKind::FloatLiteralSimple, "0.".to_string()));
1621 assert_eq!(toks[1], (TokenKind::FloatLiteralSimple, "1.".to_string()));
1622 assert_eq!(toks[2], (TokenKind::FloatLiteralSimple, "42.".to_string()));
1623 }
1624
1625 #[test]
1626 fn test_trailing_dot_not_confused_with_dotdot() {
1627 let toks = php_tokens("1..");
1630 assert_eq!(toks[0], (TokenKind::IntLiteral, "1".to_string()));
1631 assert_eq!(toks[1], (TokenKind::Dot, ".".to_string()));
1632 assert_eq!(toks[2], (TokenKind::Dot, ".".to_string()));
1633 }
1634
1635 #[test]
1636 fn test_new_octal_syntax() {
1637 let toks = php_tokens("0o77 0O755");
1638 assert_eq!(toks[0], (TokenKind::OctIntLiteralNew, "0o77".to_string()));
1639 assert_eq!(toks[1], (TokenKind::OctIntLiteralNew, "0O755".to_string()));
1640 }
1641
1642 #[test]
1643 fn test_legacy_octal_with_invalid_digits() {
1644 let toks = php_tokens("0778 019 09");
1647 assert_eq!(toks[0], (TokenKind::OctIntLiteral, "0778".to_string()));
1648 assert_eq!(toks[1], (TokenKind::OctIntLiteral, "019".to_string()));
1649 assert_eq!(toks[2], (TokenKind::OctIntLiteral, "09".to_string()));
1650 }
1651
1652 #[test]
1653 fn test_numeric_underscores() {
1654 let toks = php_tokens("1_000 0xFF_FF 0b1010_0101");
1655 assert_eq!(toks[0], (TokenKind::IntLiteral, "1_000".to_string()));
1656 assert_eq!(toks[1], (TokenKind::HexIntLiteral, "0xFF_FF".to_string()));
1657 assert_eq!(
1658 toks[2],
1659 (TokenKind::BinIntLiteral, "0b1010_0101".to_string())
1660 );
1661 }
1662 }
1663
1664 mod strings_and_variables {
1665 use super::*;
1666
1667 #[test]
1668 fn test_string_literals() {
1669 let tokens = collect_kinds(r#"<?php 'single' "double""#);
1670 assert_eq!(
1671 tokens,
1672 vec![
1673 TokenKind::OpenTag,
1674 TokenKind::SingleQuotedString,
1675 TokenKind::DoubleQuotedString,
1676 TokenKind::Eof,
1677 ]
1678 );
1679 }
1680
1681 #[test]
1682 fn test_strings() {
1683 let kinds = php_kinds(r#"'hello' "world" 'it\'s' "say \"hi\"""#);
1684 assert_eq!(
1685 kinds,
1686 vec![
1687 TokenKind::SingleQuotedString,
1688 TokenKind::DoubleQuotedString,
1689 TokenKind::SingleQuotedString,
1690 TokenKind::DoubleQuotedString,
1691 ]
1692 );
1693 }
1694
1695 #[test]
1696 fn test_binary_prefix_strings() {
1697 let kinds = php_kinds(r#"b'hello' B"world""#);
1698 assert_eq!(
1699 kinds,
1700 vec![TokenKind::SingleQuotedString, TokenKind::DoubleQuotedString,]
1701 );
1702 }
1703
1704 #[test]
1705 fn test_variables() {
1706 let toks = php_tokens("$x $myVar $_foo");
1707 assert_eq!(toks[0], (TokenKind::Variable, "$x".to_string()));
1708 assert_eq!(toks[1], (TokenKind::Variable, "$myVar".to_string()));
1709 assert_eq!(toks[2], (TokenKind::Variable, "$_foo".to_string()));
1710 }
1711
1712 #[test]
1713 fn test_comments_yielded() {
1714 let toks = php_tokens("42 // line comment\n43 /* block */ 44 # hash comment\n45");
1716 assert_eq!(toks[0], (TokenKind::IntLiteral, "42".to_string()));
1717 assert_eq!(
1718 toks[1],
1719 (TokenKind::LineComment, "// line comment".to_string())
1720 );
1721 assert_eq!(toks[2], (TokenKind::IntLiteral, "43".to_string()));
1722 assert_eq!(
1723 toks[3],
1724 (TokenKind::BlockComment, "/* block */".to_string())
1725 );
1726 assert_eq!(toks[4], (TokenKind::IntLiteral, "44".to_string()));
1727 assert_eq!(
1728 toks[5],
1729 (TokenKind::HashComment, "# hash comment".to_string())
1730 );
1731 assert_eq!(toks[6], (TokenKind::IntLiteral, "45".to_string()));
1732 }
1733 }
1734}