1use memchr::{memchr2, memmem};
2use php_ast::Span;
3
4use crate::token::{resolve_keyword, TokenKind};
5
/// Builds a 256-entry lookup table marking PHP whitespace bytes:
/// space, tab, carriage return, line feed, and form feed (0x0C).
const fn make_whitespace_table() -> [bool; 256] {
    let mut table = [false; 256];
    table[b' ' as usize] = true;
    table[b'\t' as usize] = true;
    table[b'\n' as usize] = true;
    table[b'\r' as usize] = true;
    // Form feed has no byte-escape literal in Rust; use the code point.
    table[0x0C] = true;
    table
}
22
/// Builds a 256-entry lookup table for identifier-start bytes: ASCII
/// letters, underscore, and every byte >= 0x80 (PHP permits arbitrary
/// high bytes — i.e. UTF-8 sequences — in identifiers).
const fn make_ident_start_table() -> [bool; 256] {
    let mut table = [false; 256];
    let mut i = 0usize;
    while i < 256 {
        let b = i as u8;
        table[i] = b.is_ascii_alphabetic() || b == b'_' || b >= 0x80;
        i += 1;
    }
    table
}
33
/// Builds a 256-entry lookup table for identifier-continuation bytes:
/// the identifier-start set plus ASCII digits.
const fn make_ident_continue_table() -> [bool; 256] {
    let mut table = [false; 256];
    let mut i = 0usize;
    while i < 256 {
        let b = i as u8;
        table[i] = b.is_ascii_alphanumeric() || b == b'_' || b >= 0x80;
        i += 1;
    }
    table
}
48
/// Per-byte lookup: true for PHP whitespace (space, \t, \r, \n, form feed).
static IS_PHP_WHITESPACE: [bool; 256] = make_whitespace_table();
/// Per-byte lookup: true for bytes that may begin an identifier.
static IS_IDENT_START: [bool; 256] = make_ident_start_table();
/// Per-byte lookup: true for bytes that may continue an identifier.
static IS_IDENT_CONTINUE: [bool; 256] = make_ident_continue_table();
52
/// Coarse classification of errors produced while lexing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexerErrorKind {
    /// A quoted or backtick string ran past the end of the input.
    UnterminatedString,
    /// The source exceeds the `u32::MAX` byte span limit (see `lex_all`).
    FileTooLarge,
    /// Any other lexical problem (e.g. unterminated block comment,
    /// invalid numeric literal).
    Other,
}
63
/// An error recorded during lexing, with a human-readable message and the
/// byte span of the offending source region.
#[derive(Debug, Clone, PartialEq)]
pub struct LexerError {
    pub kind: LexerErrorKind,
    pub message: String,
    pub span: Span,
}
70
/// A single lexed token: its kind plus the byte span it covers in the source.
/// Tokens carry no text; use `Lexer::token_text` to recover the slice.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}
76
77impl Token {
78 pub fn new(kind: TokenKind, span: Span) -> Self {
79 Self { kind, span }
80 }
81
82 pub fn eof(offset: u32) -> Self {
83 Self {
84 kind: TokenKind::Eof,
85 span: Span::new(offset, offset),
86 }
87 }
88}
89
/// Which sub-language the lexer is currently scanning: raw HTML outside
/// PHP tags, or PHP code between `<?php`/`<?=` and `?>`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LexerMode {
    InlineHtml,
    Php,
}
95
/// A PHP lexer over a borrowed source string.
pub struct Lexer<'src> {
    // Full source text; all spans index into this.
    source: &'src str,
    // Current sub-language (inline HTML vs PHP code).
    mode: LexerMode,
    // Byte offset of the next unread character.
    pos: usize,
    // One- and two-token lookahead buffers for `peek`/`peek2`.
    peeked: Option<Token>,
    peeked2: Option<Token>,
    // Errors accumulated while scanning; lexing always continues.
    pub errors: Vec<LexerError>,
}
104
105#[inline(always)]
106fn is_ident_start(b: u8) -> bool {
107 IS_IDENT_START[b as usize]
108}
109
110#[inline(always)]
111fn is_ident_continue(b: u8) -> bool {
112 IS_IDENT_CONTINUE[b as usize]
113}
114
/// Skips a `{$...}` complex interpolation inside a double-quoted string,
/// backtick string, or heredoc body. `p` must point at the opening `{`;
/// returns the position just past the matching `}`, or `bytes.len()` if
/// the braces never balance.
///
/// Brace depth is tracked so nested `{ ... }` inside the expression are
/// handled; backslash escapes and nested string literals are skipped whole
/// so that braces inside them do not disturb the depth count.
fn skip_complex_interp(bytes: &[u8], mut p: usize) -> usize {
    debug_assert!(bytes.get(p) == Some(&b'{'));
    let mut depth = 0i32;
    while p < bytes.len() {
        match bytes[p] {
            b'{' => {
                depth += 1;
                p += 1;
            }
            b'}' => {
                depth -= 1;
                p += 1;
                // Depth back to zero: this `}` matches the opening `{`.
                if depth == 0 {
                    return p;
                }
            }
            b'\\' => {
                // Skip the backslash and the escaped byte (if any remains).
                p += 1;
                if p < bytes.len() {
                    p += 1;
                }
            }
            // Nested string literals may contain unbalanced braces.
            b'"' => p = skip_nested_dquoted(bytes, p),
            b'\'' => p = skip_nested_squoted(bytes, p),
            _ => p += 1,
        }
    }
    p
}
149
/// Skips a double-quoted string nested inside a `{$...}` interpolation.
/// `p` must point at the opening `"`; returns the position just past the
/// closing quote, or `bytes.len()` if unterminated. Recurses back into
/// `skip_complex_interp` for any `{$` interpolation inside the string.
fn skip_nested_dquoted(bytes: &[u8], mut p: usize) -> usize {
    debug_assert!(bytes.get(p) == Some(&b'"'));
    p += 1;
    while p < bytes.len() {
        match bytes[p] {
            b'\\' => {
                // Skip the backslash and the escaped byte (if any remains).
                p += 1;
                if p < bytes.len() {
                    p += 1;
                }
            }
            b'"' => return p + 1,
            // Nested complex interpolation: skip it as a unit.
            b'{' if bytes.get(p + 1) == Some(&b'$') => p = skip_complex_interp(bytes, p),
            _ => p += 1,
        }
    }
    p
}
168
/// Skips a single-quoted string nested inside a `{$...}` interpolation.
/// `p` must point at the opening `'`; returns the position just past the
/// closing quote, or `bytes.len()` if the string never terminates.
fn skip_nested_squoted(bytes: &[u8], mut p: usize) -> usize {
    debug_assert!(bytes.get(p) == Some(&b'\''));
    p += 1;
    loop {
        match bytes.get(p) {
            // Ran off the end: unterminated; report the end position.
            None => return p,
            Some(&b'\\') => {
                // Backslash escapes the following byte (if any remains).
                p += 1;
                if p < bytes.len() {
                    p += 1;
                }
            }
            Some(&b'\'') => return p + 1,
            Some(_) => p += 1,
        }
    }
}
186
187impl<'src> Lexer<'src> {
188 pub fn new(source: &'src str) -> Self {
189 debug_assert!(
190 source.len() <= u32::MAX as usize,
191 "source is {} bytes, which exceeds the u32::MAX span limit",
192 source.len()
193 );
194
195 let pos = if source.starts_with("#!") {
197 source.find('\n').map(|p| p + 1).unwrap_or(source.len())
198 } else {
199 0
200 };
201
202 let remaining = &source[pos..];
204 let rem_bytes = remaining.as_bytes();
205 let mode = if (rem_bytes.len() >= 5
206 && rem_bytes[0] == b'<'
207 && rem_bytes[1] == b'?'
208 && rem_bytes[2..5].eq_ignore_ascii_case(b"php"))
209 || remaining.starts_with("<?=")
210 {
211 LexerMode::Php
212 } else {
213 LexerMode::InlineHtml
214 };
215
216 Self {
217 source,
218 mode,
219 pos,
220 peeked: None,
221 peeked2: None,
222 errors: Vec::new(),
223 }
224 }
225
    /// Creates a lexer that starts directly in PHP mode at byte `offset`,
    /// with no open-tag or shebang detection. Useful for re-lexing a region
    /// of an already-scanned source (e.g. the interior of a string).
    pub fn new_at(source: &'src str, offset: usize) -> Self {
        debug_assert!(
            source.len() <= u32::MAX as usize,
            "source is {} bytes, which exceeds the u32::MAX span limit",
            source.len()
        );

        Self {
            source,
            mode: LexerMode::Php,
            pos: offset,
            peeked: None,
            peeked2: None,
            errors: Vec::new(),
        }
    }
246
    /// Returns the full source text this lexer was created over.
    pub fn source(&self) -> &'src str {
        self.source
    }
250
251 pub fn peek(&mut self) -> &Token {
252 if self.peeked.is_none() {
253 self.peeked = Some(self.read_next_token());
254 }
255 self.peeked.as_ref().expect("peeked is Some: set above")
256 }
257
258 pub fn peek2(&mut self) -> &Token {
260 if self.peeked.is_none() {
262 self.peeked = Some(self.read_next_token());
263 }
264 if self.peeked2.is_none() {
265 self.peeked2 = Some(self.read_next_token());
266 }
267 self.peeked2.as_ref().expect("peeked2 is Some: set above")
268 }
269
270 pub fn next_token(&mut self) -> Token {
271 if let Some(token) = self.peeked.take() {
272 self.peeked = self.peeked2.take();
273 return token;
274 }
275 self.read_next_token()
276 }
277
278 pub fn token_text(&self, token: &Token) -> &'src str {
280 &self.source[token.span.start as usize..token.span.end as usize]
281 }
282
283 fn read_next_token(&mut self) -> Token {
284 if self.pos >= self.source.len() {
285 return Token::eof(self.source.len() as u32);
286 }
287
288 match self.mode {
289 LexerMode::InlineHtml => self.lex_inline_html(),
290 LexerMode::Php => self.lex_php(),
291 }
292 }
293
    /// Scans raw HTML up to the next PHP open tag (`<?php`, case-insensitive,
    /// or `<?=`), returning it as one InlineHtml token. Switches to PHP mode
    /// when a tag is found; if the tag is at the current position, no HTML
    /// token is produced and the PHP token is lexed directly.
    fn lex_inline_html(&mut self) -> Token {
        let start = self.pos;
        let bytes = self.source.as_bytes();

        // Scan forward over every '<' until one begins a real open tag.
        let mut search = self.pos;
        let tag_pos = loop {
            match memchr::memchr(b'<', &bytes[search..]) {
                None => break None,
                Some(offset) => {
                    let p = search + offset;
                    let rest = &bytes[p..];
                    if (rest.len() >= 5
                        && rest[0] == b'<'
                        && rest[1] == b'?'
                        && rest[2..5].eq_ignore_ascii_case(b"php"))
                        || rest.starts_with(b"<?=")
                    {
                        // Offset of the tag relative to the current position.
                        break Some(p - self.pos);
                    }
                    search = p + 1;
                }
            }
        };

        if let Some(tag_pos) = tag_pos {
            if tag_pos == 0 {
                // Tag starts right here: emit the PHP token instead of a
                // zero-length HTML token.
                self.mode = LexerMode::Php;
                return self.lex_php();
            }
            let end = self.pos + tag_pos;
            self.pos = end;
            self.mode = LexerMode::Php;
            Token::new(TokenKind::InlineHtml, Span::new(start as u32, end as u32))
        } else {
            // No tag: the rest of the file is inline HTML.
            let end = self.source.len();
            self.pos = end;
            Token::new(TokenKind::InlineHtml, Span::new(start as u32, end as u32))
        }
    }
339
    /// Lexes one token in PHP mode: heredoc/nowdoc first (its own leading
    /// whitespace handling), then whitespace, then the three comment forms,
    /// then everything else via `scan_token`.
    fn lex_php(&mut self) -> Token {
        let remaining = &self.source[self.pos..];

        // Heredocs are tried before whitespace skipping because
        // `try_lex_heredoc` accounts for leading whitespace itself.
        if let Some(token) = self.try_lex_heredoc(remaining) {
            return token;
        }

        self.skip_whitespace();

        if self.pos >= self.source.len() {
            return Token::eof(self.source.len() as u32);
        }

        let bytes = self.source.as_bytes();
        let start = self.pos;

        // `//` line comment (terminated by newline or `?>`).
        if bytes[self.pos] == b'/' && self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'/' {
            self.pos += 2;
            Self::skip_line_comment_body(bytes, &mut self.pos);
            return self.tok(TokenKind::LineComment, start);
        }

        // `/* ... */` block comment; `/**` (but not `/**/`) is a doc comment.
        if bytes[self.pos] == b'/' && self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'*' {
            self.pos += 2;
            let kind = if self.pos < bytes.len()
                && bytes[self.pos] == b'*'
                && !(self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'/')
            {
                TokenKind::DocComment
            } else {
                TokenKind::BlockComment
            };
            match memmem::find(&bytes[self.pos..], b"*/") {
                Some(end) => self.pos += end + 2,
                None => {
                    // Unterminated: report, then consume the rest of input.
                    let span = Span::new(start as u32, self.source.len() as u32);
                    self.errors.push(LexerError {
                        kind: LexerErrorKind::Other,
                        message: "unterminated block comment".to_string(),
                        span,
                    });
                    self.pos = bytes.len();
                }
            }
            return self.tok(kind, start);
        }

        // `#` line comment — but `#[` starts an attribute, handled later.
        if bytes[self.pos] == b'#' && !(self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'[') {
            self.pos += 1;
            Self::skip_line_comment_body(bytes, &mut self.pos);
            return self.tok(TokenKind::HashComment, start);
        }

        self.scan_token()
    }
404
405 fn skip_whitespace(&mut self) {
407 let bytes = self.source.as_bytes();
408 while self.pos < bytes.len() && IS_PHP_WHITESPACE[bytes[self.pos] as usize] {
409 self.pos += 1;
410 }
411 }
412
    /// Scans one non-comment PHP token, dispatching on the first byte.
    /// Multi-character operators are matched longest-first via `check_at`
    /// lookahead; each arm sets `self.pos` past the token before building it.
    fn scan_token(&mut self) -> Token {
        let start = self.pos;
        let bytes = self.source.as_bytes();
        let b = bytes[start];

        match b {
            // Arithmetic / compound-assignment operators.
            b'+' => {
                if self.check_at(1, b'+') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PlusPlus, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PlusEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Plus, start)
                }
            }
            b'-' => {
                if self.check_at(1, b'-') {
                    self.pos = start + 2;
                    self.tok(TokenKind::MinusMinus, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::MinusEquals, start)
                } else if self.check_at(1, b'>') {
                    self.pos = start + 2;
                    self.tok(TokenKind::Arrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Minus, start)
                }
            }
            b'*' => {
                if self.check_at(1, b'*') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::StarStarEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::StarStar, start)
                    }
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::StarEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Star, start)
                }
            }
            b'/' => {
                // Comments starting with `/` were consumed in `lex_php`.
                if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::SlashEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Slash, start)
                }
            }
            b'%' => {
                if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PercentEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Percent, start)
                }
            }
            b'.' => {
                // `.5`-style float literal (leading dot followed by a digit).
                if start + 1 < bytes.len() && bytes[start + 1].is_ascii_digit() {
                    self.pos = start + 1;
                    self.scan_digits(u8::is_ascii_digit);
                    if self.pos < bytes.len() && matches!(bytes[self.pos], b'e' | b'E') {
                        self.try_scan_exponent();
                    }
                    // A trailing `_` (not between digits) makes it invalid.
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    return self.tok(TokenKind::FloatLiteralLeadingDot, start);
                }
                if self.check_at(1, b'.') && self.check_at(2, b'.') {
                    self.pos = start + 3;
                    self.tok(TokenKind::Ellipsis, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::DotEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Dot, start)
                }
            }
            b'=' => {
                if self.check_at(1, b'=') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::EqualsEqualsEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::EqualsEquals, start)
                    }
                } else if self.check_at(1, b'>') {
                    self.pos = start + 2;
                    self.tok(TokenKind::FatArrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Equals, start)
                }
            }
            b'!' => {
                if self.check_at(1, b'=') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::BangEqualsEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::BangEquals, start)
                    }
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Bang, start)
                }
            }
            // `<` has heredoc and open-tag cases; handled separately.
            b'<' => self.scan_less_than(start),
            b'>' => {
                if self.check_at(1, b'>') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::ShiftRightEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::ShiftRight, start)
                    }
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::GreaterThanEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::GreaterThan, start)
                }
            }
            b'&' => {
                if self.check_at(1, b'&') {
                    self.pos = start + 2;
                    self.tok(TokenKind::AmpersandAmpersand, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::AmpersandEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Ampersand, start)
                }
            }
            b'|' => {
                if self.check_at(1, b'|') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PipePipe, start)
                } else if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PipeEquals, start)
                } else if self.check_at(1, b'>') {
                    self.pos = start + 2;
                    self.tok(TokenKind::PipeArrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Pipe, start)
                }
            }
            b'^' => {
                if self.check_at(1, b'=') {
                    self.pos = start + 2;
                    self.tok(TokenKind::CaretEquals, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Caret, start)
                }
            }
            b'~' => {
                self.pos = start + 1;
                self.tok(TokenKind::Tilde, start)
            }
            b'?' => {
                if self.check_at(1, b'>') {
                    // `?>` closes PHP mode; subsequent text is inline HTML.
                    self.pos = start + 2;
                    self.mode = LexerMode::InlineHtml;
                    self.tok(TokenKind::CloseTag, start)
                } else if self.check_at(1, b'?') {
                    if self.check_at(2, b'=') {
                        self.pos = start + 3;
                        self.tok(TokenKind::CoalesceEquals, start)
                    } else {
                        self.pos = start + 2;
                        self.tok(TokenKind::QuestionQuestion, start)
                    }
                } else if self.check_at(1, b'-') && self.check_at(2, b'>') {
                    self.pos = start + 3;
                    self.tok(TokenKind::NullsafeArrow, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Question, start)
                }
            }
            b':' => {
                if self.check_at(1, b':') {
                    self.pos = start + 2;
                    self.tok(TokenKind::DoubleColon, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Colon, start)
                }
            }
            b'@' => {
                self.pos = start + 1;
                self.tok(TokenKind::At, start)
            }
            b'\\' => {
                self.pos = start + 1;
                self.tok(TokenKind::Backslash, start)
            }
            b'#' => {
                // `#[` attribute opener; plain `#` comments were consumed in
                // `lex_php`, so the else branch is a defensive skip.
                if self.check_at(1, b'[') {
                    self.pos = start + 2;
                    self.tok(TokenKind::HashBracket, start)
                } else {
                    self.pos = start + 1;
                    self.read_next_token()
                }
            }

            // Single-character delimiters.
            b'(' => {
                self.pos = start + 1;
                self.tok(TokenKind::LeftParen, start)
            }
            b')' => {
                self.pos = start + 1;
                self.tok(TokenKind::RightParen, start)
            }
            b'[' => {
                self.pos = start + 1;
                self.tok(TokenKind::LeftBracket, start)
            }
            b']' => {
                self.pos = start + 1;
                self.tok(TokenKind::RightBracket, start)
            }
            b'{' => {
                self.pos = start + 1;
                self.tok(TokenKind::LeftBrace, start)
            }
            b'}' => {
                self.pos = start + 1;
                self.tok(TokenKind::RightBrace, start)
            }
            b';' => {
                self.pos = start + 1;
                self.tok(TokenKind::Semicolon, start)
            }
            b',' => {
                self.pos = start + 1;
                self.tok(TokenKind::Comma, start)
            }

            // String literals.
            b'\'' => self.scan_single_quoted_string(),
            b'"' => self.scan_double_quoted_string(),
            b'`' => self.scan_backtick_string(),

            b'$' => {
                // `$name` variable; bare `$` (e.g. `$$var`) is its own token.
                if start + 1 < bytes.len() && is_ident_start(bytes[start + 1]) {
                    self.pos = start + 2;
                    while self.pos < bytes.len() && is_ident_continue(bytes[self.pos]) {
                        self.pos += 1;
                    }
                    self.tok(TokenKind::Variable, start)
                } else {
                    self.pos = start + 1;
                    self.tok(TokenKind::Dollar, start)
                }
            }

            b'0'..=b'9' => self.scan_number(),

            _ if is_ident_start(b) => {
                // `b'...'`, `b"..."`, `b<<<` binary-string prefixes.
                if b == b'b' || b == b'B' {
                    if self.check_at(1, b'\'') {
                        return self.scan_single_quoted_string();
                    }
                    if self.check_at(1, b'"') {
                        return self.scan_double_quoted_string();
                    }
                    if self.check_at(1, b'<') && self.check_at(2, b'<') && self.check_at(3, b'<') {
                        let remaining = &self.source[self.pos..];
                        if let Some(token) = self.try_lex_heredoc(remaining) {
                            return token;
                        }
                    }
                }
                self.scan_identifier()
            }

            _ => {
                // Unknown byte: skip it silently and lex the next token.
                self.pos = start + 1;
                self.read_next_token()
            }
        }
    }
734
    /// Scans tokens beginning with `<`: heredoc openers (`<<<`), shifts,
    /// comparisons, the `<>` not-equal alias, and open tags.
    fn scan_less_than(&mut self, start: usize) -> Token {
        if self.check_at(1, b'<') {
            if self.check_at(2, b'<') {
                // `<<<` may open a heredoc/nowdoc; fall through to shifts
                // if the label fails to parse.
                let remaining = &self.source[self.pos..];
                if let Some(token) = self.try_lex_heredoc(remaining) {
                    return token;
                }
            }
            if self.check_at(2, b'=') {
                self.pos = start + 3;
                return self.tok(TokenKind::ShiftLeftEquals, start);
            }
            self.pos = start + 2;
            return self.tok(TokenKind::ShiftLeft, start);
        }
        if self.check_at(1, b'=') {
            if self.check_at(2, b'>') {
                self.pos = start + 3;
                return self.tok(TokenKind::Spaceship, start);
            }
            self.pos = start + 2;
            return self.tok(TokenKind::LessThanEquals, start);
        }
        if self.check_at(1, b'>') {
            // PHP's `<>` is an alias for `!=`.
            self.pos = start + 2;
            return self.tok(TokenKind::BangEquals, start);
        }
        if self.check_at(1, b'?') {
            let bytes = self.source.as_bytes();
            // `<?php` (case-insensitive) or the `<?=` echo tag.
            if bytes.len() >= self.pos + 5
                && bytes[self.pos + 2..self.pos + 5].eq_ignore_ascii_case(b"php")
            {
                self.pos = start + 5;
                return self.tok(TokenKind::OpenTag, start);
            }
            if self.source[self.pos..].starts_with("<?=") {
                self.pos = start + 3;
                return self.tok(TokenKind::OpenTag, start);
            }
        }
        self.pos = start + 1;
        self.tok(TokenKind::LessThan, start)
    }
782
    /// Scans a single-quoted string, including an optional `b`/`B` binary
    /// prefix. Uses `memchr2` to jump between backslashes and quotes;
    /// backslash always consumes the following byte. Reports an error and
    /// consumes to end of input if unterminated.
    fn scan_single_quoted_string(&mut self) -> Token {
        let start = self.pos;
        let bytes = self.source.as_bytes();
        let mut p = self.pos;
        // Optional binary-string prefix before the opening quote.
        if bytes[p] == b'b' || bytes[p] == b'B' {
            p += 1;
        }
        // Skip the opening quote, then scan for escape or terminator.
        p += 1; loop {
            match memchr2(b'\\', b'\'', &bytes[p..]) {
                None => {
                    self.errors.push(LexerError {
                        kind: LexerErrorKind::UnterminatedString,
                        message: "unterminated string literal".to_string(),
                        span: Span::new(start as u32, self.source.len() as u32),
                    });
                    self.pos = self.source.len();
                    return self.tok(TokenKind::SingleQuotedString, start);
                }
                Some(offset) => {
                    p += offset;
                    match bytes[p] {
                        b'\\' => {
                            // Escape: skip the backslash and the next byte.
                            p += 1;
                            if p < bytes.len() {
                                p += 1;
                            }
                        }
                        _ => {
                            // Closing quote found.
                            p += 1;
                            break;
                        }
                    }
                }
            }
        }
        self.pos = p;
        self.tok(TokenKind::SingleQuotedString, start)
    }
826
    /// Scans a double-quoted string, including an optional `b`/`B` prefix.
    /// `{$...}` complex interpolations are skipped as whole units so braces
    /// and quotes inside them do not end the string. Reports an error and
    /// consumes to end of input if unterminated.
    fn scan_double_quoted_string(&mut self) -> Token {
        let start = self.pos;
        let bytes = self.source.as_bytes();
        let mut p = self.pos;
        // Optional binary-string prefix before the opening quote.
        if bytes[p] == b'b' || bytes[p] == b'B' {
            p += 1;
        }
        // Skip the opening quote, then scan byte by byte.
        p += 1; loop {
            if p >= bytes.len() {
                self.errors.push(LexerError {
                    kind: LexerErrorKind::UnterminatedString,
                    message: "unterminated string literal".to_string(),
                    span: Span::new(start as u32, self.source.len() as u32),
                });
                self.pos = self.source.len();
                return self.tok(TokenKind::DoubleQuotedString, start);
            }
            match bytes[p] {
                b'\\' => {
                    // Escape: skip the backslash and the next byte.
                    p += 1;
                    if p < bytes.len() {
                        p += 1;
                    }
                }
                b'"' => {
                    p += 1;
                    break;
                }
                // `{$` starts a complex interpolation; skip it whole.
                b'{' if p + 1 < bytes.len() && bytes[p + 1] == b'$' => {
                    p = skip_complex_interp(bytes, p);
                }
                _ => {
                    p += 1;
                }
            }
        }
        self.pos = p;
        self.tok(TokenKind::DoubleQuotedString, start)
    }
871
872 fn scan_backtick_string(&mut self) -> Token {
873 let start = self.pos;
874 let bytes = self.source.as_bytes();
875 let mut p = self.pos;
876 p += 1; loop {
878 match memchr2(b'\\', b'`', &bytes[p..]) {
879 None => {
880 self.errors.push(LexerError {
881 kind: LexerErrorKind::UnterminatedString,
882 message: "unterminated string literal".to_string(),
883 span: Span::new(start as u32, self.source.len() as u32),
884 });
885 self.pos = self.source.len();
886 return self.tok(TokenKind::BacktickString, start);
887 }
888 Some(offset) => {
889 p += offset;
890 match bytes[p] {
891 b'\\' => {
892 p += 1;
893 if p < bytes.len() {
894 p += 1;
895 }
896 }
897 _ => {
898 p += 1;
900 break;
901 }
902 }
903 }
904 }
905 }
906 self.pos = p;
907 self.tok(TokenKind::BacktickString, start)
908 }
909
    /// Scans a numeric literal: hex/binary/new-octal prefixed forms, legacy
    /// `0`-prefixed octal, decimal ints, and floats (fraction and/or
    /// exponent). Misplaced underscores produce an InvalidNumericLiteral
    /// token plus a recorded error.
    fn scan_number(&mut self) -> Token {
        let start = self.pos;
        let bytes = self.source.as_bytes();

        // Prefixed forms: 0x.., 0b.., 0o.. — each rejects an underscore
        // directly after the prefix or trailing after the digits; if no
        // valid digit follows the prefix, fall back to plain decimal "0".
        if bytes[start] == b'0' && start + 1 < bytes.len() {
            match bytes[start + 1] {
                b'x' | b'X' => {
                    self.pos = start + 2;
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    if self.scan_digits(u8::is_ascii_hexdigit) {
                        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                            self.consume_invalid_numeric_rest();
                            return self.invalid_numeric(start);
                        }
                        return self.tok(TokenKind::HexIntLiteral, start);
                    }
                    self.pos = start;
                }
                b'b' | b'B' => {
                    self.pos = start + 2;
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    if self.scan_digits(|b| b == &b'0' || b == &b'1') {
                        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                            self.consume_invalid_numeric_rest();
                            return self.invalid_numeric(start);
                        }
                        return self.tok(TokenKind::BinIntLiteral, start);
                    }
                    self.pos = start;
                }
                b'o' | b'O' => {
                    self.pos = start + 2;
                    if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                        self.consume_invalid_numeric_rest();
                        return self.invalid_numeric(start);
                    }
                    if self.scan_digits(|b| (b'0'..=b'7').contains(b)) {
                        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
                            self.consume_invalid_numeric_rest();
                            return self.invalid_numeric(start);
                        }
                        return self.tok(TokenKind::OctIntLiteralNew, start);
                    }
                    self.pos = start;
                }
                _ => {}
            }
        }

        // Decimal integer part.
        self.pos = start;
        self.scan_digits(u8::is_ascii_digit);
        let integer_end = self.pos;
        let mut kind = TokenKind::IntLiteral;

        // Legacy octal: a leading `0` followed by more digits.
        if bytes[start] == b'0' && integer_end > start + 1 {
            kind = TokenKind::OctIntLiteral;
        }

        // Fractional part. `1..` is left alone (int, then two dot tokens);
        // a bare trailing `1.` still becomes a float.
        if self.pos < bytes.len() && bytes[self.pos] == b'.' {
            if self.pos + 1 < bytes.len() && bytes[self.pos + 1].is_ascii_digit() {
                self.pos += 1; self.scan_digits(u8::is_ascii_digit);
                kind = TokenKind::FloatLiteralSimple;
            } else if self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'_' {
                self.consume_invalid_numeric_rest();
                return self.invalid_numeric(start);
            } else if self.pos + 1 >= bytes.len() || bytes[self.pos + 1] != b'.' {
                self.pos += 1; kind = TokenKind::FloatLiteralSimple;
            }
        }

        // Exponent part; a failed exponent leaves `pos` on the `e`/`E`.
        if self.pos < bytes.len() && matches!(bytes[self.pos], b'e' | b'E') {
            if self.try_scan_exponent() {
                kind = TokenKind::FloatLiteral;
            } else if self.pos + 1 < bytes.len() && bytes[self.pos + 1] == b'_' {
                self.consume_invalid_numeric_rest();
                return self.invalid_numeric(start);
            }
        }

        // Trailing underscore after the literal is invalid.
        if self.pos < bytes.len() && bytes[self.pos] == b'_' {
            self.consume_invalid_numeric_rest();
            return self.invalid_numeric(start);
        }

        self.tok(kind, start)
    }
1020
1021 fn scan_digits(&mut self, is_valid: fn(&u8) -> bool) -> bool {
1024 let bytes = self.source.as_bytes();
1025 if self.pos >= bytes.len() || !is_valid(&bytes[self.pos]) {
1026 return false;
1027 }
1028 self.pos += 1;
1029 loop {
1030 if self.pos >= bytes.len() {
1031 break;
1032 }
1033 if is_valid(&bytes[self.pos]) {
1034 self.pos += 1;
1035 } else if bytes[self.pos] == b'_'
1036 && self.pos + 1 < bytes.len()
1037 && is_valid(&bytes[self.pos + 1])
1038 {
1039 self.pos += 2;
1040 } else {
1041 break;
1042 }
1043 }
1044 true
1045 }
1046
1047 fn try_scan_exponent(&mut self) -> bool {
1050 let bytes = self.source.as_bytes();
1051 let saved = self.pos;
1052 self.pos += 1; if self.pos < bytes.len() && matches!(bytes[self.pos], b'+' | b'-') {
1056 self.pos += 1;
1057 }
1058
1059 if self.scan_digits(u8::is_ascii_digit) {
1061 true
1062 } else {
1063 self.pos = saved;
1064 false
1065 }
1066 }
1067
1068 fn scan_identifier(&mut self) -> Token {
1071 let start = self.pos;
1072 let bytes = self.source.as_bytes();
1073 self.pos += 1; while self.pos < bytes.len() && is_ident_continue(bytes[self.pos]) {
1075 self.pos += 1;
1076 }
1077 let text = &self.source[start..self.pos];
1078 let kind = resolve_keyword(text).unwrap_or(TokenKind::Identifier);
1079 self.tok(kind, start)
1080 }
1081
    /// Advances `pos` to the end of a line comment: either the newline
    /// (left unconsumed) or just before a `?>` close tag, whichever comes
    /// first — per PHP, `?>` terminates a `//` or `#` comment. Uses
    /// `memchr2` to jump between candidate stop bytes.
    #[inline]
    fn skip_line_comment_body(bytes: &[u8], pos: &mut usize) {
        loop {
            match memchr2(b'\n', b'?', &bytes[*pos..]) {
                None => {
                    // Comment runs to end of input.
                    *pos = bytes.len();
                    return;
                }
                Some(offset) => {
                    let p = *pos + offset;
                    if bytes[p] == b'\n' {
                        // Stop at, but do not consume, the newline.
                        *pos = p; return;
                    }
                    // `?` is only a terminator when followed by `>`.
                    if p + 1 < bytes.len() && bytes[p + 1] == b'>' {
                        *pos = p; return;
                    }
                    *pos = p + 1;
                }
            }
        }
    }
1112
1113 #[inline]
1114 fn check_at(&self, offset: usize, expected: u8) -> bool {
1115 self.source.as_bytes().get(self.pos + offset) == Some(&expected)
1116 }
1117
1118 #[inline]
1119 fn tok(&self, kind: TokenKind, start: usize) -> Token {
1120 Token::new(kind, Span::new(start as u32, self.pos as u32))
1121 }
1122
1123 fn invalid_numeric(&mut self, start: usize) -> Token {
1124 let span = Span::new(start as u32, self.pos as u32);
1125 self.errors.push(LexerError {
1126 kind: LexerErrorKind::Other,
1127 message: "Invalid numeric literal".to_string(),
1128 span,
1129 });
1130 Token::new(TokenKind::InvalidNumericLiteral, span)
1131 }
1132
    /// After a malformed numeric literal, consumes the remaining characters
    /// that plausibly belong to it (alphanumerics, `_`, `.`, and a sign
    /// directly after `e`/`E`) so the whole blob is reported as one
    /// InvalidNumericLiteral token instead of a cascade of small tokens.
    fn consume_invalid_numeric_rest(&mut self) {
        let bytes = self.source.as_bytes();
        while self.pos < bytes.len() {
            let b = bytes[self.pos];
            if b.is_ascii_alphanumeric() || b == b'_' || b == b'.' || b == b'+' || b == b'-' {
                // A sign only continues the literal when it follows an
                // exponent marker; otherwise it is an operator.
                if (b == b'+' || b == b'-') && self.pos > 0 {
                    let prev = bytes[self.pos - 1];
                    if prev != b'e' && prev != b'E' {
                        break;
                    }
                }
                self.pos += 1;
            } else {
                break;
            }
        }
    }
1152
    /// Attempts to lex a heredoc (`<<<LABEL` / `<<<"LABEL"`) or nowdoc
    /// (`<<<'LABEL'`) starting in `remaining` (the source from `self.pos`),
    /// tolerating leading whitespace and an optional `b`/`B` prefix.
    ///
    /// Returns None (without moving `pos`) when `remaining` does not begin
    /// a well-formed opener or the closing label never appears. On success
    /// the token spans from the opener (whitespace excluded) through the
    /// end of the closing label; `pos` is left just past that label.
    fn try_lex_heredoc(&mut self, remaining: &str) -> Option<Token> {
        // Tolerate leading PHP whitespace before the opener.
        let trimmed = remaining.trim_start_matches(|c: char| {
            c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\x0C'
        });
        let ws_len = remaining.len() - trimmed.len();

        // Optional binary-string prefix (`b<<<` / `B<<<`).
        let (after_prefix, prefix_len) = if (trimmed.starts_with("b<<<")
            || trimmed.starts_with("B<<<"))
            && !trimmed[1..].starts_with("<<<>")
        {
            (&trimmed[1..], 1)
        } else {
            (trimmed, 0)
        };

        if !after_prefix.starts_with("<<<") {
            return None;
        }

        // All offsets below are relative to `remaining`; `base_pos` maps
        // them back to absolute source positions.
        let base_pos = self.pos; let start = base_pos + ws_len; let after_arrows = &after_prefix[3..];
        // Spaces/tabs are allowed between `<<<` and the label.
        let after_arrows_trimmed = after_arrows.trim_start_matches([' ', '\t']);
        let arrows_offset =
            ws_len + prefix_len + 3 + (after_arrows.len() - after_arrows_trimmed.len());

        // `label_line_end` is the offset (into `remaining`) of the newline
        // ending the opener line (or end of input if there is none).
        let (label, is_nowdoc, label_line_end);
        if let Some(after_quote) = after_arrows_trimmed.strip_prefix('\'') {
            // Nowdoc: single-quoted label; the quote must be closed.
            let closing = after_quote.find('\'')?;
            label = &after_quote[..closing];
            is_nowdoc = true;
            let after_label = &after_arrows_trimmed[2 + closing..];
            let nl = after_label.find('\n').unwrap_or(after_label.len());
            label_line_end = arrows_offset + 2 + closing + nl;
            // NOTE(review): this `if` body is empty — apparently a leftover;
            // it has no effect.
            if label_line_end < remaining.len() {
            }
        } else {
            // Heredoc: either a double-quoted label or a bare identifier.
            let s = if let Some(after_dquote) = after_arrows_trimmed.strip_prefix('"') {
                let closing = after_dquote.find('"')?;
                label = &after_dquote[..closing];
                &after_dquote[1 + closing..]
            } else {
                let end = after_arrows_trimmed
                    .find(|c: char| !c.is_ascii_alphanumeric() && c != '_')
                    .unwrap_or(after_arrows_trimmed.len());
                if end == 0 {
                    return None;
                }
                label = &after_arrows_trimmed[..end];
                &after_arrows_trimmed[end..]
            };
            is_nowdoc = false;
            let nl = s.find('\n').unwrap_or(s.len());
            label_line_end = arrows_offset + (after_arrows_trimmed.len() - s.len()) + nl;
        };

        if label.is_empty() {
            return None;
        }

        // The body starts on the line after the opener; an opener with no
        // following newline cannot be a heredoc.
        let body_start_in_remaining = if label_line_end < remaining.len() {
            label_line_end + 1 } else {
            return None; };

        let body = &remaining[body_start_in_remaining..];

        // Find the first line whose (indentation-stripped) start is the
        // label, not followed by an identifier-continue byte.
        let mut search_pos = 0;
        let end_marker_pos;
        loop {
            if search_pos >= body.len() {
                return None; }
            let line_start = search_pos;
            let line_end = body[line_start..]
                .find('\n')
                .map(|p| line_start + p)
                .unwrap_or(body.len());
            let line = &body[line_start..line_end];
            let trimmed_line = line.trim_start_matches([' ', '\t']);

            if trimmed_line.len() >= label.len()
                && &trimmed_line.as_bytes()[..label.len()] == label.as_bytes()
                && !trimmed_line
                    .as_bytes()
                    .get(label.len())
                    .copied()
                    .is_some_and(is_ident_continue)
            {
                end_marker_pos = line_start;
                break;
            }

            search_pos = if line_end < body.len() {
                line_end + 1
            } else {
                body.len()
            };
        }

        // Token ends immediately after the closing label (its indentation
        // is included in the span; anything after it on the line is not).
        let end_marker_line = &body[end_marker_pos..];
        let trimmed = end_marker_line.trim_start_matches([' ', '\t']);
        let indent_len = end_marker_line.len() - trimmed.len();
        let token_end_in_remaining =
            body_start_in_remaining + end_marker_pos + indent_len + label.len();
        self.pos = base_pos + token_end_in_remaining;

        let span = Span::new(start as u32, self.pos as u32);

        if is_nowdoc {
            Some(Token::new(TokenKind::Nowdoc, span))
        } else {
            Some(Token::new(TokenKind::Heredoc, span))
        }
    }
1287}
1288
1289pub fn lex_all(source: &str) -> (Vec<Token>, Vec<LexerError>) {
1297 if source.len() > u32::MAX as usize {
1298 let error = LexerError {
1299 kind: LexerErrorKind::FileTooLarge,
1300 message: format!(
1301 "source is {} bytes, which exceeds the maximum supported size of {} bytes",
1302 source.len(),
1303 u32::MAX
1304 ),
1305 span: Span::new(0, 0),
1306 };
1307 let eof = Token::eof(0);
1308 return (vec![eof, eof], vec![error]);
1309 }
1310
1311 let mut lexer = Lexer::new(source);
1312 let mut tokens = Vec::new();
1313
1314 loop {
1315 let tok = lexer.next_token();
1316 let is_eof = tok.kind == TokenKind::Eof;
1317 tokens.push(tok);
1318 if is_eof {
1319 break;
1320 }
1321 }
1322
1323 let eof_span = tokens.last().unwrap().span;
1326 tokens.push(Token::new(TokenKind::Eof, eof_span));
1327
1328 let errors = lexer.errors;
1329 (tokens, errors)
1330}