1use crate::{Span, Token, TokenKind};
4use tracing::trace;
5
/// Streaming tokenizer over a borrowed source string.
///
/// Tokens borrow their text directly from `source`; cloning the tokenizer
/// snapshots its position (useful for lookahead/backtracking by callers).
#[derive(Clone)]
pub struct Tokenizer<'src> {
    /// The complete input; token text is sliced out of this by byte range.
    source: &'src str,
    /// The not-yet-consumed suffix of `source`.
    remaining: &'src str,
    /// Current byte offset into `source` (always the start of `remaining`).
    pos: u32,
    /// `Some` while positioned inside a heredoc body; cleared when the
    /// closing delimiter is emitted or the heredoc errors out.
    heredoc_state: Option<HeredocState>,
}
19
/// Scanner state held while the tokenizer is inside a heredoc body.
#[derive(Debug, Clone)]
struct HeredocState {
    /// The delimiter that terminates the heredoc (e.g. `EOF`), stored
    /// owned so it survives while the tokenizer mutates its position.
    delimiter: String,
    /// Indent (count of leading spaces/tabs) of the closing delimiter's
    /// line; set once the terminator is located while scanning content,
    /// and exposed via `Tokenizer::heredoc_closing_indent`.
    closing_indent: Option<usize>,
}
29
30impl<'src> Tokenizer<'src> {
31 pub fn new(source: &'src str) -> Self {
33 Self {
34 source,
35 remaining: source,
36 pos: 0,
37 heredoc_state: None,
38 }
39 }
40
41 #[inline]
43 pub fn position(&self) -> u32 {
44 self.pos
45 }
46
47 #[inline]
49 pub fn is_eof(&self) -> bool {
50 self.remaining.is_empty()
51 }
52
53 #[inline]
57 pub fn heredoc_closing_indent(&self) -> Option<usize> {
58 self.heredoc_state.as_ref().and_then(|s| s.closing_indent)
59 }
60
61 #[inline]
63 fn peek(&self) -> Option<char> {
64 self.remaining.chars().next()
65 }
66
67 #[inline]
69 fn peek_nth(&self, n: usize) -> Option<char> {
70 self.remaining.chars().nth(n)
71 }
72
73 #[inline]
75 fn advance(&mut self) -> Option<char> {
76 let c = self.peek()?;
77 self.pos += c.len_utf8() as u32;
78 self.remaining = &self.remaining[c.len_utf8()..];
79 Some(c)
80 }
81
82 #[inline]
84 fn advance_by(&mut self, n: usize) {
85 self.pos += n as u32;
86 self.remaining = &self.remaining[n..];
87 }
88
89 #[inline]
91 fn starts_with(&self, prefix: &str) -> bool {
92 self.remaining.starts_with(prefix)
93 }
94
95 fn token(&self, kind: TokenKind, start: u32) -> Token<'src> {
97 let span = Span::new(start, self.pos);
98 let text = &self.source[start as usize..self.pos as usize];
99 trace!("Token {:?} at {:?}: {:?}", kind, span, text);
100 Token::new(kind, span, text)
101 }
102
103 pub fn next_token(&mut self) -> Token<'src> {
105 if let Some(ref state) = self.heredoc_state.clone() {
107 return self.tokenize_heredoc_content(&state.delimiter);
108 }
109
110 if self.is_eof() {
112 return self.token(TokenKind::Eof, self.pos);
113 }
114
115 let start = self.pos;
116 let c = self.peek().unwrap();
117
118 match c {
119 '{' => {
121 self.advance();
122 self.token(TokenKind::LBrace, start)
123 }
124 '}' => {
125 self.advance();
126 self.token(TokenKind::RBrace, start)
127 }
128 '(' => {
129 self.advance();
130 self.token(TokenKind::LParen, start)
131 }
132 ')' => {
133 self.advance();
134 self.token(TokenKind::RParen, start)
135 }
136 ',' => {
137 self.advance();
138 self.token(TokenKind::Comma, start)
139 }
140 '>' => {
141 self.advance();
142 self.token(TokenKind::Gt, start)
143 }
144 '@' => self.tokenize_at_or_tag(),
145
146 '"' => self.tokenize_quoted_scalar(),
148
149 '/' if self.starts_with("///") => self.tokenize_doc_comment(),
151 '/' if self.starts_with("//") => self.tokenize_line_comment(),
152 '/' => self.tokenize_bare_scalar(),
154
155 '<' if self.starts_with("<<")
158 && matches!(self.peek_nth(2), Some(c) if c.is_ascii_uppercase()) =>
159 {
160 self.tokenize_heredoc_start()
161 }
162 '<' if self.starts_with("<<") => {
164 let start = self.pos;
165 self.advance(); self.advance(); self.token(TokenKind::Error, start)
168 }
169
170 'r' if matches!(self.peek_nth(1), Some('#' | '"')) => self.tokenize_raw_string(),
172
173 ' ' | '\t' => self.tokenize_whitespace(),
175
176 '\n' => {
178 self.advance();
179 self.token(TokenKind::Newline, start)
180 }
181 '\r' if self.peek_nth(1) == Some('\n') => {
182 self.advance();
183 self.advance();
184 self.token(TokenKind::Newline, start)
185 }
186
187 _ if is_bare_scalar_start(c) => self.tokenize_bare_scalar(),
189
190 _ => {
192 self.advance();
193 self.token(TokenKind::Error, start)
194 }
195 }
196 }
197
198 fn tokenize_whitespace(&mut self) -> Token<'src> {
200 let start = self.pos;
201 while let Some(c) = self.peek() {
202 if c == ' ' || c == '\t' {
203 self.advance();
204 } else {
205 break;
206 }
207 }
208 self.token(TokenKind::Whitespace, start)
209 }
210
211 fn tokenize_bare_scalar(&mut self) -> Token<'src> {
213 let start = self.pos;
214 while let Some(c) = self.peek() {
215 if is_bare_scalar_char(c) {
216 self.advance();
217 } else {
218 break;
219 }
220 }
221 self.token(TokenKind::BareScalar, start)
222 }
223
224 fn tokenize_at_or_tag(&mut self) -> Token<'src> {
226 let start = self.pos;
227 self.advance(); match self.peek() {
231 Some(c) if c.is_ascii_alphabetic() || c == '_' => {
232 self.consume_tag_segment();
233
234 while self.peek() == Some('/')
236 && self.peek_nth(1) == Some('@')
237 && matches!(self.peek_nth(2), Some(c) if c.is_ascii_alphabetic() || c == '_')
238 {
239 self.advance(); self.advance(); self.consume_tag_segment();
242 }
243 self.token(TokenKind::Tag, start)
244 }
245 _ => {
246 self.token(TokenKind::At, start)
248 }
249 }
250 }
251
252 fn consume_tag_segment(&mut self) {
254 self.advance();
257 while let Some(c) = self.peek() {
258 if c == 'r' && matches!(self.peek_nth(1), Some('#' | '"')) {
259 break;
260 }
261 if c.is_ascii_alphanumeric() || c == '_' || c == '-' {
262 self.advance();
263 } else {
264 break;
265 }
266 }
267 }
268
269 fn tokenize_quoted_scalar(&mut self) -> Token<'src> {
271 let start = self.pos;
272
273 self.advance();
275
276 loop {
277 match self.peek() {
278 None => {
279 return self.token(TokenKind::Error, start);
281 }
282 Some('"') => {
283 self.advance();
284 break;
285 }
286 Some('\\') => {
287 self.advance();
289 if self.peek().is_some() {
290 self.advance();
291 }
292 }
293 Some(_) => {
294 self.advance();
295 }
296 }
297 }
298
299 self.token(TokenKind::QuotedScalar, start)
300 }
301
302 fn tokenize_line_comment(&mut self) -> Token<'src> {
305 let start = self.pos;
306
307 self.advance();
309 self.advance();
310
311 while let Some(c) = self.peek() {
313 if c == '\n' || c == '\r' {
314 break;
315 }
316 self.advance();
317 }
318
319 self.token(TokenKind::LineComment, start)
320 }
321
322 fn tokenize_doc_comment(&mut self) -> Token<'src> {
324 let start = self.pos;
325
326 self.advance();
328 self.advance();
329 self.advance();
330
331 while let Some(c) = self.peek() {
333 if c == '\n' || c == '\r' {
334 break;
335 }
336 self.advance();
337 }
338
339 self.token(TokenKind::DocComment, start)
340 }
341
342 fn tokenize_heredoc_start(&mut self) -> Token<'src> {
348 let start = self.pos;
349
350 self.advance();
352 self.advance();
353
354 let delim_start = self.pos as usize;
355
356 match self.peek() {
358 Some(c) if c.is_ascii_uppercase() => {
359 self.advance();
360 }
361 _ => {
362 while let Some(c) = self.peek() {
365 if c.is_ascii_uppercase() || c.is_ascii_digit() || c == '_' {
366 self.advance();
367 } else {
368 break;
369 }
370 }
371 return self.token(TokenKind::Error, start);
372 }
373 }
374
375 while let Some(c) = self.peek() {
377 if c.is_ascii_uppercase() || c.is_ascii_digit() || c == '_' {
378 self.advance();
379 } else {
380 break;
381 }
382 }
383
384 let delimiter = &self.source[delim_start..self.pos as usize];
385
386 if delimiter.len() > 16 {
388 return self.token(TokenKind::Error, start);
389 }
390
391 if self.peek() == Some(',') {
395 self.advance(); if let Some(c) = self.peek()
398 && c.is_ascii_lowercase()
399 {
400 self.advance();
401 while let Some(c) = self.peek() {
403 if c.is_ascii_lowercase()
404 || c.is_ascii_digit()
405 || c == '_'
406 || c == '.'
407 || c == '-'
408 {
409 self.advance();
410 } else {
411 break;
412 }
413 }
414 }
415 }
416
417 if self.peek() == Some('\r') {
419 self.advance();
420 }
421 if self.peek() == Some('\n') {
422 self.advance();
423 }
424
425 self.heredoc_state = Some(HeredocState {
427 delimiter: delimiter.to_string(),
428 closing_indent: None,
429 });
430
431 self.token(TokenKind::HeredocStart, start)
432 }
433
434 fn find_heredoc_delimiter(&self, delimiter: &str) -> Option<usize> {
438 let indent_len = self
440 .remaining
441 .chars()
442 .take_while(|c| *c == ' ' || *c == '\t')
443 .count();
444
445 let after_indent = &self.remaining[indent_len..];
447 if let Some(after_delim) = after_indent.strip_prefix(delimiter)
448 && (after_delim.is_empty()
449 || after_delim.starts_with('\n')
450 || after_delim.starts_with("\r\n"))
451 {
452 return Some(indent_len);
453 }
454 None
455 }
456
457 fn tokenize_heredoc_content(&mut self, delimiter: &str) -> Token<'src> {
461 let start = self.pos;
462
463 if let Some(indent_len) = self.find_heredoc_delimiter(delimiter) {
465 self.advance_by(indent_len + delimiter.len());
467 self.heredoc_state = None;
468 return self.token(TokenKind::HeredocEnd, start);
469 }
470
471 let mut found_end = false;
473 let mut closing_indent = 0usize;
474 while !self.is_eof() {
475 while let Some(c) = self.peek() {
477 if c == '\n' {
478 self.advance();
479 break;
480 } else if c == '\r' && self.peek_nth(1) == Some('\n') {
481 self.advance();
482 self.advance();
483 break;
484 }
485 self.advance();
486 }
487
488 if let Some(indent_len) = self.find_heredoc_delimiter(delimiter) {
490 found_end = true;
491 closing_indent = indent_len;
492 break;
493 }
494
495 if self.is_eof() {
496 break;
497 }
498 }
499
500 if start == self.pos
501 && found_end
502 && let Some(indent_len) = self.find_heredoc_delimiter(delimiter)
503 {
504 self.advance_by(indent_len + delimiter.len());
506 self.heredoc_state = None;
507 return self.token(TokenKind::HeredocEnd, start);
508 }
509
510 if self.is_eof() && !found_end {
514 self.heredoc_state = None;
515 return self.token(TokenKind::Error, start);
516 }
517
518 if let Some(ref mut state) = self.heredoc_state {
520 state.closing_indent = Some(closing_indent);
521 }
522
523 self.token(TokenKind::HeredocContent, start)
524 }
525
526 fn tokenize_raw_string(&mut self) -> Token<'src> {
530 let start = self.pos;
531
532 self.advance();
534
535 let mut hash_count: u8 = 0;
537 while self.peek() == Some('#') {
538 hash_count = hash_count.saturating_add(1);
539 self.advance();
540 }
541
542 if self.peek() == Some('"') {
544 self.advance();
545 } else {
546 return self.token(TokenKind::Error, start);
548 }
549
550 loop {
552 match self.peek() {
553 None => {
554 return self.token(TokenKind::Error, start);
556 }
557 Some('"') => {
558 let mut matched_hashes = 0u8;
560 let mut lookahead = 1;
561 while matched_hashes < hash_count {
562 if self.peek_nth(lookahead) == Some('#') {
563 matched_hashes += 1;
564 lookahead += 1;
565 } else {
566 break;
567 }
568 }
569
570 if matched_hashes == hash_count {
571 self.advance(); for _ in 0..hash_count {
574 self.advance(); }
576 return self.token(TokenKind::RawScalar, start);
578 } else {
579 self.advance();
581 }
582 }
583 Some(_) => {
584 self.advance();
585 }
586 }
587 }
588 }
589}
590
591impl<'src> Iterator for Tokenizer<'src> {
592 type Item = Token<'src>;
593
594 fn next(&mut self) -> Option<Self::Item> {
595 let token = self.next_token();
596 if token.kind == TokenKind::Eof {
597 None
598 } else {
599 Some(token)
600 }
601 }
602}
603
/// Returns `true` if `c` may *begin* a bare (unquoted) scalar.
///
/// Structural punctuation plus `=`, `@`, `>` and `/` are excluded, as is
/// all whitespace. `/` is excluded here (but not in `is_bare_scalar_char`)
/// so the dispatcher can first check for `//` and `///` comments.
fn is_bare_scalar_start(c: char) -> bool {
    if c.is_whitespace() {
        return false;
    }
    !"{}(),\"=@>/".contains(c)
}
611
/// Returns `true` if `c` may appear *inside* a bare scalar.
///
/// Looser than `is_bare_scalar_start`: `=`, `@` and `/` are allowed here,
/// which is what lets URLs and filesystem paths scan as single scalars.
fn is_bare_scalar_char(c: char) -> bool {
    if c.is_whitespace() {
        return false;
    }
    !"{}(),\">".contains(c)
}
620
#[cfg(test)]
mod tests {
    use super::*;
    use facet_testhelpers::test;

    /// Runs the tokenizer to exhaustion and pairs each token's kind with
    /// its source text. The trailing `Eof` is dropped by the `Iterator`
    /// impl, so it never appears in the result.
    fn tokenize(source: &str) -> Vec<(TokenKind, &str)> {
        Tokenizer::new(source).map(|t| (t.kind, t.text)).collect()
    }

    // Each single-character structural token gets its own kind.
    #[test]
    fn test_structural_tokens() {
        assert_eq!(tokenize("{"), vec![(TokenKind::LBrace, "{")]);
        assert_eq!(tokenize("}"), vec![(TokenKind::RBrace, "}")]);
        assert_eq!(tokenize("("), vec![(TokenKind::LParen, "(")]);
        assert_eq!(tokenize(")"), vec![(TokenKind::RParen, ")")]);
        assert_eq!(tokenize(","), vec![(TokenKind::Comma, ",")]);
        assert_eq!(tokenize(">"), vec![(TokenKind::Gt, ">")]);
        assert_eq!(tokenize("@"), vec![(TokenKind::At, "@")]);
    }

    // Bare scalars cover identifiers, numbers, and — because `/` is a
    // valid interior character — whole URLs.
    #[test]
    fn test_bare_scalar() {
        assert_eq!(tokenize("hello"), vec![(TokenKind::BareScalar, "hello")]);
        assert_eq!(tokenize("42"), vec![(TokenKind::BareScalar, "42")]);
        assert_eq!(tokenize("true"), vec![(TokenKind::BareScalar, "true")]);
        assert_eq!(
            tokenize("https://example.com/path"),
            vec![(TokenKind::BareScalar, "https://example.com/path")]
        );
    }

    // `/@`-chained tag segments fuse into a single `Tag` token.
    #[test]
    fn test_chained_tag_token() {
        assert_eq!(
            tokenize("@must_emit/@discover_start"),
            vec![(TokenKind::Tag, "@must_emit/@discover_start")]
        );
    }

    // A payload directly after a chained tag tokenizes separately.
    #[test]
    fn test_chained_tag_token_with_payload() {
        assert_eq!(
            tokenize("@must_emit/@discover_start{executor default}"),
            vec![
                (TokenKind::Tag, "@must_emit/@discover_start"),
                (TokenKind::LBrace, "{"),
                (TokenKind::BareScalar, "executor"),
                (TokenKind::Whitespace, " "),
                (TokenKind::BareScalar, "default"),
                (TokenKind::RBrace, "}"),
            ]
        );
    }

    #[test]
    fn test_three_segment_chained_tag_token() {
        assert_eq!(tokenize("@a/@b/@c"), vec![(TokenKind::Tag, "@a/@b/@c")]);
    }

    #[test]
    fn test_chained_tag_token_with_quoted_leaf_payload() {
        assert_eq!(
            tokenize(r#"@a/@b"foo""#),
            vec![
                (TokenKind::Tag, "@a/@b"),
                (TokenKind::QuotedScalar, r#""foo""#),
            ]
        );
    }

    // `consume_tag_segment` stops before `r#`/`r"` so a raw-string leaf
    // payload is not absorbed into the tag name.
    #[test]
    fn test_chained_tag_token_with_raw_leaf_payload() {
        assert_eq!(
            tokenize(r##"@a/@br#"foo"#"##),
            vec![
                (TokenKind::Tag, "@a/@b"),
                (TokenKind::RawScalar, r##"r#"foo"#"##),
            ]
        );
    }

    // A heredoc may serve as a tag's leaf payload; the start token owns
    // its trailing newline.
    #[test]
    fn test_chained_tag_token_with_heredoc_leaf_payload() {
        assert_eq!(
            tokenize("@a/@b<<EOF\nhello\nEOF"),
            vec![
                (TokenKind::Tag, "@a/@b"),
                (TokenKind::HeredocStart, "<<EOF\n"),
                (TokenKind::HeredocContent, "hello\n"),
                (TokenKind::HeredocEnd, "EOF"),
            ]
        );
    }

    #[test]
    fn test_quoted_scalar() {
        assert_eq!(
            tokenize(r#""hello world""#),
            vec![(TokenKind::QuotedScalar, r#""hello world""#)]
        );
        assert_eq!(
            tokenize(r#""with \"escapes\"""#),
            vec![(TokenKind::QuotedScalar, r#""with \"escapes\"""#)]
        );
    }

    #[test]
    fn test_raw_scalar() {
        assert_eq!(
            tokenize(r#"r"hello""#),
            vec![(TokenKind::RawScalar, r#"r"hello""#)]
        );
        assert_eq!(
            tokenize(r##"r#"hello"#"##),
            vec![(TokenKind::RawScalar, r##"r#"hello"#"##)]
        );
    }

    #[test]
    fn test_comments() {
        assert_eq!(
            tokenize("// comment"),
            vec![(TokenKind::LineComment, "// comment")]
        );
        assert_eq!(
            tokenize("/// doc"),
            vec![(TokenKind::DocComment, "/// doc")]
        );
    }

    // CRLF produces one two-byte `Newline` token.
    #[test]
    fn test_whitespace() {
        assert_eq!(tokenize(" \t"), vec![(TokenKind::Whitespace, " \t")]);
        assert_eq!(tokenize("\n"), vec![(TokenKind::Newline, "\n")]);
        assert_eq!(tokenize("\r\n"), vec![(TokenKind::Newline, "\r\n")]);
    }

    #[test]
    fn test_mixed() {
        let tokens = tokenize("{host localhost}");
        assert_eq!(
            tokens,
            vec![
                (TokenKind::LBrace, "{"),
                (TokenKind::BareScalar, "host"),
                (TokenKind::Whitespace, " "),
                (TokenKind::BareScalar, "localhost"),
                (TokenKind::RBrace, "}"),
            ]
        );
    }

    // Multi-line heredoc body is a single `HeredocContent` token that
    // includes the final newline before the delimiter line.
    #[test]
    fn test_heredoc() {
        let tokens = tokenize("<<EOF\nhello\nworld\nEOF");
        assert_eq!(
            tokens,
            vec![
                (TokenKind::HeredocStart, "<<EOF\n"),
                (TokenKind::HeredocContent, "hello\nworld\n"),
                (TokenKind::HeredocEnd, "EOF"),
            ]
        );
    }

    // Delimiters: `[A-Z][A-Z0-9_]*`, 1 to 16 chars inclusive.
    #[test]
    fn test_heredoc_valid_delimiters() {
        assert!(
            tokenize("<<A\nx\nA")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
        assert!(
            tokenize("<<EOF\nx\nEOF")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
        assert!(
            tokenize("<<MY123\nx\nMY123")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
        assert!(
            tokenize("<<MY_DELIM\nx\nMY_DELIM")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
        assert!(
            tokenize("<<ABCDEFGHIJKLMNOP\nx\nABCDEFGHIJKLMNOP")
                .iter()
                .all(|t| t.0 != TokenKind::Error)
        );
    }

    // `<<` not followed by an uppercase letter is not a heredoc start.
    #[test]
    fn test_heredoc_must_start_uppercase() {
        assert!(tokenize("<<123FOO").iter().any(|t| t.0 == TokenKind::Error));
        assert!(tokenize("<<_FOO").iter().any(|t| t.0 == TokenKind::Error));
        let tokens = tokenize("<<foo");
        assert!(!tokens.iter().any(|t| t.0 == TokenKind::HeredocStart));
    }

    // 17-character delimiter exceeds the 16-byte cap.
    #[test]
    fn test_heredoc_max_16_chars() {
        assert!(
            tokenize("<<ABCDEFGHIJKLMNOPQ\nx\nABCDEFGHIJKLMNOPQ")
                .iter()
                .any(|t| t.0 == TokenKind::Error)
        );
    }

    // `/` starts a bare scalar unless it begins `//` or `///`.
    #[test]
    fn test_slash_in_bare_scalar() {
        let tokens = tokenize("/foo");
        assert_eq!(tokens, vec![(TokenKind::BareScalar, "/foo")]);

        let tokens = tokenize("/usr/bin/foo");
        assert_eq!(tokens, vec![(TokenKind::BareScalar, "/usr/bin/foo")]);

        let tokens = tokenize("// comment");
        assert_eq!(tokens, vec![(TokenKind::LineComment, "// comment")]);
    }

    #[test]
    fn test_attribute_syntax_tokens() {
        let tokens = tokenize("server host>localhost");
        assert_eq!(
            tokens,
            vec![
                (TokenKind::BareScalar, "server"),
                (TokenKind::Whitespace, " "),
                (TokenKind::BareScalar, "host"),
                (TokenKind::Gt, ">"),
                (TokenKind::BareScalar, "localhost"),
            ]
        );
    }

    // A heredoc whose delimiter never appears must produce an `Error`.
    #[test]
    fn test_unterminated_heredoc() {
        let tokens = tokenize("<<EOF\nhello world\n");
        eprintln!("tokens = {:?}", tokens);
        assert!(
            tokens.iter().any(|t| t.0 == TokenKind::Error),
            "Expected Error token for unterminated heredoc"
        );
    }

    // A quoted scalar missing its closing quote must produce an `Error`.
    #[test]
    fn test_unterminated_string() {
        let tokens = tokenize("\"hello");
        eprintln!("tokens = {:?}", tokens);
        assert!(
            tokens.iter().any(|t| t.0 == TokenKind::Error),
            "Expected Error token for unterminated string"
        );
    }
}